diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..80f09898 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-07-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.08701v1","updated":"2023-07-17T17:59:40Z","published":"2023-07-17T17:59:40Z","title":"AlpaGasus: Training A Better Alpaca with Fewer Data","summary":" Large language models~(LLMs) obtain instruction-following capability through\ninstruction-finetuning (IFT) on supervised instruction/response data. However,\nwidely used IFT datasets (e.g., Alpaca's 52k data) surprisingly contain many\nlow-quality instances with incorrect or irrelevant responses, which are\nmisleading and detrimental to IFT. In this paper, we propose a simple and\neffective data selection strategy that automatically identifies and removes\nlow-quality data using a strong LLM (e.g., ChatGPT). To this end, we introduce\nAlpaGasus, which is finetuned on only 9k high-quality data filtered from the\n52k Alpaca data. AlpaGasus significantly outperforms the original Alpaca as\nevaluated by GPT-4 on multiple test sets and its 13B variant matches $>90\\%$\nperformance of its teacher LLM (i.e., Text-Davinci-003) on test tasks. It also\nprovides 5.7x faster training, reducing the training time for a 7B variant from\n80 minutes (for Alpaca) to 14 minutes \\footnote{We apply IFT for the same\nnumber of epochs as Alpaca(7B) but on fewer data, using 4$\\times$NVIDIA A100\n(80GB) GPUs and following the original Alpaca setting and hyperparameters.}.\nOverall, AlpaGasus demonstrates a novel data-centric IFT paradigm that can be\ngenerally applied to instruction-tuning data, leading to faster training and\nbetter instruction-following models. Our project page is available at:\n\\url{https://lichang-chen.github.io/AlpaGasus/}.\n","authors":["Lichang Chen","Shiyang Li","Jun Yan","Hai Wang","Kalpa Gunaratna","Vikas Yadav","Zheng Tang","Vijay Srinivasan","Tianyi Zhou","Heng Huang","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2307.08701v1.pdf","comment":"22 pages; 22 figures"},{"id":"http://arxiv.org/abs/2210.00131v3","updated":"2023-07-17T17:56:10Z","published":"2022-09-30T23:10:11Z","title":"Underspecification in Language Modeling Tasks: A Causality-Informed\n Study of Gendered Pronoun Resolution","summary":" Modern language modeling tasks are often underspecified: for a given token\nprediction, many words may satisfy the user's intent of producing natural\nlanguage at inference time, however only one word would minimize the task's\nloss function at training time. We provide a simple yet plausible causal\nmechanism describing the role underspecification plays in the generation of\nspurious correlations. Despite its simplicity, our causal model directly\ninforms the development of two lightweight black-box evaluation methods, that\nwe apply to gendered pronoun resolution tasks on a wide range of LLMs to 1) aid\nin the detection of inference-time task underspecification by exploiting 2)\npreviously unreported gender vs. time and gender vs. location spurious\ncorrelations on LLMs with a range of A) sizes: from BERT-base to GPT 3.5, B)\npre-training objectives: from masked & autoregressive language modeling to a\nmixture of these objectives, and C) training stages: from pre-training only to\nreinforcement learning from human feedback (RLHF). 
Code and open-source demos\navailable at https: //github.com/2dot71mily/sib_paper.\n","authors":["Emily McMilin"],"pdf_url":"https://arxiv.org/pdf/2210.00131v3.pdf","comment":"25 pages, 34 figures"},{"id":"http://arxiv.org/abs/2307.08689v1","updated":"2023-07-17T17:48:51Z","published":"2023-07-17T17:48:51Z","title":"COLLIE: Systematic Construction of Constrained Text Generation Tasks","summary":" Text generation under constraints have seen increasing interests in natural\nlanguage processing, especially with the rapidly improving capabilities of\nlarge language models. However, existing benchmarks for constrained generation\nusually focus on fixed constraint types (e.g.,generate a sentence containing\ncertain words) that have proved to be easy for state-of-the-art models like\nGPT-4. We present COLLIE, a grammar-based framework that allows the\nspecification of rich, compositional constraints with diverse generation levels\n(word, sentence, paragraph, passage) and modeling challenges (e.g.,language\nunderstanding, logical reasoning, counting, semantic planning). We also develop\ntools for automatic extraction of task instances given a constraint structure\nand a raw text corpus. Using COLLIE, we compile the COLLIE-v1 dataset with 2080\ninstances comprising 13 constraint structures. We perform systematic\nexperiments across five state-of-the-art instruction-tuned language models and\nanalyze their performances to reveal shortcomings. COLLIE is designed to be\nextensible and lightweight, and we hope the community finds it useful to\ndevelop more complex constraints and evaluations in the future.\n","authors":["Shunyu Yao","Howard Chen","Austin W. Hanjie","Runzhe Yang","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2307.08689v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2307.08678v1","updated":"2023-07-17T17:41:47Z","published":"2023-07-17T17:41:47Z","title":"Do Models Explain Themselves? Counterfactual Simulatability of Natural\n Language Explanations","summary":" Large language models (LLMs) are trained to imitate humans to explain human\ndecisions. However, do LLMs explain themselves? Can they help humans build\nmental models of how LLMs process different inputs? To answer these questions,\nwe propose to evaluate $\\textbf{counterfactual simulatability}$ of natural\nlanguage explanations: whether an explanation can enable humans to precisely\ninfer the model's outputs on diverse counterfactuals of the explained input.\nFor example, if a model answers \"yes\" to the input question \"Can eagles fly?\"\nwith the explanation \"all birds can fly\", then humans would infer from the\nexplanation that it would also answer \"yes\" to the counterfactual input \"Can\npenguins fly?\". If the explanation is precise, then the model's answer should\nmatch humans' expectations.\n We implemented two metrics based on counterfactual simulatability: precision\nand generality. We generated diverse counterfactuals automatically using LLMs.\nWe then used these metrics to evaluate state-of-the-art LLMs (e.g., GPT-4) on\ntwo tasks: multi-hop factual reasoning and reward modeling. We found that LLM's\nexplanations have low precision and that precision does not correlate with\nplausibility. 
Therefore, naively optimizing human approvals (e.g., RLHF) may\nnot be a sufficient solution.\n","authors":["Yanda Chen","Ruiqi Zhong","Narutatsu Ri","Chen Zhao","He He","Jacob Steinhardt","Zhou Yu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2307.08678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.07533v4","updated":"2023-07-17T17:16:42Z","published":"2021-11-15T04:44:57Z","title":"Automated scholarly paper review: Concepts, technologies, and challenges","summary":" Peer review is a widely accepted mechanism for research evaluation, playing a\npivotal role in academic publishing. However, criticisms have long been leveled\nat this mechanism, mostly because of its poor efficiency and low\nreproducibility. Recent years have seen the application of artificial\nintelligence (AI) in assisting the peer review process. Nonetheless, with the\ninvolvement of humans, such limitations remain inevitable. In this paper, we\npropose the concept and pipeline of automated scholarly paper review (ASPR) and\nreview the relevant literature and technologies of achieving a full-scale\ncomputerized review process. On the basis of the review and discussion, we\nconclude that there is already corresponding research and preliminary\nimplementation at each stage of ASPR. We further look into the challenges in\nASPR with the existing technologies. The major difficulties lie in inadequate\ndata, imperfect document parsing and representation, defective\nhuman$\\unicode{x2013}$computer interaction, and flawed deep logical reasoning.\nMoreover, we point out the future directions and discuss the possible moral and\nethical issues of ASPR. In the foreseeable future, ASPR and peer review will\ncoexist in a reinforcing manner before ASPR is able to fully undertake the\nreviewing workload from humans.\n","authors":["Jialiang Lin","Jiaxin Song","Zhangping Zhou","Yidong Chen","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2111.07533v4.pdf","comment":"Please cite the version of Information Fusion"},{"id":"http://arxiv.org/abs/2307.08655v1","updated":"2023-07-17T17:12:44Z","published":"2023-07-17T17:12:44Z","title":"Multilingual Speech-to-Speech Translation into Multiple Target Languages","summary":" Speech-to-speech translation (S2ST) enables spoken communication between\npeople talking in different languages. Despite a few studies on multilingual\nS2ST, their focus is the multilinguality on the source side, i.e., the\ntranslation from multiple source languages to one target language. We present\nthe first work on multilingual S2ST supporting multiple target languages.\nLeveraging recent advance in direct S2ST with speech-to-unit and vocoder, we\nequip these key components with multilingual capability. Speech-to-masked-unit\n(S2MU) is the multilingual extension of S2U, which applies masking to units\nwhich don't belong to the given target language to reduce the language\ninterference. We also propose multilingual vocoder which is trained with\nlanguage embedding and the auxiliary loss of language identification. 
On\nbenchmark translation testsets, our proposed multilingual model shows superior\nperformance than bilingual models in the translation from English into $16$\ntarget languages.\n","authors":["Hongyu Gong","Ning Dong","Sravya Popuri","Vedanuj Goswami","Ann Lee","Juan Pino"],"pdf_url":"https://arxiv.org/pdf/2307.08655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08621v1","updated":"2023-07-17T16:40:01Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08597v1","updated":"2023-07-17T16:07:07Z","published":"2023-07-17T16:07:07Z","title":"Multimodal Diffusion Segmentation Model for Object Segmentation from\n Manipulation Instructions","summary":" In this study, we aim to develop a model that comprehends a natural language\ninstruction (e.g., \"Go to the living room and get the nearest pillow to the\nradio art on the wall\") and generates a segmentation mask for the target\neveryday object. The task is challenging because it requires (1) the\nunderstanding of the referring expressions for multiple objects in the\ninstruction, (2) the prediction of the target phrase of the sentence among the\nmultiple phrases, and (3) the generation of pixel-wise segmentation masks\nrather than bounding boxes. Studies have been conducted on languagebased\nsegmentation methods; however, they sometimes mask irrelevant regions for\ncomplex sentences. In this paper, we propose the Multimodal Diffusion\nSegmentation Model (MDSM), which generates a mask in the first stage and\nrefines it in the second stage. We introduce a crossmodal parallel feature\nextraction mechanism and extend diffusion probabilistic models to handle\ncrossmodal features. To validate our model, we built a new dataset based on the\nwell-known Matterport3D and REVERIE datasets. This dataset consists of\ninstructions with complex referring expressions accompanied by real indoor\nenvironmental images that feature various target objects, in addition to\npixel-wise segmentation masks. 
The performance of MDSM surpassed that of the\nbaseline method by a large margin of +10.13 mean IoU.\n","authors":["Yui Iioka","Yu Yoshida","Yuiga Wada","Shumpei Hatanaka","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2307.08597v1.pdf","comment":"Accepted for presentation at IROS2023"},{"id":"http://arxiv.org/abs/2307.08586v1","updated":"2023-07-17T15:58:05Z","published":"2023-07-17T15:58:05Z","title":"Syntax-Aware Complex-Valued Neural Machine Translation","summary":" Syntax has been proven to be remarkably effective in neural machine\ntranslation (NMT). Previous models obtained syntax information from syntactic\nparsing tools and integrated it into NMT models to improve translation\nperformance. In this work, we propose a method to incorporate syntax\ninformation into a complex-valued Encoder-Decoder architecture. The proposed\nmodel jointly learns word-level and syntax-level attention scores from the\nsource side to the target side using an attention mechanism. Importantly, it is\nnot dependent on specific network architectures and can be directly integrated\ninto any existing sequence-to-sequence (Seq2Seq) framework. The experimental\nresults demonstrate that the proposed method can bring significant improvements\nin BLEU scores on two datasets. In particular, the proposed method achieves a\ngreater improvement in BLEU scores in translation tasks involving language\npairs with significant syntactic differences.\n","authors":["Yang Liu","Yuexian Hou"],"pdf_url":"https://arxiv.org/pdf/2307.08586v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.08580v1","updated":"2023-07-17T15:49:35Z","published":"2023-07-17T15:49:35Z","title":"The Resume Paradox: Greater Language Differences, Smaller Pay Gaps","summary":" Over the past decade, the gender pay gap has remained steady with women\nearning 84 cents for every dollar earned by men on average. Many studies\nexplain this gap through demand-side bias in the labor market represented\nthrough employers' job postings. However, few studies analyze potential bias\nfrom the worker supply-side. Here, we analyze the language in millions of US\nworkers' resumes to investigate how differences in workers' self-representation\nby gender compare to differences in earnings. Across US occupations, language\ndifferences between male and female resumes correspond to 11% of the variation\nin gender pay gap. This suggests that females' resumes that are semantically\nsimilar to males' resumes may have greater wage parity. However, surprisingly,\noccupations with greater language differences between male and female resumes\nhave lower gender pay gaps. A doubling of the language difference between\nfemale and male resumes results in an annual wage increase of $2,797 for the\naverage female worker. This result holds with controls for gender-biases of\nresume text and we find that per-word bias poorly describes the variance in\nwage gap. The results demonstrate that textual data and self-representation are\nvaluable factors for improving worker representations and understanding\nemployment inequities.\n","authors":["Joshua R. Minot","Marc Maier","Bradford Demarest","Nicholas Cheney","Christopher M. Danforth","Peter Sheridan Dodds","Morgan R. 
Frank"],"pdf_url":"https://arxiv.org/pdf/2307.08580v1.pdf","comment":"24 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.08541v1","updated":"2023-07-17T15:00:04Z","published":"2023-07-17T15:00:04Z","title":"Discovering collective narratives shifts in online discussions","summary":" Narrative is a foundation of human cognition and decision making. Because\nnarratives play a crucial role in societal discourses and spread of\nmisinformation and because of the pervasive use of social media, the narrative\ndynamics on social media can have profound societal impact. Yet, systematic and\ncomputational understanding of online narratives faces critical challenge of\nthe scale and dynamics; how can we reliably and automatically extract\nnarratives from massive amount of texts? How do narratives emerge, spread, and\ndie? Here, we propose a systematic narrative discovery framework that fill this\ngap by combining change point detection, semantic role labeling (SRL), and\nautomatic aggregation of narrative fragments into narrative networks. We\nevaluate our model with synthetic and empirical data two-Twitter corpora about\nCOVID-19 and 2017 French Election. Results demonstrate that our approach can\nrecover major narrative shifts that correspond to the major events.\n","authors":["Wanying Zhao","Fiona Guo","Kristina Lerman","Yong-Yeol Ahn"],"pdf_url":"https://arxiv.org/pdf/2307.08541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08487v1","updated":"2023-07-17T13:49:52Z","published":"2023-07-17T13:49:52Z","title":"Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output\n Robustness of Large Language Models","summary":" Researchers have invested considerable effort into ensuring that large\nlanguage models (LLMs) align with human values, using various training\ntechniques, such as instruction tuning and Reinforcement Learning from Human or\nAI Feedback (RLHF/RLAIF), to guard against text unsafety. However, these\ndefenses remain incredibly vulnerable to some jailbreak attacks, which can\ncause the model to become overly defensive to sensitive topics or still\ngenerate harmful content, leaving the model performance particularly fragile.\nTherefore, to comprehensively study text safety and output robustness, we\npropose a latent jailbreak prompt dataset, each involving malicious instruction\nembedding. Specifically, we instruct the model to complete a regular task, such\nas translation, where the text to be translated contains malicious\ninstructions. To further analyze the safety and robustness, we design a\nhierarchical annotation framework. We present a systematic analysis of the\nsafety and robustness of LLMs concerning the position of explicit normal\ninstructions, word replacement (verbs in explicit normal instructions, target\ngroups in malicious instructions, cue words in malicious instructions), and\ninstruction replacement (different explicit normal instructions). Our results\nshow that current LLMs not only have a preference for certain instruction\nverbs, but also exhibit different jailbreak rates for different instruction\nverbs in explicit normal instructions. In other words, the probability of\ngenerating unsafe content by the model will be reinforced to varying degrees\ndepending on the instruction verb in explicit normal instructions. 
Code and\ndata are available at https://github.com/qiuhuachuan/latent-jailbreak.\n","authors":["Huachuan Qiu","Shuai Zhang","Anqi Li","Hongliang He","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2307.08487v1.pdf","comment":"Code and data are available at\n https://github.com/qiuhuachuan/latent-jailbreak"},{"id":"http://arxiv.org/abs/2307.06699v2","updated":"2023-07-17T12:21:55Z","published":"2023-07-13T11:55:03Z","title":"Parmesan: mathematical concept extraction for education","summary":" Mathematics is a highly specialized domain with its own unique set of\nchallenges that has seen limited study in natural language processing. However,\nmathematics is used in a wide variety of fields and multidisciplinary research\nin many different domains often relies on an understanding of mathematical\nconcepts. To aid researchers coming from other fields, we develop a prototype\nsystem for searching for and defining mathematical concepts in context,\nfocusing on the field of category theory. This system, Parmesan, depends on\nnatural language processing components including concept extraction, relation\nextraction, definition extraction, and entity linking. In developing this\nsystem, we show that existing techniques cannot be applied directly to the\ncategory theory domain, and suggest hybrid techniques that do perform well,\nthough we expect the system to evolve over time. We also provide two cleaned\nmathematical corpora that power the prototype system, which are based on\njournal articles and wiki pages, respectively. The corpora have been annotated\nwith dependency trees, lemmas, and part-of-speech tags.\n","authors":["Jacob Collard","Valeria de Paiva","Eswaran Subrahmanian"],"pdf_url":"https://arxiv.org/pdf/2307.06699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08426v1","updated":"2023-07-17T12:14:45Z","published":"2023-07-17T12:14:45Z","title":"Improving End-to-End Speech Translation by Imitation-Based Knowledge\n Distillation with Synthetic Transcripts","summary":" End-to-end automatic speech translation (AST) relies on data that combines\naudio inputs with text translation outputs. Previous work used existing large\nparallel corpora of transcriptions and translations in a knowledge distillation\n(KD) setup to distill a neural machine translation (NMT) into an AST student\nmodel. While KD allows using larger pretrained models, the reliance of previous\nKD approaches on manual audio transcripts in the data pipeline restricts the\napplicability of this framework to AST. We present an imitation learning\napproach where a teacher NMT system corrects the errors of an AST student\nwithout relying on manual transcripts. We show that the NMT teacher can recover\nfrom errors in automatic transcriptions and is able to correct erroneous\ntranslations of the AST student, leading to improvements of about 4 BLEU points\nover the standard AST end-to-end baseline on the English-German CoVoST-2 and\nMuST-C datasets, respectively. 
Code and data are publicly\navailable.\\footnote{\\url{https://github.com/HubReb/imitkd_ast/releases/tag/v1.1}}\n","authors":["Rebekka Hubert","Artem Sokolov","Stefan Riezler"],"pdf_url":"https://arxiv.org/pdf/2307.08426v1.pdf","comment":"IWSLT 2023, corrected version"},{"id":"http://arxiv.org/abs/2307.08416v1","updated":"2023-07-17T11:56:32Z","published":"2023-07-17T11:56:32Z","title":"Enhancing Supervised Learning with Contrastive Markings in Neural\n Machine Translation Training","summary":" Supervised learning in Neural Machine Translation (NMT) typically follows a\nteacher forcing paradigm where reference tokens constitute the conditioning\ncontext in the model's prediction, instead of its own previous predictions. In\norder to alleviate this lack of exploration in the space of translations, we\npresent a simple extension of standard maximum likelihood estimation by a\ncontrastive marking objective. The additional training signals are extracted\nautomatically from reference translations by comparing the system hypothesis\nagainst the reference, and used for up/down-weighting correct/incorrect tokens.\nThe proposed new training procedure requires one additional translation pass\nover the training set per epoch, and does not alter the standard inference\nsetup. We show that training with contrastive markings yields improvements on\ntop of supervised learning, and is especially useful when learning from\npostedits where contrastive markings indicate human error corrections to the\noriginal hypotheses. Code is publicly released.\n","authors":["Nathaniel Berger","Miriam Exel","Matthias Huck","Stefan Riezler"],"pdf_url":"https://arxiv.org/pdf/2307.08416v1.pdf","comment":"Proceedings of the 24th Annual Conference of the European Association\n for Machine Translation, p. 69-78 Tampere, Finland, June 2023"},{"id":"http://arxiv.org/abs/2307.08393v1","updated":"2023-07-17T11:12:56Z","published":"2023-07-17T11:12:56Z","title":"On the application of Large Language Models for language teaching and\n assessment technology","summary":" The recent release of very large language models such as PaLM and GPT-4 has\nmade an unprecedented impact in the popular media and public consciousness,\ngiving rise to a mixture of excitement and fear as to their capabilities and\npotential uses, and shining a light on natural language processing research\nwhich had not previously received so much attention. The developments offer\ngreat promise for education technology, and in this paper we look specifically\nat the potential for incorporating large language models in AI-driven language\nteaching and assessment systems. We consider several research areas and also\ndiscuss the risks and ethical considerations surrounding generative AI in\neducation technology for language learners. Overall we find that larger\nlanguage models offer improvements over previous models in text generation,\nopening up routes toward content generation which had not previously been\nplausible. For text generation they must be prompted carefully and their\noutputs may need to be reshaped before they are ready for use. For automated\ngrading and grammatical error correction, tasks whose progress is checked on\nwell-known benchmarks, early investigations indicate that large language models\non their own do not improve on state-of-the-art results according to standard\nevaluation metrics. 
For grading it appears that linguistic features established\nin the literature should still be used for best performance, and for error\ncorrection it may be that the models can offer alternative feedback styles\nwhich are not measured sensitively with existing methods. In all cases, there\nis work to be done to experiment with the inclusion of large language models in\neducation technology for language learners, in order to properly understand and\nreport on their capacities and limitations, and to ensure that foreseeable\nrisks such as misinformation and harmful bias are mitigated.\n","authors":["Andrew Caines","Luca Benedetto","Shiva Taslimipoor","Christopher Davis","Yuan Gao","Oeistein Andersen","Zheng Yuan","Mark Elliott","Russell Moore","Christopher Bryant","Marek Rei","Helen Yannakoudakis","Andrew Mullooly","Diane Nicholls","Paula Buttery"],"pdf_url":"https://arxiv.org/pdf/2307.08393v1.pdf","comment":"Accepted at the AIED2023 workshop: Empowering Education with LLMs -\n the Next-Gen Interface and Content Generation"},{"id":"http://arxiv.org/abs/2307.08368v1","updated":"2023-07-17T10:06:21Z","published":"2023-07-17T10:06:21Z","title":"Gender mobility in the labor market with skills-based matching models","summary":" Skills-based matching promises mobility of workers between different sectors\nand occupations in the labor market. In this case, job seekers can look for\njobs they do not yet have experience in, but for which they do have relevant\nskills. Currently, there are multiple occupations with a skewed gender\ndistribution. For skills-based matching, it is unclear if and how a shift in\nthe gender distribution, which we call gender mobility, between occupations\nwill be effected. It is expected that the skills-based matching approach will\nlikely be data-driven, including computational language models and supervised\nlearning methods.\n This work, first, shows the presence of gender segregation in language\nmodel-based skills representation of occupations. Second, we assess the use of\nthese representations in a potential application based on simulated data, and\nshow that the gender segregation is propagated by various data-driven\nskills-based matching models.These models are based on different language\nrepresentations (bag of words, word2vec, and BERT), and distance metrics\n(static and machine learning-based). Accordingly, we show how skills-based\nmatching approaches can be evaluated and compared on matching performance as\nwell as on the risk of gender segregation. Making the gender segregation bias\nof models more explicit can help in generating healthy trust in the use of\nthese models in practice.\n","authors":["Ajaya Adhikari","Steven Vethman","Daan Vos","Marc Lenz","Ioana Cocu","Ioannis Tolios","Cor J. Veenman"],"pdf_url":"https://arxiv.org/pdf/2307.08368v1.pdf","comment":"This paper was presented during the AAAI Spring Symposium 2023 (AI\n Trustworthiness Assessment (AITA) track)"},{"id":"http://arxiv.org/abs/2211.16198v3","updated":"2023-07-17T09:24:49Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. 
Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v3.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2305.04195v2","updated":"2023-07-17T08:38:53Z","published":"2023-05-07T05:40:48Z","title":"Cross-Modal Retrieval for Motion and Text via MildTriple Loss","summary":" Cross-modal retrieval has become a prominent research topic in computer\nvision and natural language processing with advances made in image-text and\nvideo-text retrieval technologies. However, cross-modal retrieval between human\nmotion sequences and text has not garnered sufficient attention despite the\nextensive application value it holds, such as aiding virtual reality\napplications in better understanding users' actions and language. This task\npresents several challenges, including joint modeling of the two modalities,\ndemanding the understanding of person-centered information from text, and\nlearning behavior features from 3D human motion sequences. Previous work on\nmotion data modeling mainly relied on autoregressive feature extractors that\nmay forget previous information, while we propose an innovative model that\nincludes simple yet powerful transformer-based motion and text encoders, which\ncan learn representations from the two different modalities and capture\nlong-term dependencies. Furthermore, the overlap of the same atomic actions of\ndifferent human motions can cause semantic conflicts, leading us to explore a\nnew triplet loss function, MildTriple Loss. it leverages the similarity between\nsamples in intra-modal space to guide soft-hard negative sample mining in the\njoint embedding space to train the triplet loss and reduce the violation caused\nby false negative samples. We evaluated our model and method on the latest\nHumanML3D and KIT Motion-Language datasets, achieving a 62.9\\% recall for\nmotion retrieval and a 71.5\\% recall for text retrieval (based on R@10) on the\nHumanML3D dataset. Our code is available at\nhttps://github.com/eanson023/rehamot.\n","authors":["Sheng Yan","Haoqiang Wang","Xin Du","Mengyuan Liu","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2305.04195v2.pdf","comment":"This research was rejected by the submitted journal and needs to be\n revised before submitting"},{"id":"http://arxiv.org/abs/2307.08321v1","updated":"2023-07-17T08:38:46Z","published":"2023-07-17T08:38:46Z","title":"Legal Syllogism Prompting: Teaching Large Language Models for Legal\n Judgment Prediction","summary":" Legal syllogism is a form of deductive reasoning commonly used by legal\nprofessionals to analyze cases. 
In this paper, we propose legal syllogism\nprompting (LoT), a simple prompting method to teach large language models\n(LLMs) for legal judgment prediction. LoT teaches only that in the legal\nsyllogism the major premise is law, the minor premise is the fact, and the\nconclusion is judgment. Then the models can produce a syllogism reasoning of\nthe case and give the judgment without any learning, fine-tuning, or examples.\nOn CAIL2018, a Chinese criminal case dataset, we performed zero-shot judgment\nprediction experiments with GPT-3 models. Our results show that LLMs with LoT\nachieve better performance than the baseline and chain of thought prompting,\nthe state-of-art prompting method on diverse reasoning tasks. LoT enables the\nmodel to concentrate on the key information relevant to the judgment and to\ncorrectly understand the legal meaning of acts, as compared to other methods.\nOur method enables LLMs to predict judgment along with law articles and\njustification, which significantly enhances the explainability of models.\n","authors":["Cong Jiang","Xiaolei Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08321v1.pdf","comment":"Nineteenth International Conference on Artificial Intelligence and\n Law (ICAIL 2023)"},{"id":"http://arxiv.org/abs/2307.08315v1","updated":"2023-07-17T08:23:09Z","published":"2023-07-17T08:23:09Z","title":"IterLara: A Turing Complete Algebra for Big Data, AI, Scientific\n Computing, and Database","summary":" \\textsc{Lara} is a key-value algebra that aims at unifying linear and\nrelational algebra with three types of operation abstraction. The study of\n\\textsc{Lara}'s expressive ability reports that it can represent relational\nalgebra and most linear algebra operations. However, several essential\ncomputations, such as matrix inversion and determinant, cannot be expressed in\n\\textsc{Lara}. \\textsc{Lara} cannot represent global and iterative computation,\neither. This article proposes \\textsc{IterLara}, extending \\textsc{Lara} with\niterative operators, to provide an algebraic model that unifies operations in\ngeneral-purpose computing, like big data, AI, scientific computing, and\ndatabase. We study the expressive ability of \\textsc{Lara} and\n\\textsc{IterLara} and prove that \\textsc{IterLara} with aggregation functions\ncan represent matrix inversion, determinant. Besides, we demonstrate that\n\\textsc{IterLara} with no limitation of function utility is Turing complete. We\nalso propose the Operation Count (OP) as a metric of computation amount for\n\\textsc{IterLara} and ensure that the OP metric is in accordance with the\nexisting computation metrics.\n","authors":["Hongxiao Li","Wanling Gao","Lei Wang","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2307.08315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12200v2","updated":"2023-07-17T08:09:34Z","published":"2023-02-23T17:51:29Z","title":"A Neural Span-Based Continual Named Entity Recognition Model","summary":" Named Entity Recognition (NER) models capable of Continual Learning (CL) are\nrealistically valuable in areas where entity types continuously increase (e.g.,\npersonal assistants). Meanwhile the learning paradigm of NER advances to new\npatterns such as the span-based methods. However, its potential to CL has not\nbeen fully explored. In this paper, we propose SpanKL, a simple yet effective\nSpan-based model with Knowledge distillation (KD) to preserve memories and\nmulti-Label prediction to prevent conflicts in CL-NER. 
Unlike prior sequence\nlabeling approaches, the inherently independent modeling in span and entity\nlevel with the designed coherent optimization on SpanKL promotes its learning\nat each incremental step and mitigates the forgetting. Experiments on synthetic\nCL datasets derived from OntoNotes and Few-NERD show that SpanKL significantly\noutperforms previous SoTA in many aspects, and obtains the smallest gap from CL\nto the upper bound revealing its high practiced value. The code is available at\nhttps://github.com/Qznan/SpanKL.\n","authors":["Yunan Zhang","Qingcai Chen"],"pdf_url":"https://arxiv.org/pdf/2302.12200v2.pdf","comment":"Accepted by AAAI'23 (Update to official format)"},{"id":"http://arxiv.org/abs/2307.08290v1","updated":"2023-07-17T07:24:55Z","published":"2023-07-17T07:24:55Z","title":"CoAD: Automatic Diagnosis through Symptom and Disease Collaborative\n Generation","summary":" Automatic diagnosis (AD), a critical application of AI in healthcare, employs\nmachine learning techniques to assist doctors in gathering patient symptom\ninformation for precise disease diagnosis. The Transformer-based method\nutilizes an input symptom sequence, predicts itself through auto-regression,\nand employs the hidden state of the final symptom to determine the disease.\nDespite its simplicity and superior performance demonstrated, a decline in\ndisease diagnosis accuracy is observed caused by 1) a mismatch between symptoms\nobserved during training and generation, and 2) the effect of different symptom\norders on disease prediction. To address the above obstacles, we introduce the\nCoAD, a novel disease and symptom collaborative generation framework, which\nincorporates several key innovations to improve AD: 1) aligning sentence-level\ndisease labels with multiple possible symptom inquiry steps to bridge the gap\nbetween training and generation; 2) expanding symptom labels for each\nsub-sequence of symptoms to enhance annotation and eliminate the effect of\nsymptom order; 3) developing a repeated symptom input schema to effectively and\nefficiently learn the expanded disease and symptom labels. We evaluate the CoAD\nframework using four datasets, including three public and one private, and\ndemonstrate that it achieves an average 2.3% improvement over previous\nstate-of-the-art results in automatic disease diagnosis. For reproducibility,\nwe release the code and data at https://github.com/KwanWaiChung/coad.\n","authors":["Huimin Wang","Wai-Chung Kwan","Kam-Fai Wong","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08290v1.pdf","comment":"Published as a conference paper at ACL 2023 (long). Code available at\n https://github.com/KwanWaiChung/coad"},{"id":"http://arxiv.org/abs/2303.06662v2","updated":"2023-07-17T07:21:50Z","published":"2023-03-12T13:51:38Z","title":"Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive\n Machine Translation","summary":" Non-autoregressive translation (NAT) reduces the decoding latency but suffers\nfrom performance degradation due to the multi-modality problem. Recently, the\nstructure of directed acyclic graph has achieved great success in NAT, which\ntackles the multi-modality problem by introducing dependency between vertices.\nHowever, training it with negative log-likelihood loss implicitly requires a\nstrict alignment between reference tokens and vertices, weakening its ability\nto handle multiple translation modalities. In this paper, we hold the view that\nall paths in the graph are fuzzily aligned with the reference sentence. 
We do\nnot require the exact alignment but train the model to maximize a fuzzy\nalignment score between the graph and reference, which takes captured\ntranslations in all modalities into account. Extensive experiments on major WMT\nbenchmarks show that our method substantially improves translation performance\nand increases prediction confidence, setting a new state of the art for NAT on\nthe raw training data.\n","authors":["Zhengrui Ma","Chenze Shao","Shangtong Gui","Min Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2303.06662v2.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2307.08272v1","updated":"2023-07-17T06:36:53Z","published":"2023-07-17T06:36:53Z","title":"ChatGPT is Good but Bing Chat is Better for Vietnamese Students","summary":" This paper investigates the performance of two large language models (LLMs),\nChatGPT and Microsoft Bing Chat (BingChat), for Vietnamese students. While\nChatGPT demonstrates competency in various subjects, Bing Chat emerges as the\nsuperior choice. We compare their performances across multiple subjects at high\nschool level, including mathematics, literature, English, physics, chemistry,\nbiology, history, geography, and civic education. Our findings indicate that\nBingChat surpasses ChatGPT in most subjects, except for literature where\nChatGPT outperforms. Moreover, BingChat leverages the more advanced GPT-4\ntechnology compared to ChatGPT based on GPT-3.5, leading to enhanced\nunderstanding and generation of creative and informative text. Furthermore,\nBingChat's availability in Vietnam and its incorporation of hyperlinks in\nanswers further solidify its superiority. We conclude that while ChatGPT is\ncommendable, Bing Chat offers a more comprehensive and advanced solution for\nVietnamese students.\n","authors":["Xuan-Quy Dao","Ngoc-Bich Le"],"pdf_url":"https://arxiv.org/pdf/2307.08272v1.pdf","comment":"12 pages; 6 figures. arxiv admin note: text overlap with\n arXiv:2305.12199"},{"id":"http://arxiv.org/abs/2307.07417v2","updated":"2023-07-17T06:08:22Z","published":"2023-07-11T14:44:14Z","title":"RoPDA: Robust Prompt-based Data Augmentation for Low-Resource Named\n Entity Recognition","summary":" Data augmentation has been widely used in low-resource NER tasks to tackle\nthe problem of data sparsity. However, previous data augmentation methods have\nthe disadvantages of disrupted syntactic structures, token-label mismatch, and\nrequirement for external knowledge or manual effort. To address these issues,\nwe propose Robust Prompt-based Data Augmentation (RoPDA) for low-resource NER.\nBased on pre-trained language models (PLMs) with continuous prompt, RoPDA\nperforms entity augmentation and context augmentation through five fundamental\naugmentation operations to generate label-flipping and label-preserving\nexamples. To optimize the utilization of the augmented samples, we present two\ntechniques: Self-Consistency Filtering and mixup. 
The former effectively\neliminates low-quality samples, while the latter prevents performance\ndegradation arising from the direct utilization of label-flipping samples.\nExtensive experiments on three benchmarks from different domains demonstrate\nthat RoPDA significantly improves upon strong baselines, and also outperforms\nstate-of-the-art semi-supervised learning methods when unlabeled data is\nincluded.\n","authors":["Sihan Song","Furao Shen","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.07417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08260v1","updated":"2023-07-17T06:06:58Z","published":"2023-07-17T06:06:58Z","title":"Extending the Frontier of ChatGPT: Code Generation and Debugging","summary":" Large-scale language models (LLMs) have emerged as a groundbreaking\ninnovation in the realm of question-answering and conversational agents. These\nmodels, leveraging different deep learning architectures such as Transformers,\nare trained on vast corpora to predict sentences based on given queries. Among\nthese LLMs, ChatGPT, developed by OpenAI, has ushered in a new era by utilizing\nartificial intelligence (AI) to tackle diverse problem domains, ranging from\ncomposing essays and biographies to solving intricate mathematical integrals.\nThe versatile applications enabled by ChatGPT offer immense value to users.\nHowever, assessing the performance of ChatGPT's output poses a challenge,\nparticularly in scenarios where queries lack clear objective criteria for\ncorrectness. For instance, evaluating the quality of generated essays becomes\narduous and relies heavily on manual labor, in stark contrast to evaluating\nsolutions to well-defined, closed-ended questions such as mathematical\nproblems. This research paper delves into the efficacy of ChatGPT in solving\nprogramming problems, examining both the correctness and the efficiency of its\nsolution in terms of time and memory complexity. The research reveals a\ncommendable overall success rate of 71.875\\%, denoting the proportion of\nproblems for which ChatGPT was able to provide correct solutions that\nsuccessfully satisfied all the test cases present in Leetcode. It exhibits\nstrengths in structured problems and shows a linear correlation between its\nsuccess rate and problem acceptance rates. However, it struggles to improve\nsolutions based on feedback, pointing to potential shortcomings in debugging\ntasks. These findings provide a compact yet insightful glimpse into ChatGPT's\ncapabilities and areas for improvement.\n","authors":["Fardin Ahsan Sakib","Saadat Hasan Khan","A. H. M. Rezaul Karim"],"pdf_url":"https://arxiv.org/pdf/2307.08260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08247v1","updated":"2023-07-17T05:05:15Z","published":"2023-07-17T05:05:15Z","title":"PAT: Parallel Attention Transformer for Visual Question Answering in\n Vietnamese","summary":" We present in this paper a novel scheme for multimodal learning named the\nParallel Attention mechanism. In addition, to take into account the advantages\nof grammar and context in Vietnamese, we propose the Hierarchical Linguistic\nFeatures Extractor instead of using an LSTM network to extract linguistic\nfeatures. 
Based on these two novel modules, we introduce the Parallel Attention\nTransformer (PAT), achieving the best accuracy compared to all baselines on the\nbenchmark ViVQA dataset and other SOTA methods including SAAA and MCAN.\n","authors":["Nghia Hieu Nguyen","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.08247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.05006v4","updated":"2023-07-17T03:58:28Z","published":"2021-10-11T05:30:30Z","title":"Pre-trained Language Models in Biomedical Domain: A Systematic Survey","summary":" Pre-trained language models (PLMs) have been the de facto paradigm for most\nnatural language processing (NLP) tasks. This also benefits biomedical domain:\nresearchers from informatics, medicine, and computer science (CS) communities\npropose various PLMs trained on biomedical datasets, e.g., biomedical text,\nelectronic health records, protein, and DNA sequences for various biomedical\ntasks. However, the cross-discipline characteristics of biomedical PLMs hinder\ntheir spreading among communities; some existing works are isolated from each\nother without comprehensive comparison and discussions. It expects a survey\nthat not only systematically reviews recent advances of biomedical PLMs and\ntheir applications but also standardizes terminology and benchmarks. In this\npaper, we summarize the recent progress of pre-trained language models in the\nbiomedical domain and their applications in biomedical downstream tasks.\nParticularly, we discuss the motivations and propose a taxonomy of existing\nbiomedical PLMs. Their applications in biomedical downstream tasks are\nexhaustively discussed. At last, we illustrate various limitations and future\ntrends, which we hope can provide inspiration for the future research of the\nresearch community.\n","authors":["Benyou Wang","Qianqian Xie","Jiahuan Pei","Zhihong Chen","Prayag Tiwari","Zhao Li","Jie fu"],"pdf_url":"https://arxiv.org/pdf/2110.05006v4.pdf","comment":"Accepted in ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2307.08217v1","updated":"2023-07-17T03:31:36Z","published":"2023-07-17T03:31:36Z","title":"BASS: Block-wise Adaptation for Speech Summarization","summary":" End-to-end speech summarization has been shown to improve performance over\ncascade baselines. However, such models are difficult to train on very large\ninputs (dozens of minutes or hours) owing to compute restrictions and are hence\ntrained with truncated model inputs. Truncation leads to poorer models, and a\nsolution to this problem rests in block-wise modeling, i.e., processing a\nportion of the input frames at a time. In this paper, we develop a method that\nallows one to train summarization models on very long sequences in an\nincremental manner. Speech summarization is realized as a streaming process,\nwhere hypothesis summaries are updated every block based on new acoustic\ninformation. We devise and test strategies to pass semantic context across the\nblocks. 
Experiments on the How2 dataset demonstrate that the proposed\nblock-wise training method improves by 3 points absolute on ROUGE-L over a\ntruncated input baseline.\n","authors":["Roshan Sharma","Kenneth Zheng","Siddhant Arora","Shinji Watanabe","Rita Singh","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2307.08217v1.pdf","comment":"Accepted at Interspeech 2023"},{"id":"http://arxiv.org/abs/2307.08189v1","updated":"2023-07-17T01:35:56Z","published":"2023-07-17T01:35:56Z","title":"Mini-Giants: \"Small\" Language Models and Open Source Win-Win","summary":" ChatGPT is phenomenal. However, it is prohibitively expensive to train and\nrefine such giant models. Fortunately, small language models are flourishing\nand becoming more and more competent. We call them \"mini-giants\". We argue that\nopen source community like Kaggle and mini-giants will win-win in many ways,\ntechnically, ethically and socially. In this article, we present a brief yet\nrich background, discuss how to attain small language models, present a\ncomparative study of small language models and a brief discussion of evaluation\nmethods, discuss the application scenarios where small language models are most\nneeded in the real world, and conclude with discussion and outlook.\n","authors":["Zhengping Zhou","Lezhi Li","Xinxi Chen","Andy Li"],"pdf_url":"https://arxiv.org/pdf/2307.08189v1.pdf","comment":"16 pages, 1 figure"},{"id":"http://arxiv.org/abs/2307.08859v1","updated":"2023-07-17T21:33:35Z","published":"2023-07-17T21:33:35Z","title":"Curriculum Learning for Graph Neural Networks: A Multiview\n Competence-based Approach","summary":" A curriculum is a planned sequence of learning materials and an effective one\ncan make learning efficient and effective for both humans and machines. Recent\nstudies developed effective data-driven curriculum learning approaches for\ntraining graph neural networks in language applications. However, existing\ncurriculum learning approaches often employ a single criterion of difficulty in\ntheir training paradigms. In this paper, we propose a new perspective on\ncurriculum learning by introducing a novel approach that builds on graph\ncomplexity formalisms (as difficulty criteria) and model competence during\ntraining. The model consists of a scheduling scheme which derives effective\ncurricula by accounting for different views of sample difficulty and model\ncompetence during training. The proposed solution advances existing research in\ncurriculum learning for graph neural networks with the ability to incorporate a\nfine-grained spectrum of graph difficulty criteria in their training paradigms.\nExperimental results on real-world link prediction and node classification\ntasks illustrate the effectiveness of the proposed approach.\n","authors":["Nidhi Vakil","Hadi Amiri"],"pdf_url":"https://arxiv.org/pdf/2307.08859v1.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2307.08813v1","updated":"2023-07-17T20:01:11Z","published":"2023-07-17T20:01:11Z","title":"Comparative Performance Evaluation of Large Language Models for\n Extracting Molecular Interactions and Pathway Knowledge","summary":" Understanding protein interactions and pathway knowledge is crucial for\nunraveling the complexities of living systems and investigating the underlying\nmechanisms of biological functions and complex diseases. While existing\ndatabases provide curated biological data from literature and other sources,\nthey are often incomplete and their maintenance is labor-intensive,\nnecessitating alternative approaches. 
In this study, we propose to harness the\ncapabilities of large language models to address these issues by automatically\nextracting such knowledge from the relevant scientific literature. Toward this\ngoal, in this work, we investigate the effectiveness of different large\nlanguage models in tasks that involve recognizing protein interactions,\npathways, and gene regulatory relations. We thoroughly evaluate the performance\nof various models, highlight the significant findings, and discuss both the\nfuture opportunities and the remaining challenges associated with this\napproach. The code and data are available at:\nhttps://github.com/boxorange/BioIE-LLM\n","authors":["Gilchan Park","Byung-Jun Yoon","Xihaier Luo","Vanessa López-Marrero","Patrick Johnstone","Shinjae Yoo","Francis J. Alexander"],"pdf_url":"https://arxiv.org/pdf/2307.08813v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.14279v2","updated":"2023-07-17T19:01:03Z","published":"2023-05-23T17:25:59Z","title":"Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs","summary":" Large language models (LLMs) have achieved widespread success on a variety of\nin-context few-shot tasks, but this success is typically evaluated via\ncorrectness rather than consistency. We argue that self-consistency is an\nimportant criteria for valid multi-step reasoning in tasks where the solution\nis composed of the answers to multiple sub-steps. We propose two types of\nself-consistency that are particularly important for multi-step reasoning --\nhypothetical consistency (a model's ability to predict what its output would be\nin a hypothetical other context) and compositional consistency (consistency of\na model's final outputs when intermediate sub-steps are replaced with the\nmodel's outputs for those steps). We demonstrate that multiple variants of the\nGPT-3/-4 models exhibit poor consistency rates across both types of consistency\non a variety of tasks.\n","authors":["Angelica Chen","Jason Phang","Alicia Parrish","Vishakh Padmakumar","Chen Zhao","Samuel R. Bowman","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2305.14279v2.pdf","comment":"Added GPT-4 results"},{"id":"http://arxiv.org/abs/2307.08767v1","updated":"2023-07-17T18:27:49Z","published":"2023-07-17T18:27:49Z","title":"A mixed policy to improve performance of language models on math\n problems","summary":" When to solve math problems, most language models take a sampling strategy to\npredict next word according conditional probabilities. In the math reasoning\nstep, it may generate wrong answer. Considering math problems are\ndeterministic, we propose a mixed policy exploration approach to solve math\nproblems with reinforcement learning. In peculiar, we propose a two level token\nexploration policy: the abstract level explores next token with probability and\nthe second level is deterministic. Specifically, the abstract level policy will\ndecide whether the token is operator or operand with probability sampling,\nwhile the second level is deterministic to select next token with the highest\nscore in a greedy way. We test our method on GSM8K dataset with GPT-2 model,\nand demonstrate more than $2\\%$ performance gain. 
Our implementation is\navailable at https://github.com/vividitytech/math_lm_rl.\n","authors":["Gang Chen"],"pdf_url":"https://arxiv.org/pdf/2307.08767v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2307.09390v1","updated":"2023-07-17T09:18:57Z","published":"2023-07-17T09:18:57Z","title":"How do software citation formats evolve over time? A longitudinal\n analysis of R programming language packages","summary":" Under the data-driven research paradigm, research software has come to play\ncrucial roles in nearly every stage of scientific inquiry. Scholars are\nadvocating for the formal citation of software in academic publications,\ntreating it on par with traditional research outputs. However, software is\nhardly consistently cited: one software entity can be cited as different\nobjects, and the citations can change over time. These issues, however, are\nlargely overlooked in existing empirical research on software citation. To fill\nthe above gaps, the present study compares and analyzes a longitudinal dataset\nof citation formats of all R packages collected in 2021 and 2022, in order to\nunderstand the citation formats of R-language packages, important members in\nthe open-source software family, and how the citations evolve over time. In\nparticular, we investigate the different document types underlying the\ncitations and what metadata elements in the citation formats changed over time.\nFurthermore, we offer an in-depth analysis of the disciplinarity of journal\narticles cited as software (software papers). By undertaking this research, we\naim to contribute to a better understanding of the complexities associated with\nsoftware citation, shedding light on future software citation policies and\ninfrastructure.\n","authors":["Yuzhuo Wang","Kai Li"],"pdf_url":"https://arxiv.org/pdf/2307.09390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08720v1","updated":"2023-07-17T04:19:30Z","published":"2023-07-17T04:19:30Z","title":"ivrit.ai: A Comprehensive Dataset of Hebrew Speech for AI Research and\n Development","summary":" We introduce \"ivrit.ai\", a comprehensive Hebrew speech dataset, addressing\nthe distinct lack of extensive, high-quality resources for advancing Automated\nSpeech Recognition (ASR) technology in Hebrew. With over 3,300 speech hours and\na over a thousand diverse speakers, ivrit.ai offers a substantial compilation\nof Hebrew speech across various contexts. It is delivered in three forms to\ncater to varying research needs: raw unprocessed audio; data post-Voice\nActivity Detection, and partially transcribed data. The dataset stands out for\nits legal accessibility, permitting use at no cost, thereby serving as a\ncrucial resource for researchers, developers, and commercial entities. ivrit.ai\nopens up numerous applications, offering vast potential to enhance AI\ncapabilities in Hebrew. 
Future efforts aim to expand ivrit.ai further, thereby\nadvancing Hebrew's standing in AI research and technology.\n","authors":["Yanir Marmor","Kinneret Misgav","Yair Lifshitz"],"pdf_url":"https://arxiv.org/pdf/2307.08720v1.pdf","comment":"9 pages, 1 table and 3 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.08702v1","updated":"2023-07-17T17:59:40Z","published":"2023-07-17T17:59:40Z","title":"Diffusion Models Beat GANs on Image Classification","summary":" While many unsupervised learning models focus on one family of tasks, either\ngenerative or discriminative, we explore the possibility of a unified\nrepresentation learner: a model which uses a single pre-training stage to\naddress both families of tasks simultaneously. We identify diffusion models as\na prime candidate. Diffusion models have risen to prominence as a\nstate-of-the-art method for image generation, denoising, inpainting,\nsuper-resolution, manipulation, etc. Such models involve training a U-Net to\niteratively predict and remove noise, and the resulting model can synthesize\nhigh fidelity, diverse, novel images. The U-Net architecture, as a\nconvolution-based architecture, generates a diverse set of feature\nrepresentations in the form of intermediate feature maps. We present our\nfindings that these embeddings are useful beyond the noise prediction task, as\nthey contain discriminative information and can also be leveraged for\nclassification. We explore optimal methods for extracting and using these\nembeddings for classification tasks, demonstrating promising results on the\nImageNet classification task. We find that with careful feature selection and\npooling, diffusion models outperform comparable generative-discriminative\nmethods such as BigBiGAN for classification tasks. We investigate diffusion\nmodels in the transfer learning regime, examining their performance on several\nfine-grained visual classification datasets. We compare these embeddings to\nthose generated by competing architectures and pre-trainings for classification\ntasks.\n","authors":["Soumik Mukhopadhyay","Matthew Gwilliam","Vatsal Agarwal","Namitha Padmanabhan","Archana Swaminathan","Srinidhi Hegde","Tianyi Zhou","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2307.08702v1.pdf","comment":"15 pages, 7 figures, 10 tables, submission under review"},{"id":"http://arxiv.org/abs/2302.03665v3","updated":"2023-07-17T17:59:37Z","published":"2023-02-07T18:34:59Z","title":"HumanMAC: Masked Motion Completion for Human Motion Prediction","summary":" Human motion prediction is a classical problem in computer vision and\ncomputer graphics, which has a wide range of practical applications. Previous\neffects achieve great empirical performance based on an encoding-decoding\nstyle. The methods of this style work by first encoding previous motions to\nlatent representations and then decoding the latent representations into\npredicted motions. However, in practice, they are still unsatisfactory due to\nseveral issues, including complicated loss constraints, cumbersome training\nprocesses, and scarce switch of different categories of motions in prediction.\nIn this paper, to address the above issues, we jump out of the foregoing style\nand propose a novel framework from a new perspective. Specifically, our\nframework works in a masked completion fashion. In the training stage, we learn\na motion diffusion model that generates motions from random noise. 
In the\ninference stage, with a denoising procedure, we make motion prediction\nconditioning on observed motions to output more continuous and controllable\npredictions. The proposed framework enjoys promising algorithmic properties,\nwhich only needs one loss in optimization and is trained in an end-to-end\nmanner. Additionally, it accomplishes the switch of different categories of\nmotions effectively, which is significant in realistic tasks, e.g., the\nanimation task. Comprehensive experiments on benchmarks confirm the superiority\nof the proposed framework. The project page is available at\nhttps://lhchen.top/Human-MAC.\n","authors":["Ling-Hao Chen","Jiawei Zhang","Yewen Li","Yiren Pang","Xiaobo Xia","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2302.03665v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08700v1","updated":"2023-07-17T17:59:09Z","published":"2023-07-17T17:59:09Z","title":"Fast model inference and training on-board of Satellites","summary":" Artificial intelligence onboard satellites has the potential to reduce data\ntransmission requirements, enable real-time decision-making and collaboration\nwithin constellations. This study deploys a lightweight foundational model\ncalled RaVAEn on D-Orbit's ION SCV004 satellite. RaVAEn is a variational\nauto-encoder (VAE) that generates compressed latent vectors from small image\ntiles, enabling several downstream tasks. In this work we demonstrate the\nreliable use of RaVAEn onboard a satellite, achieving an encoding time of\n0.110s for tiles of a 4.8x4.8 km$^2$ area. In addition, we showcase fast\nfew-shot training onboard a satellite using the latent representation of data.\nWe compare the deployment of the model on the on-board CPU and on the available\nMyriad vision processing unit (VPU) accelerator. To our knowledge, this work\nshows for the first time the deployment of a multi-task model on-board a\nCubeSat and the on-board training of a machine learning model.\n","authors":["Vít Růžička","Gonzalo Mateo-García","Chris Bridges","Chris Brunskill","Cormac Purcell","Nicolas Longépé","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2307.08700v1.pdf","comment":"4 pages, 4 figures, International Geoscience and Remote Sensing\n Symposium (IGARSS) 2023"},{"id":"http://arxiv.org/abs/2307.08699v1","updated":"2023-07-17T17:58:37Z","published":"2023-07-17T17:58:37Z","title":"Pair then Relation: Pair-Net for Panoptic Scene Graph Generation","summary":" Panoptic Scene Graph (PSG) is a challenging task in Scene Graph Generation\n(SGG) that aims to create a more comprehensive scene graph representation using\npanoptic segmentation instead of boxes. However, current PSG methods have\nlimited performance, which can hinder downstream task development. To improve\nPSG methods, we conducted an in-depth analysis to identify the bottleneck of\nthe current PSG models, finding that inter-object pair-wise recall is a crucial\nfactor which was ignored by previous PSG methods. Based on this, we present a\nnovel framework: Pair then Relation (Pair-Net), which uses a Pair Proposal\nNetwork (PPN) to learn and filter sparse pair-wise relationships between\nsubjects and objects. We also observed the sparse nature of object pairs and\nused this insight to design a lightweight Matrix Learner within the PPN.\nThrough extensive ablation and analysis, our approach significantly improves\nupon leveraging the strong segmenter baseline. 
Notably, our approach achieves\nnew state-of-the-art results on the PSG benchmark, with over 10% absolute gains\ncompared to PSGFormer. The code of this paper is publicly available at\nhttps://github.com/king159/Pair-Net.\n","authors":["Jinghao Wang","Zhengyu Wen","Xiangtai Li","Zujin Guo","Jingkang Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08699v1.pdf","comment":"Project Page: https://github.com/king159/Pair-Net"},{"id":"http://arxiv.org/abs/2307.08698v1","updated":"2023-07-17T17:57:56Z","published":"2023-07-17T17:57:56Z","title":"Flow Matching in Latent Space","summary":" Flow matching is a recent framework to train generative models that exhibits\nimpressive empirical performance while being relatively easier to train\ncompared with diffusion-based models. Despite its advantageous properties,\nprior methods still face the challenges of expensive computing and a large\nnumber of function evaluations of off-the-shelf solvers in the pixel space.\nFurthermore, although latent-based generative methods have shown great success\nin recent years, this particular model type remains underexplored in this area.\nIn this work, we propose to apply flow matching in the latent spaces of\npretrained autoencoders, which offers improved computational efficiency and\nscalability for high-resolution image synthesis. This enables flow-matching\ntraining on constrained computational resources while maintaining their quality\nand flexibility. Additionally, our work stands as a pioneering contribution in\nthe integration of various conditions into flow matching for conditional\ngeneration tasks, including label-conditioned image generation, image\ninpainting, and semantic-to-image generation. Through extensive experiments,\nour approach demonstrates its effectiveness in both quantitative and\nqualitative results on various datasets, such as CelebA-HQ, FFHQ, LSUN Church &\nBedroom, and ImageNet. We also provide a theoretical control of the\nWasserstein-2 distance between the reconstructed latent flow distribution and\ntrue data distribution, showing it is upper-bounded by the latent flow matching\nobjective. Our code will be available at\nhttps://github.com/VinAIResearch/LFM.git.\n","authors":["Quan Dao","Hao Phung","Binh Nguyen","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2307.08698v1.pdf","comment":"Project Page: https://vinairesearch.github.io/LFM/"},{"id":"http://arxiv.org/abs/2307.08695v1","updated":"2023-07-17T17:57:01Z","published":"2023-07-17T17:57:01Z","title":"Neural Video Depth Stabilizer","summary":" Video depth estimation aims to infer temporally consistent depth. Some\nmethods achieve temporal consistency by finetuning a single-image depth model\nduring test time using geometry and re-projection constraints, which is\ninefficient and not robust. An alternative approach is to learn how to enforce\ntemporal consistency from data, but this requires well-designed models and\nsufficient video depth data. To address these challenges, we propose a\nplug-and-play framework called Neural Video Depth Stabilizer (NVDS) that\nstabilizes inconsistent depth estimations and can be applied to different\nsingle-image depth models without extra effort. We also introduce a large-scale\ndataset, Video Depth in the Wild (VDW), which consists of 14,203 videos with\nover two million frames, making it the largest natural-scene video depth\ndataset to our knowledge. 
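For context on the latent flow matching entry above: the generic linear-path conditional flow matching objective it builds on can be written in a few lines. The sketch below is the standard formulation from the flow matching literature applied to arbitrary pre-encoded latent vectors, with a stub velocity network; it is not the paper's training code.

import numpy as np

def flow_matching_loss(velocity_net, latents, rng=None):
    """Generic conditional flow matching step on pre-encoded latents.
    latents: (N, D) samples from the data (latent) distribution.
    velocity_net(x_t, t) -> (N, D) predicted velocity field (placeholder).
    The linear path x_t = (1 - t) * noise + t * latents has target velocity
    latents - noise; the loss regresses the network onto that target."""
    rng = rng or np.random.default_rng()
    noise = rng.standard_normal(latents.shape)
    t = rng.uniform(size=(latents.shape[0], 1))
    x_t = (1.0 - t) * noise + t * latents
    target = latents - noise
    pred = velocity_net(x_t, t)
    return np.mean((pred - target) ** 2)

# Stub velocity network for illustration only:
velocity_net = lambda x, t: np.zeros_like(x)
print(flow_matching_loss(velocity_net, np.random.randn(8, 16)))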
We evaluate our method on the VDW dataset as well as\ntwo public benchmarks and demonstrate significant improvements in consistency,\naccuracy, and efficiency compared to previous approaches. Our work serves as a\nsolid baseline and provides a data foundation for learning-based video depth\nmodels. We will release our dataset and code for future research.\n","authors":["Yiran Wang","Min Shi","Jiaqi Li","Zihao Huang","Zhiguo Cao","Jianming Zhang","Ke Xian","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2307.08695v1.pdf","comment":"Our paper is accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.08693v1","updated":"2023-07-17T17:53:36Z","published":"2023-07-17T17:53:36Z","title":"SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor\n Defect Classification and Segmentation","summary":" With continuous progression of Moore's Law, integrated circuit (IC) device\ncomplexity is also increasing. Scanning Electron Microscope (SEM) image based\nextensive defect inspection and accurate metrology extraction are two main\nchallenges in advanced node (2 nm and beyond) technology. Deep learning (DL)\nalgorithm based computer vision approaches gained popularity in semiconductor\ndefect inspection over last few years. In this research work, a new\nsemiconductor defect inspection framework \"SEMI-DiffusionInst\" is investigated\nand compared to previous frameworks. To the best of the authors' knowledge,\nthis work is the first demonstration to accurately detect and precisely segment\nsemiconductor defect patterns by using a diffusion model. Different feature\nextractor networks as backbones and data sampling strategies are investigated\ntowards achieving a balanced trade-off between precision and computing\nefficiency. Our proposed approach outperforms previous work on overall mAP and\nperforms comparatively better or as per for almost all defect classes (per\nclass APs). The bounding box and segmentation mAPs achieved by the proposed\nSEMI-DiffusionInst model are improved by 3.83% and 2.10%,respectively. Among\nindividual defect types, precision on line collapse and thin bridge defects are\nimproved approximately 15% on detection task for both defect types. It has also\nbeen shown that by tuning inference hyperparameters, inference time can be\nimproved significantly without compromising model precision. Finally, certain\nlimitations and future work strategy to overcome them are discussed.\n","authors":["Vic De Ridder","Bappaditya Dey","Sandip Halder","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2307.08693v1.pdf","comment":"6 pages, 5 figures, To be published by IEEE in the proceedings of the\n 2023 ELMAR conference"},{"id":"http://arxiv.org/abs/2307.08682v1","updated":"2023-07-17T17:44:18Z","published":"2023-07-17T17:44:18Z","title":"Implementation of a perception system for autonomous vehicles using a\n detection-segmentation network in SoC FPGA","summary":" Perception and control systems for autonomous vehicles are an active area of\nscientific and industrial research. These solutions should be characterised by\nhigh efficiency in recognising obstacles and other environmental elements in\ndifferent road conditions, real-time capability, and energy efficiency.\nAchieving such functionality requires an appropriate algorithm and a suitable\ncomputing platform. In this paper, we have used the MultiTaskV3\ndetection-segmentation network as the basis for a perception system that can\nperform both functionalities within a single architecture. 
It was appropriately\ntrained, quantised, and implemented on the AMD Xilinx Kria KV260 Vision AI\nembedded platform. By using this device, it was possible to parallelise and\naccelerate the computations. Furthermore, the whole system consumes relatively\nlittle power compared to a CPU-based implementation (an average of 5 watts,\ncompared to the minimum of 55 watts for weaker CPUs, and the small size (119mm\nx 140mm x 36mm) of the platform allows it to be used in devices where the\namount of space available is limited. It also achieves an accuracy higher than\n97% of the mAP (mean average precision) for object detection and above 90% of\nthe mIoU (mean intersection over union) for image segmentation. The article\nalso details the design of the Mecanum wheel vehicle, which was used to test\nthe proposed solution in a mock-up city.\n","authors":["Maciej Baczmanski","Mateusz Wasala","Tomasz Kryjak"],"pdf_url":"https://arxiv.org/pdf/2307.08682v1.pdf","comment":"The paper was accepted for the 19th International Symposium on\n Applied Reconfigurable Computing - ARC 2023, Cottbus - Germany"},{"id":"http://arxiv.org/abs/2307.08673v1","updated":"2023-07-17T17:34:32Z","published":"2023-07-17T17:34:32Z","title":"CohortFinder: an open-source tool for data-driven partitioning of\n biomedical image cohorts to yield robust machine learning models","summary":" Batch effects (BEs) refer to systematic technical differences in data\ncollection unrelated to biological variations whose noise is shown to\nnegatively impact machine learning (ML) model generalizability. Here we release\nCohortFinder, an open-source tool aimed at mitigating BEs via data-driven\ncohort partitioning. We demonstrate CohortFinder improves ML model performance\nin downstream medical image processing tasks. CohortFinder is freely available\nfor download at cohortfinder.com.\n","authors":["Fan Fan","Georgia Martinez","Thomas Desilvio","John Shin","Yijiang Chen","Bangchen Wang","Takaya Ozeki","Maxime W. Lafarge","Viktor H. Koelzer","Laura Barisoni","Anant Madabhushi","Satish E. Viswanath","Andrew Janowczyk"],"pdf_url":"https://arxiv.org/pdf/2307.08673v1.pdf","comment":"26 pages, 9 figures, 4 tables. Abstract was accepted by European\n Society of Digital and Integrative Pathology (ESDIP), Germany, 2022"},{"id":"http://arxiv.org/abs/2307.08663v1","updated":"2023-07-17T17:27:06Z","published":"2023-07-17T17:27:06Z","title":"Quaternion Convolutional Neural Networks: Current Advances and Future\n Directions","summary":" Since their first applications, Convolutional Neural Networks (CNNs) have\nsolved problems that have advanced the state-of-the-art in several domains.\nCNNs represent information using real numbers. Despite encouraging results,\ntheoretical analysis shows that representations such as hyper-complex numbers\ncan achieve richer representational capacities than real numbers, and that\nHamilton products can capture intrinsic interchannel relationships. Moreover,\nin the last few years, experimental research has shown that Quaternion-Valued\nCNNs (QCNNs) can achieve similar performance with fewer parameters than their\nreal-valued counterparts. This paper condenses research in the development of\nQCNNs from its very beginnings. 
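The Hamilton product mentioned in the quaternion CNN survey above is standard quaternion algebra; as a quick reference, a plain-Python version is given below. This is textbook math, not code from any of the surveyed QCNN models.

def hamilton_product(p, q):
    """Hamilton product of two quaternions p = (a, b, c, d) and q = (w, x, y, z),
    each representing a + b*i + c*j + d*k. This non-commutative product is what
    QCNNs use to mix the four channels of a quaternion-valued feature."""
    a, b, c, d = p
    w, x, y, z = q
    return (a*w - b*x - c*y - d*z,
            a*x + b*w + c*z - d*y,
            a*y - b*z + c*w + d*x,
            a*z + b*y - c*x + d*w)

# i * j = k  ->  (0, 0, 0, 1)
print(hamilton_product((0, 1, 0, 0), (0, 0, 1, 0)))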
We propose a conceptual organization of current\ntrends and analyze the main building blocks used in the design of QCNN models.\nBased on this conceptual organization, we propose future directions of\nresearch.\n","authors":["Gerardo Altamirano-Gomez","Carlos Gershenson"],"pdf_url":"https://arxiv.org/pdf/2307.08663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08636v1","updated":"2023-07-17T16:52:25Z","published":"2023-07-17T16:52:25Z","title":"PolyGNN: Polyhedron-based Graph Neural Network for 3D Building\n Reconstruction from Point Clouds","summary":" We present PolyGNN, a polyhedron-based graph neural network for 3D building\nreconstruction from point clouds. PolyGNN learns to assemble primitives\nobtained by polyhedral decomposition via graph node classification, achieving a\nwatertight, compact, and weakly semantic reconstruction. To effectively\nrepresent arbitrary-shaped polyhedra in the neural network, we propose three\ndifferent sampling strategies to select representative points as\npolyhedron-wise queries, enabling efficient occupancy inference. Furthermore,\nwe incorporate the inter-polyhedron adjacency to enhance the classification of\nthe graph nodes. We also observe that existing city-building models are\nabstractions of the underlying instances. To address this abstraction gap and\nprovide a fair evaluation of the proposed method, we develop our method on a\nlarge-scale synthetic dataset covering 500k+ buildings with well-defined ground\ntruths of polyhedral class labels. We further conduct a transferability\nanalysis across cities and on real-world point clouds. Both qualitative and\nquantitative results demonstrate the effectiveness of our method, particularly\nits efficiency for large-scale reconstructions. The source code and data of our\nwork are available at https://github.com/chenzhaiyu/polygnn.\n","authors":["Zhaiyu Chen","Yilei Shi","Liangliang Nan","Zhitong Xiong","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.08636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08629v1","updated":"2023-07-17T16:45:10Z","published":"2023-07-17T16:45:10Z","title":"Deficiency-Aware Masked Transformer for Video Inpainting","summary":" Recent video inpainting methods have made remarkable progress by utilizing\nexplicit guidance, such as optical flow, to propagate cross-frame pixels.\nHowever, there are cases where cross-frame recurrence of the masked video is\nnot available, resulting in a deficiency. In such situation, instead of\nborrowing pixels from other frames, the focus of the model shifts towards\naddressing the inverse problem. In this paper, we introduce a\ndual-modality-compatible inpainting framework called Deficiency-aware Masked\nTransformer (DMT), which offers three key advantages. Firstly, we pretrain a\nimage inpainting model DMT_img serve as a prior for distilling the video model\nDMT_vid, thereby benefiting the hallucination of deficiency cases. Secondly,\nthe self-attention module selectively incorporates spatiotemporal tokens to\naccelerate inference and remove noise signals. Thirdly, a simple yet effective\nReceptive Field Contextualizer is integrated into DMT, further improving\nperformance. Extensive experiments conducted on YouTube-VOS and DAVIS datasets\ndemonstrate that DMT_vid significantly outperforms previous solutions. 
The code\nand video demonstrations can be found at github.com/yeates/DMT.\n","authors":["Yongsheng Yu","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08615v1","updated":"2023-07-17T16:30:44Z","published":"2023-07-17T16:30:44Z","title":"Benchmarking fixed-length Fingerprint Representations across different\n Embedding Sizes and Sensor Types","summary":" Traditional minutiae-based fingerprint representations consist of a\nvariable-length set of minutiae. This necessitates a more complex comparison\ncausing the drawback of high computational cost in one-to-many comparison.\nRecently, deep neural networks have been proposed to extract fixed-length\nembeddings from fingerprints. In this paper, we explore to what extent\nfingerprint texture information contained in such embeddings can be reduced in\nterms of dimension while preserving high biometric performance. This is of\nparticular interest since it would allow to reduce the number of operations\nincurred at comparisons. We also study the impact in terms of recognition\nperformance of the fingerprint textural information for two sensor types, i.e.\noptical and capacitive. Furthermore, the impact of rotation and translation of\nfingerprint images on the extraction of fingerprint embeddings is analysed.\nExperimental results conducted on a publicly available database reveal an\noptimal embedding size of 512 feature elements for the texture-based embedding\npart of fixed-length fingerprint representations. In addition, differences in\nperformance between sensor types can be perceived.\n","authors":["Tim Rohwedder","Daile Osorio-Roig","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2307.08615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08597v1","updated":"2023-07-17T16:07:07Z","published":"2023-07-17T16:07:07Z","title":"Multimodal Diffusion Segmentation Model for Object Segmentation from\n Manipulation Instructions","summary":" In this study, we aim to develop a model that comprehends a natural language\ninstruction (e.g., \"Go to the living room and get the nearest pillow to the\nradio art on the wall\") and generates a segmentation mask for the target\neveryday object. The task is challenging because it requires (1) the\nunderstanding of the referring expressions for multiple objects in the\ninstruction, (2) the prediction of the target phrase of the sentence among the\nmultiple phrases, and (3) the generation of pixel-wise segmentation masks\nrather than bounding boxes. Studies have been conducted on languagebased\nsegmentation methods; however, they sometimes mask irrelevant regions for\ncomplex sentences. In this paper, we propose the Multimodal Diffusion\nSegmentation Model (MDSM), which generates a mask in the first stage and\nrefines it in the second stage. We introduce a crossmodal parallel feature\nextraction mechanism and extend diffusion probabilistic models to handle\ncrossmodal features. To validate our model, we built a new dataset based on the\nwell-known Matterport3D and REVERIE datasets. This dataset consists of\ninstructions with complex referring expressions accompanied by real indoor\nenvironmental images that feature various target objects, in addition to\npixel-wise segmentation masks. 
The performance of MDSM surpassed that of the\nbaseline method by a large margin of +10.13 mean IoU.\n","authors":["Yui Iioka","Yu Yoshida","Yuiga Wada","Shumpei Hatanaka","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2307.08597v1.pdf","comment":"Accepted for presentation at IROS2023"},{"id":"http://arxiv.org/abs/2307.08585v1","updated":"2023-07-17T15:57:52Z","published":"2023-07-17T15:57:52Z","title":"Identity-Preserving Aging of Face Images via Latent Diffusion Models","summary":" The performance of automated face recognition systems is inevitably impacted\nby the facial aging process. However, high quality datasets of individuals\ncollected over several years are typically small in scale. In this work, we\npropose, train, and validate the use of latent text-to-image diffusion models\nfor synthetically aging and de-aging face images. Our models succeed with\nfew-shot training, and have the added benefit of being controllable via\nintuitive textual prompting. We observe high degrees of visual realism in the\ngenerated images while maintaining biometric fidelity measured by commonly used\nmetrics. We evaluate our method on two benchmark datasets (CelebA and AgeDB)\nand observe significant reduction (~44%) in the False Non-Match Rate compared\nto existing state-of the-art baselines.\n","authors":["Sudipta Banerjee","Govind Mittal","Ameya Joshi","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2307.08585v1.pdf","comment":"Accepted to appear in International Joint Conference in Biometrics\n (IJCB) 2023"},{"id":"http://arxiv.org/abs/2307.08581v1","updated":"2023-07-17T15:51:47Z","published":"2023-07-17T15:51:47Z","title":"BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs","summary":" LLMs have demonstrated remarkable abilities at interacting with humans\nthrough language, especially with the usage of instruction-following data.\nRecent advancements in LLMs, such as MiniGPT-4, LLaVA, and X-LLM, further\nenlarge their abilities by incorporating multi-modal inputs, including image,\nvideo, and speech. Despite their effectiveness at generating precise and\ndetailed language understanding of the given modality signal, these LLMs give\nup the ability to ground specific parts of inputs, thus only constructing a\ncoarse-grained mapping. However, explicit and informative correspondence\nbetween text and other modalities will not only improve the user experience but\nalso help to expand the application scenario of multi-modal LLMs. Therefore, we\npropose BuboGPT, a multi-modal LLM with visual grounding that can perform\ncross-modal interaction between vision, audio and language, providing\nfine-grained understanding of visual objects and other given modalities. As a\nresult, BuboGPT is able to point out the specific location of an object in the\nimage, when it is generating response or description for that object. Our\ncontributions are two-fold: 1) An off-the-shelf visual grounding module based\non SAM that extracts entities in a sentence and find corresponding masks in the\nimage. 2) A two-stage training scheme and instruction dataset to endow joint\ntext-image-audio understanding. Our experiments show that BuboGPT achieves\nimpressive multi-modality understanding and visual grounding abilities during\nthe interaction with human. It performs consistently well when provided by\narbitrary modality combinations (either aligned or unaligned). 
Our code, model\nand dataset are available at https://bubo-gpt.github.io .\n","authors":["Yang Zhao","Zhijie Lin","Daquan Zhou","Zilong Huang","Jiashi Feng","Bingyi Kang"],"pdf_url":"https://arxiv.org/pdf/2307.08581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09508v2","updated":"2023-07-17T15:51:03Z","published":"2023-03-16T17:24:41Z","title":"LDMVFI: Video Frame Interpolation with Latent Diffusion Models","summary":" Existing works on video frame interpolation (VFI) mostly employ deep neural\nnetworks trained to minimize the L1 or L2 distance between their outputs and\nground-truth frames. Despite recent advances, existing VFI methods tend to\nproduce perceptually inferior results, particularly for challenging scenarios\nincluding large motions and dynamic textures. Towards developing\nperceptually-oriented VFI methods, we propose latent diffusion model-based VFI,\nLDMVFI. This approaches the VFI problem from a generative perspective by\nformulating it as a conditional generation problem. As the first effort to\naddress VFI using latent diffusion models, we rigorously benchmark our method\nfollowing the common evaluation protocol adopted in the existing VFI\nliterature. Our quantitative experiments and user study indicate that LDMVFI is\nable to interpolate video content with superior perceptual quality compared to\nthe state of the art, even in the high-resolution regime. Our source code will\nbe made available here.\n","authors":["Duolikun Danier","Fan Zhang","David Bull"],"pdf_url":"https://arxiv.org/pdf/2303.09508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08579v1","updated":"2023-07-17T15:47:48Z","published":"2023-07-17T15:47:48Z","title":"Scale-Aware Modulation Meet Transformer","summary":" This paper presents a new vision Transformer, Scale-Aware Modulation\nTransformer (SMT), that can handle various downstream tasks efficiently by\ncombining the convolutional network and vision Transformer. The proposed\nScale-Aware Modulation (SAM) in the SMT includes two primary novel designs.\nFirstly, we introduce the Multi-Head Mixed Convolution (MHMC) module, which can\ncapture multi-scale features and expand the receptive field. Secondly, we\npropose the Scale-Aware Aggregation (SAA) module, which is lightweight but\neffective, enabling information fusion across different heads. By leveraging\nthese two modules, convolutional modulation is further enhanced. Furthermore,\nin contrast to prior works that utilized modulations throughout all stages to\nbuild an attention-free network, we propose an Evolutionary Hybrid Network\n(EHN), which can effectively simulate the shift from capturing local to global\ndependencies as the network becomes deeper, resulting in superior performance.\nExtensive experiments demonstrate that SMT significantly outperforms existing\nstate-of-the-art models across a wide range of visual tasks. Specifically, SMT\nwith 11.5M / 2.4GFLOPs and 32M / 7.7GFLOPs can achieve 82.2% and 84.3% top-1\naccuracy on ImageNet-1K, respectively. After pretrained on ImageNet-22K in\n224^2 resolution, it attains 87.1% and 88.1% top-1 accuracy when finetuned with\nresolution 224^2 and 384^2, respectively. For object detection with Mask R-CNN,\nthe SMT base trained with 1x and 3x schedule outperforms the Swin Transformer\ncounterpart by 4.2 and 1.3 mAP on COCO, respectively. 
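The Multi-Head Mixed Convolution in the SMT abstract above is only described at a high level (multi-scale kernels that expand the receptive field). One plausible reading, splitting channels into head groups that each get a depthwise convolution of a different kernel size, is sketched below in PyTorch; the kernel sizes, grouping, and concatenation are illustrative assumptions, not the SMT implementation.

import torch
import torch.nn as nn

class MixedDepthwiseConv(nn.Module):
    """Hypothetical multi-head mixed convolution: channels are split into groups
    ("heads"), each processed by a depthwise conv with a different kernel size,
    then re-concatenated. Kernel sizes below are illustrative assumptions."""
    def __init__(self, channels, kernel_sizes=(3, 5, 7, 9)):
        super().__init__()
        assert channels % len(kernel_sizes) == 0
        self.group = channels // len(kernel_sizes)
        self.convs = nn.ModuleList([
            nn.Conv2d(self.group, self.group, k, padding=k // 2, groups=self.group)
            for k in kernel_sizes
        ])

    def forward(self, x):
        chunks = torch.split(x, self.group, dim=1)
        return torch.cat([conv(c) for conv, c in zip(self.convs, chunks)], dim=1)

x = torch.randn(1, 64, 32, 32)
print(MixedDepthwiseConv(64)(x).shape)  # torch.Size([1, 64, 32, 32])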
For semantic segmentation\nwith UPerNet, the SMT base test at single- and multi-scale surpasses Swin by\n2.0 and 1.1 mIoU respectively on the ADE20K.\n","authors":["Weifeng Lin","Ziheng Wu","Jiayu Chen","Jun Huang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2307.08579v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2212.07158v2","updated":"2023-07-17T15:45:19Z","published":"2022-12-14T11:20:24Z","title":"Establishing a stronger baseline for lightweight contrastive models","summary":" Recent research has reported a performance degradation in self-supervised\ncontrastive learning for specially designed efficient networks, such as\nMobileNet and EfficientNet. A common practice to address this problem is to\nintroduce a pretrained contrastive teacher model and train the lightweight\nnetworks with distillation signals generated by the teacher. However, it is\ntime and resource consuming to pretrain a teacher model when it is not\navailable. In this work, we aim to establish a stronger baseline for\nlightweight contrastive models without using a pretrained teacher model.\nSpecifically, we show that the optimal recipe for efficient models is different\nfrom that of larger models, and using the same training settings as ResNet50,\nas previous research does, is inappropriate. Additionally, we observe a common\nissu e in contrastive learning where either the positive or negative views can\nbe noisy, and propose a smoothed version of InfoNCE loss to alleviate this\nproblem. As a result, we successfully improve the linear evaluation results\nfrom 36.3\\% to 62.3\\% for MobileNet-V3-Large and from 42.2\\% to 65.8\\% for\nEfficientNet-B0 on ImageNet, closing the accuracy gap to ResNet50 with\n$5\\times$ fewer parameters. We hope our research will facilitate the usage of\nlightweight contrastive models.\n","authors":["Wenye Lin","Yifeng Ding","Zhixiong Cao","Hai-tao Zheng"],"pdf_url":"https://arxiv.org/pdf/2212.07158v2.pdf","comment":"ICME 2023 oral"},{"id":"http://arxiv.org/abs/2307.08551v1","updated":"2023-07-17T15:31:58Z","published":"2023-07-17T15:31:58Z","title":"On the Fly Neural Style Smoothing for Risk-Averse Domain Generalization","summary":" Achieving high accuracy on data from domains unseen during training is a\nfundamental challenge in domain generalization (DG). While state-of-the-art DG\nclassifiers have demonstrated impressive performance across various tasks, they\nhave shown a bias towards domain-dependent information, such as image styles,\nrather than domain-invariant information, such as image content. This bias\nrenders them unreliable for deployment in risk-sensitive scenarios such as\nautonomous driving where a misclassification could lead to catastrophic\nconsequences. To enable risk-averse predictions from a DG classifier, we\npropose a novel inference procedure, Test-Time Neural Style Smoothing (TT-NSS),\nthat uses a \"style-smoothed\" version of the DG classifier for prediction at\ntest time. Specifically, the style-smoothed classifier classifies a test image\nas the most probable class predicted by the DG classifier on random\nre-stylizations of the test image. TT-NSS uses a neural style transfer module\nto stylize a test image on the fly, requires only black-box access to the DG\nclassifier, and crucially, abstains when predictions of the DG classifier on\nthe stylized test images lack consensus. Additionally, we propose a neural\nstyle smoothing (NSS) based training procedure that can be seamlessly\nintegrated with existing DG methods. 
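The test-time procedure in the TT-NSS abstract above (classify random re-stylizations of a test image and answer only when the predictions agree) can be illustrated with a small black-box wrapper. The stylizer, classifier, sample count, and consensus threshold below are placeholders, not the paper's actual TT-NSS code.

import random
from collections import Counter

def style_smoothed_predict(image, classify, stylize, n_samples=16, consensus=0.7):
    """Toy test-time style smoothing with abstention.
    classify(image) -> class label   (black-box DG classifier, placeholder)
    stylize(image)  -> restyled copy (random style transfer, placeholder)
    Returns the majority label over n_samples re-stylizations, or None
    (abstain) when the majority falls below the consensus threshold."""
    votes = Counter(classify(stylize(image)) for _ in range(n_samples))
    label, count = votes.most_common(1)[0]
    return label if count / n_samples >= consensus else None

# Example with trivial stand-ins:
pred = style_smoothed_predict(
    image=None,
    classify=lambda img: random.choice(["car", "car", "car", "bus"]),
    stylize=lambda img: img,
)
print(pred)  # "car" on most runs, None when the votes are too split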
This procedure enhances prediction\nconsistency, improving the performance of TT-NSS on non-abstained samples. Our\nempirical results demonstrate the effectiveness of TT-NSS and NSS at producing\nand improving risk-averse predictions on unseen domains from DG classifiers\ntrained with SOTA training methods on various benchmark datasets and their\nvariations.\n","authors":["Akshay Mehra","Yunbei Zhang","Bhavya Kailkhura","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2307.08551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08559v1","updated":"2023-07-17T15:17:39Z","published":"2023-07-17T15:17:39Z","title":"Improving Data Efficiency for Plant Cover Prediction with Label\n Interpolation and Monte-Carlo Cropping","summary":" The plant community composition is an essential indicator of environmental\nchanges and is, for this reason, usually analyzed in ecological field studies\nin terms of the so-called plant cover. The manual acquisition of this kind of\ndata is time-consuming, laborious, and prone to human error. Automated camera\nsystems can collect high-resolution images of the surveyed vegetation plots at\na high frequency. In combination with subsequent algorithmic analysis, it is\npossible to objectively extract information on plant community composition\nquickly and with little human effort. An automated camera system can easily\ncollect the large amounts of image data necessary to train a Deep Learning\nsystem for automatic analysis. However, due to the amount of work required to\nannotate vegetation images with plant cover data, only few labeled samples are\navailable. As automated camera systems can collect many pictures without\nlabels, we introduce an approach to interpolate the sparse labels in the\ncollected vegetation plot time series down to the intermediate dense and\nunlabeled images to artificially increase our training dataset to seven times\nits original size. Moreover, we introduce a new method we call Monte-Carlo\nCropping. This approach trains on a collection of cropped parts of the training\nimages to deal with high-resolution images efficiently, implicitly augment the\ntraining images, and speed up training. We evaluate both approaches on a plant\ncover dataset containing images of herbaceous plant communities and find that\nour methods lead to improvements in the species, community, and segmentation\nmetrics investigated.\n","authors":["Matthias Körschens","Solveig Franziska Bucher","Christine Römermann","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2307.08559v1.pdf","comment":"Accepted for publication at DAGM-GCPR 2023"},{"id":"http://arxiv.org/abs/2307.08544v1","updated":"2023-07-17T15:04:00Z","published":"2023-07-17T15:04:00Z","title":"Reconstructed Convolution Module Based Look-Up Tables for Efficient\n Image Super-Resolution","summary":" Look-up table(LUT)-based methods have shown the great efficacy in single\nimage super-resolution (SR) task. However, previous methods ignore the\nessential reason of restricted receptive field (RF) size in LUT, which is\ncaused by the interaction of space and channel features in vanilla convolution.\nThey can only increase the RF at the cost of linearly increasing LUT size. To\nenlarge RF with contained LUT sizes, we propose a novel Reconstructed\nConvolution(RC) module, which decouples channel-wise and spatial calculation.\nIt can be formulated as $n^2$ 1D LUTs to maintain $n\\times n$ receptive field,\nwhich is obviously smaller than $n\\times n$D LUT formulated before. 
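Returning to the Monte-Carlo Cropping idea in the plant-cover abstract above (train on random crops of the high-resolution plot images), a minimal numpy sketch follows. The crop size and count are illustrative choices, and the label interpolation of the actual method is not reproduced.

import numpy as np

def monte_carlo_crops(image, crop_size=256, n_crops=8, rng=None):
    """Sample n_crops random square crops from a high-resolution image (H, W, C).
    Training on such crops keeps memory bounded, implicitly augments the data,
    and approximates training on the full image."""
    rng = rng or np.random.default_rng()
    h, w = image.shape[:2]
    crops = []
    for _ in range(n_crops):
        top = rng.integers(0, h - crop_size + 1)
        left = rng.integers(0, w - crop_size + 1)
        crops.append(image[top:top + crop_size, left:left + crop_size])
    return np.stack(crops)

plot = np.zeros((2000, 3000, 3), dtype=np.uint8)  # stand-in for a vegetation image
print(monte_carlo_crops(plot).shape)              # (8, 256, 256, 3)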
The LUT\ngenerated by our RC module reaches less than 1/10000 storage compared with\nSR-LUT baseline. The proposed Reconstructed Convolution module based LUT\nmethod, termed as RCLUT, can enlarge the RF size by 9 times than the\nstate-of-the-art LUT-based SR method and achieve superior performance on five\npopular benchmark dataset. Moreover, the efficient and robust RC module can be\nused as a plugin to improve other LUT-based SR methods. The code is available\nat https://github.com/liuguandu/RC-LUT.\n","authors":["Guandu Liu","Yukang Ding","Mading Li","Ming Sun","Xing Wen","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07246v2","updated":"2023-07-17T15:02:26Z","published":"2023-07-14T09:38:22Z","title":"Knowledge Boosting: Rethinking Medical Contrastive Vision-Language\n Pre-Training","summary":" The foundation models based on pre-training technology have significantly\nadvanced artificial intelligence from theoretical to practical applications.\nThese models have facilitated the feasibility of computer-aided diagnosis for\nwidespread use. Medical contrastive vision-language pre-training, which does\nnot require human annotations, is an effective approach for guiding\nrepresentation learning using description information in diagnostic reports.\nHowever, the effectiveness of pre-training is limited by the large-scale\nsemantic overlap and shifting problems in medical field. To address these\nissues, we propose the Knowledge-Boosting Contrastive Vision-Language\nPre-training framework (KoBo), which integrates clinical knowledge into the\nlearning of vision-language semantic consistency. The framework uses an\nunbiased, open-set sample-wise knowledge representation to measure negative\nsample noise and supplement the correspondence between vision-language mutual\ninformation and clinical knowledge. Extensive experiments validate the effect\nof our framework on eight tasks including classification, segmentation,\nretrieval, and semantic relatedness, achieving comparable or better performance\nwith the zero-shot or few-shot settings. Our code is open on\nhttps://github.com/ChenXiaoFei-CS/KoBo.\n","authors":["Xiaofei Chen","Yuting He","Cheng Xue","Rongjun Ge","Shuo Li","Guanyu Yang"],"pdf_url":"https://arxiv.org/pdf/2307.07246v2.pdf","comment":"accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08536v1","updated":"2023-07-17T14:53:09Z","published":"2023-07-17T14:53:09Z","title":"Variational Probabilistic Fusion Network for RGB-T Semantic Segmentation","summary":" RGB-T semantic segmentation has been widely adopted to handle hard scenes\nwith poor lighting conditions by fusing different modality features of RGB and\nthermal images. Existing methods try to find an optimal fusion feature for\nsegmentation, resulting in sensitivity to modality noise, class-imbalance, and\nmodality bias. To overcome the problems, this paper proposes a novel\nVariational Probabilistic Fusion Network (VPFNet), which regards fusion\nfeatures as random variables and obtains robust segmentation by averaging\nsegmentation results under multiple samples of fusion features. The random\nsamples generation of fusion features in VPFNet is realized by a novel\nVariational Feature Fusion Module (VFFM) designed based on variation attention.\nTo further avoid class-imbalance and modality bias, we employ the weighted\ncross-entropy loss and introduce prior information of illumination and category\nto control the proposed VFFM. 
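The weighted cross-entropy mentioned in the VPFNet abstract above is a standard remedy for class imbalance; a small numpy version is included for reference, with made-up per-class weights. How VPFNet derives its weights from illumination and category priors is not reproduced here.

import numpy as np

def weighted_cross_entropy(logits, targets, class_weights):
    """Weighted cross-entropy over N samples (or pixels).
    logits: (N, C) raw scores, targets: (N,) integer labels,
    class_weights: (C,) larger values up-weight rare classes."""
    logits = logits - logits.max(axis=1, keepdims=True)              # numerical stability
    log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    nll = -log_probs[np.arange(len(targets)), targets]
    w = class_weights[targets]
    return (w * nll).sum() / w.sum()

logits = np.array([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
targets = np.array([0, 2])
print(weighted_cross_entropy(logits, targets, np.array([1.0, 1.0, 5.0])))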
Experimental results on MFNet and PST900 datasets\ndemonstrate that the proposed VPFNet can achieve state-of-the-art segmentation\nperformance.\n","authors":["Baihong Lin","Zengrong Lin","Yulan Guo","Yulan Zhang","Jianxiao Zou","Shicai Fan"],"pdf_url":"https://arxiv.org/pdf/2307.08536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08535v1","updated":"2023-07-17T14:52:52Z","published":"2023-07-17T14:52:52Z","title":"Multi-class point cloud completion networks for 3D cardiac anatomy\n reconstruction from cine magnetic resonance images","summary":" Cine magnetic resonance imaging (MRI) is the current gold standard for the\nassessment of cardiac anatomy and function. However, it typically only acquires\na set of two-dimensional (2D) slices of the underlying three-dimensional (3D)\nanatomy of the heart, thus limiting the understanding and analysis of both\nhealthy and pathological cardiac morphology and physiology. In this paper, we\npropose a novel fully automatic surface reconstruction pipeline capable of\nreconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI\nacquisitions. Its key component is a multi-class point cloud completion network\n(PCCN) capable of correcting both the sparsity and misalignment issues of the\n3D reconstruction task in a unified model. We first evaluate the PCCN on a\nlarge synthetic dataset of biventricular anatomies and observe Chamfer\ndistances between reconstructed and gold standard anatomies below or similar to\nthe underlying image resolution for multiple levels of slice misalignment.\nFurthermore, we find a reduction in reconstruction error compared to a\nbenchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean\nsurface distance, respectively. We then apply the PCCN as part of our automated\nreconstruction pipeline to 1000 subjects from the UK Biobank study in a\ncross-domain transfer setting and demonstrate its ability to reconstruct\naccurate and topologically plausible biventricular heart meshes with clinical\nmetrics comparable to the previous literature. Finally, we investigate the\nrobustness of our proposed approach and observe its capacity to successfully\nhandle multiple common outlier conditions.\n","authors":["Marcel Beetz","Abhirup Banerjee","Julius Ossenberg-Engels","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09765v2","updated":"2023-07-17T14:42:47Z","published":"2023-02-20T05:15:23Z","title":"ENInst: Enhancing Weakly-supervised Low-shot Instance Segmentation","summary":" We address a weakly-supervised low-shot instance segmentation, an\nannotation-efficient training method to deal with novel classes effectively.\nSince it is an under-explored problem, we first investigate the difficulty of\nthe problem and identify the performance bottleneck by conducting systematic\nanalyses of model components and individual sub-tasks with a simple baseline\nmodel. Based on the analyses, we propose ENInst with sub-task enhancement\nmethods: instance-wise mask refinement for enhancing pixel localization quality\nand novel classifier composition for improving classification accuracy. Our\nproposed method lifts the overall performance by enhancing the performance of\neach sub-task. 
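The Chamfer distance used to evaluate the cardiac point cloud completion network earlier in this list is a standard point-cloud metric; a compact numpy version is sketched below (definitions vary slightly across papers, e.g. squared versus unsquared distances).

import numpy as np

def chamfer_distance(p, q):
    """Symmetric Chamfer distance between point clouds p: (N, 3) and q: (M, 3).
    For every point, take the distance to its nearest neighbour in the other
    cloud, then sum the two directional averages (one common variant)."""
    d = np.linalg.norm(p[:, None, :] - q[None, :, :], axis=-1)  # (N, M) pairwise distances
    return d.min(axis=1).mean() + d.min(axis=0).mean()

a = np.random.rand(100, 3)
b = a + 0.01 * np.random.randn(100, 3)
print(chamfer_distance(a, b))  # small value for nearly identical clouds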
We demonstrate that our ENInst is 7.5 times more efficient in\nachieving comparable performance to the existing fully-supervised few-shot\nmodels and even outperforms them at times.\n","authors":["Moon Ye-Bin","Dongmin Choi","Yongjin Kwon","Junsik Kim","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2302.09765v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.08528v1","updated":"2023-07-17T14:40:16Z","published":"2023-07-17T14:40:16Z","title":"Multi-Domain Learning with Modulation Adapters","summary":" Deep convolutional networks are ubiquitous in computer vision, due to their\nexcellent performance across different tasks for various domains. Models are,\nhowever, often trained in isolation for each task, failing to exploit\nrelatedness between tasks and domains to learn more compact models that\ngeneralise better in low-data regimes. Multi-domain learning aims to handle\nrelated tasks, such as image classification across multiple domains,\nsimultaneously. Previous work on this problem explored the use of a pre-trained\nand fixed domain-agnostic base network, in combination with smaller learnable\ndomain-specific adaptation modules. In this paper, we introduce Modulation\nAdapters, which update the convolutional filter weights of the model in a\nmultiplicative manner for each task. Parameterising these adaptation weights in\na factored manner allows us to scale the number of per-task parameters in a\nflexible manner, and to strike different parameter-accuracy trade-offs. We\nevaluate our approach on the Visual Decathlon challenge, composed of ten image\nclassification tasks across different domains, and on the ImageNet-to-Sketch\nbenchmark, which consists of six image classification tasks. Our approach\nyields excellent results, with accuracies that are comparable to or better than\nthose of existing state-of-the-art approaches.\n","authors":["Ekaterina Iakovleva","Karteek Alahari","Jakob Verbeek"],"pdf_url":"https://arxiv.org/pdf/2307.08528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08526v1","updated":"2023-07-17T14:38:11Z","published":"2023-07-17T14:38:11Z","title":"Image Captions are Natural Prompts for Text-to-Image Models","summary":" With the rapid development of Artificial Intelligence Generated Content\n(AIGC), it has become common practice in many learning tasks to train or\nfine-tune large models on synthetic data due to the data-scarcity and privacy\nleakage problems. Albeit promising with unlimited data generation, owing to\nmassive and diverse information conveyed in real images, it is challenging for\ntext-to-image generative models to synthesize informative training data with\nhand-crafted prompts, which usually leads to inferior generalization\nperformance when training downstream models. In this paper, we theoretically\nanalyze the relationship between the training effect of synthetic data and the\nsynthetic data distribution induced by prompts. Then we correspondingly propose\na simple yet effective method that prompts text-to-image generative models to\nsynthesize more informative and diverse training data. Specifically, we caption\neach real image with the advanced captioning model to obtain informative and\nfaithful prompts that extract class-relevant information and clarify the\npolysemy of class names. The image captions and class names are concatenated to\nprompt generative models for training image synthesis. 
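The prompt construction described above (an automatically generated caption of a real image concatenated with its class name, then fed to a text-to-image model) reduces to a few lines. The captioner and generator below are placeholder callables, since the abstract does not commit to specific models.

def build_prompt(class_name, caption):
    """Concatenate the class name with an image caption so the text-to-image
    model receives both the label and class-relevant visual context."""
    return f"{class_name}, {caption}"

def synthesize_training_set(real_images, class_name, caption_model, t2i_model):
    """caption_model(image) -> str and t2i_model(prompt) -> image are
    placeholders for whichever captioner / generator one plugs in."""
    synthetic = []
    for image in real_images:
        prompt = build_prompt(class_name, caption_model(image))
        synthetic.append((t2i_model(prompt), class_name))
    return synthetic

# Example with stub models:
caption_model = lambda img: "a small brown dog running on grass"
t2i_model = lambda prompt: f"<image generated from: {prompt}>"
print(synthesize_training_set(["img0"], "beagle", caption_model, t2i_model))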
Extensive experiments on\nImageNette, ImageNet-100, and ImageNet-1K verify that our method significantly\nimproves the performance of models trained on synthetic training data, i.e.,\n10% classification accuracy improvements on average.\n","authors":["Shiye Lei","Hao Chen","Sen Zhang","Bo Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.08526v1.pdf","comment":"20 pages, 1 figure, 10 tables"},{"id":"http://arxiv.org/abs/2302.06441v3","updated":"2023-07-17T14:13:25Z","published":"2023-02-13T15:19:51Z","title":"ContrasInver: Ultra-Sparse Label Semi-supervised Regression for\n Multi-dimensional Seismic Inversion","summary":" The automated interpretation and inversion of seismic data have advanced\nsignificantly with the development of Deep Learning (DL) methods. However,\nthese methods often require numerous costly well logs, limiting their\napplication only to mature or synthetic data. This paper presents ContrasInver,\na method that achieves seismic inversion using as few as two or three well\nlogs, significantly reducing current requirements. In ContrasInver, we propose\nthree key innovations to address the challenges of applying semi-supervised\nlearning to regression tasks with ultra-sparse labels. The Multi-dimensional\nSample Generation (MSG) technique pioneers a paradigm for sample generation in\nmulti-dimensional inversion. It produces a large number of diverse samples from\na single well, while establishing lateral continuity in seismic data. MSG\nyields substantial improvements over current techniques, even without the use\nof semi-supervised learning. The Region-Growing Training (RGT) strategy\nleverages the inherent continuity of seismic data, effectively propagating\naccuracy from closer to more distant regions based on the proximity of well\nlogs. The Impedance Vectorization Projection (IVP) vectorizes impedance values\nand performs semi-supervised learning in a compressed space. We demonstrated\nthat the Jacobian matrix derived from this space can filter out some outlier\ncomponents in pseudo-label vectors, thereby solving the value confusion issue\nin semi-supervised regression learning. In the experiments, ContrasInver\nachieved state-of-the-art performance in the synthetic data SEAM I. In the\nfield data with two or three well logs, only the methods based on the\ncomponents proposed in this paper were able to achieve reasonable results. It's\nthe first data-driven approach yielding reliable results on the Netherlands F3\nand Delft, using only three and two well logs respectively.\n","authors":["Yimin Dou","Kewen Li","Wenjun Lv","Timing Li","Hongjie Duan","Zhifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2302.06441v3.pdf","comment":"This work has been submitted to journal for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2307.08506v1","updated":"2023-07-17T14:08:38Z","published":"2023-07-17T14:08:38Z","title":"Does Visual Pretraining Help End-to-End Reasoning?","summary":" We aim to investigate whether end-to-end learning of visual reasoning can be\nachieved with general-purpose neural networks, with the help of visual\npretraining. A positive result would refute the common belief that explicit\nvisual abstraction (e.g. object detection) is essential for compositional\ngeneralization on visual reasoning, and confirm the feasibility of a neural\nnetwork \"generalist\" to solve visual recognition and reasoning tasks. 
We\npropose a simple and general self-supervised framework which \"compresses\" each\nvideo frame into a small set of tokens with a transformer network, and\nreconstructs the remaining frames based on the compressed temporal context. To\nminimize the reconstruction loss, the network must learn a compact\nrepresentation for each image, as well as capture temporal dynamics and object\npermanence from temporal context. We perform evaluation on two visual reasoning\nbenchmarks, CATER and ACRE. We observe that pretraining is essential to achieve\ncompositional generalization for end-to-end visual reasoning. Our proposed\nframework outperforms traditional supervised pretraining, including image\nclassification and explicit object detection, by large margins.\n","authors":["Chen Sun","Calvin Luo","Xingyi Zhou","Anurag Arnab","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2307.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08504v1","updated":"2023-07-17T14:08:17Z","published":"2023-07-17T14:08:17Z","title":"BUS:Efficient and Effective Vision-language Pre-training with Bottom-Up\n Patch Summarization","summary":" Vision Transformer (ViT) based Vision-Language Pre-training (VLP) models have\ndemonstrated impressive performance in various tasks. However, the lengthy\nvisual token sequences fed into ViT can lead to training inefficiency and\nineffectiveness. Existing efforts address the challenge by either bottom-level\npatch extraction in the ViT backbone or top-level patch abstraction outside,\nnot balancing training efficiency and effectiveness well. Inspired by text\nsummarization in natural language processing, we propose a Bottom-Up Patch\nSummarization approach named BUS, coordinating bottom-level extraction and\ntop-level abstraction to learn a concise summary of lengthy visual token\nsequences efficiently. Specifically, We incorporate a Text-Semantics-Aware\nPatch Selector (TSPS) into the ViT backbone to perform a coarse-grained visual\ntoken extraction and then attach a flexible Transformer-based Patch Abstraction\nDecoder (PAD) upon the backbone for top-level visual abstraction. This\nbottom-up collaboration enables our BUS to yield high training efficiency while\nmaintaining or even improving effectiveness. We evaluate our approach on\nvarious visual-language understanding and generation tasks and show competitive\ndownstream task performance while boosting the training efficiency by 50\\%.\nAdditionally, our model achieves state-of-the-art performance on many\ndownstream tasks by increasing input image resolution without increasing\ncomputational costs over baselines.\n","authors":["Chaoya Jiang","Haiyang Xu","Wei Ye","Qinghao Ye","Chenliang Li","Ming Yan","Bin Bi","Shikun Zhang","Fei Huang","Songfang Huang"],"pdf_url":"https://arxiv.org/pdf/2307.08504v1.pdf","comment":"Accepted on ICCV2023"},{"id":"http://arxiv.org/abs/2307.08500v1","updated":"2023-07-17T14:03:45Z","published":"2023-07-17T14:03:45Z","title":"Cumulative Spatial Knowledge Distillation for Vision Transformers","summary":" Distilling knowledge from convolutional neural networks (CNNs) is a\ndouble-edged sword for vision transformers (ViTs). It boosts the performance\nsince the image-friendly local-inductive bias of CNN helps ViT learn faster and\nbetter, but leading to two problems: (1) Network designs of CNN and ViT are\ncompletely different, which leads to different semantic levels of intermediate\nfeatures, making spatial-wise knowledge transfer methods (e.g., feature\nmimicking) inefficient. 
(2) Distilling knowledge from CNN limits the network\nconvergence in the later training period since ViT's capability of integrating\nglobal information is suppressed by CNN's local-inductive-bias supervision. To\nthis end, we present Cumulative Spatial Knowledge Distillation (CSKD). CSKD\ndistills spatial-wise knowledge to all patch tokens of ViT from the\ncorresponding spatial responses of CNN, without introducing intermediate\nfeatures. Furthermore, CSKD exploits a Cumulative Knowledge Fusion (CKF)\nmodule, which introduces the global response of CNN and increasingly emphasizes\nits importance during the training. Applying CKF leverages CNN's local\ninductive bias in the early training period and gives full play to ViT's global\ncapability in the later one. Extensive experiments and analysis on ImageNet-1k\nand downstream datasets demonstrate the superiority of our CSKD. Code will be\npublicly available.\n","authors":["Borui Zhao","Renjie Song","Jiajun Liang"],"pdf_url":"https://arxiv.org/pdf/2307.08500v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08492v1","updated":"2023-07-17T13:55:31Z","published":"2023-07-17T13:55:31Z","title":"SVDFormer: Complementing Point Cloud via Self-view Augmentation and\n Self-structure Dual-generator","summary":" In this paper, we propose a novel network, SVDFormer, to tackle two specific\nchallenges in point cloud completion: understanding faithful global shapes from\nincomplete point clouds and generating high-accuracy local structures. Current\nmethods either perceive shape patterns using only 3D coordinates or import\nextra images with well-calibrated intrinsic parameters to guide the geometry\nestimation of the missing parts. However, these approaches do not always fully\nleverage the cross-modal self-structures available for accurate and\nhigh-quality point cloud completion. To this end, we first design a Self-view\nFusion Network that leverages multiple-view depth image information to observe\nincomplete self-shape and generate a compact global shape. To reveal highly\ndetailed structures, we then introduce a refinement module, called\nSelf-structure Dual-generator, in which we incorporate learned shape priors and\ngeometric self-similarities for producing new points. By perceiving the\nincompleteness of each point, the dual-path design disentangles refinement\nstrategies conditioned on the structural type of each point. SVDFormer absorbs\nthe wisdom of self-structures, avoiding any additional paired information such\nas color images with precisely calibrated camera intrinsic parameters.\nComprehensive experiments indicate that our method achieves state-of-the-art\nperformance on widely-used benchmarks. Code will be available at\nhttps://github.com/czvvd/SVDFormer.\n","authors":["Zhe Zhu","Honghua Chen","Xing He","Weiming Wang","Jing Qin","Mingqiang Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08492v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.08483v1","updated":"2023-07-17T13:44:11Z","published":"2023-07-17T13:44:11Z","title":"Differentiable Transportation Pruning","summary":" Deep learning algorithms are increasingly employed at the edge. However, edge\ndevices are resource constrained and thus require efficient deployment of deep\nneural networks. Pruning methods are a key tool for edge deployment as they can\nimprove storage, compute, memory bandwidth, and energy usage. In this paper we\npropose a novel accurate pruning technique that allows precise control over the\noutput network size. 
Our method uses an efficient optimal transportation scheme\nwhich we make end-to-end differentiable and which automatically tunes the\nexploration-exploitation behavior of the algorithm to find accurate sparse\nsub-networks. We show that our method achieves state-of-the-art performance\ncompared to previous pruning methods on 3 different datasets, using 5 different\nmodels, across a wide range of pruning ratios, and with two types of sparsity\nbudgets and pruning granularities.\n","authors":["Yunqiang Li","Jan C. van Gemert","Torsten Hoefler","Bert Moons","Evangelos Eleftheriou","Bram-Ernst Verhoef"],"pdf_url":"https://arxiv.org/pdf/2307.08483v1.pdf","comment":"ICCV 2023. arXiv admin note: text overlap with arXiv:2002.10179 by\n other authors"},{"id":"http://arxiv.org/abs/2307.08476v1","updated":"2023-07-17T13:33:11Z","published":"2023-07-17T13:33:11Z","title":"SkeletonMAE: Graph-based Masked Autoencoder for Skeleton Sequence\n Pre-training","summary":" Skeleton sequence representation learning has shown great advantages for\naction recognition due to its promising ability to model human joints and\ntopology. However, the current methods usually require sufficient labeled data\nfor training computationally expensive models, which is labor-intensive and\ntime-consuming. Moreover, these methods ignore how to utilize the fine-grained\ndependencies among different skeleton joints to pre-train an efficient skeleton\nsequence learning model that can generalize well across different datasets. In\nthis paper, we propose an efficient skeleton sequence learning framework, named\nSkeleton Sequence Learning (SSL). To comprehensively capture the human pose and\nobtain discriminative skeleton sequence representation, we build an asymmetric\ngraph-based encoder-decoder pre-training architecture named SkeletonMAE, which\nembeds skeleton joint sequence into Graph Convolutional Network (GCN) and\nreconstructs the masked skeleton joints and edges based on the prior human\ntopology knowledge. Then, the pre-trained SkeletonMAE encoder is integrated\nwith the Spatial-Temporal Representation Learning (STRL) module to build the\nSSL framework. Extensive experimental results show that our SSL generalizes\nwell across different datasets and outperforms the state-of-the-art\nself-supervised skeleton-based action recognition methods on FineGym, Diving48,\nNTU 60 and NTU 120 datasets. Additionally, we obtain comparable performance to\nsome fully supervised methods. The code is avaliable at\nhttps://github.com/HongYan1123/SkeletonMAE.\n","authors":["Hong Yan","Yang Liu","Yushen Wei","Zhen Li","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2307.08476v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08473v1","updated":"2023-07-17T13:28:58Z","published":"2023-07-17T13:28:58Z","title":"EGE-UNet: an Efficient Group Enhanced UNet for skin lesion segmentation","summary":" Transformer and its variants have been widely used for medical image\nsegmentation. However, the large number of parameter and computational load of\nthese models make them unsuitable for mobile health applications. To address\nthis issue, we propose a more efficient approach, the Efficient Group Enhanced\nUNet (EGE-UNet). We incorporate a Group multi-axis Hadamard Product Attention\nmodule (GHPA) and a Group Aggregation Bridge module (GAB) in a lightweight\nmanner. 
The GHPA groups input features and performs Hadamard Product Attention\nmechanism (HPA) on different axes to extract pathological information from\ndiverse perspectives. The GAB effectively fuses multi-scale information by\ngrouping low-level features, high-level features, and a mask generated by the\ndecoder at each stage. Comprehensive experiments on the ISIC2017 and ISIC2018\ndatasets demonstrate that EGE-UNet outperforms existing state-of-the-art\nmethods. In short, compared to the TransFuse, our model achieves superior\nsegmentation performance while reducing parameter and computation costs by 494x\nand 160x, respectively. Moreover, to our best knowledge, this is the first\nmodel with a parameter count limited to just 50KB. Our code is available at\nhttps://github.com/JCruan519/EGE-UNet.\n","authors":["Jiacheng Ruan","Mingye Xie","Jingsheng Gao","Ting Liu","Yuzhuo Fu"],"pdf_url":"https://arxiv.org/pdf/2307.08473v1.pdf","comment":"10 pages, 4 figures, 2 tables. This paper has been early accepted by\n MICCAI 2023 and has received the MICCAI Student-Author Registration (STAR)\n Award"},{"id":"http://arxiv.org/abs/2307.08467v1","updated":"2023-07-17T13:21:28Z","published":"2023-07-17T13:21:28Z","title":"Riesz feature representation: scale equivariant scattering network for\n classification tasks","summary":" Scattering networks yield powerful and robust hierarchical image descriptors\nwhich do not require lengthy training and which work well with very few\ntraining data. However, they rely on sampling the scale dimension. Hence, they\nbecome sensitive to scale variations and are unable to generalize to unseen\nscales. In this work, we define an alternative feature representation based on\nthe Riesz transform. We detail and analyze the mathematical foundations behind\nthis representation. In particular, it inherits scale equivariance from the\nRiesz transform and completely avoids sampling of the scale dimension.\nAdditionally, the number of features in the representation is reduced by a\nfactor four compared to scattering networks. Nevertheless, our representation\nperforms comparably well for texture classification with an interesting\naddition: scale equivariance. Our method yields superior performance when\ndealing with scales outside of those covered by the training dataset. The\nusefulness of the equivariance property is demonstrated on the digit\nclassification task, where accuracy remains stable even for scales four times\nlarger than the one chosen for training. As a second example, we consider\nclassification of textures.\n","authors":["Tin Barisin","Jesus Angulo","Katja Schladitz","Claudia Redenbach"],"pdf_url":"https://arxiv.org/pdf/2307.08467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08466v1","updated":"2023-07-17T13:21:02Z","published":"2023-07-17T13:21:02Z","title":"Classification of UHF Partial Discharge Signals in Gas-Insulated HVDC\n Systems Using Neural Networks","summary":" Undetected partial discharges (PDs) are a safety critical issue in high\nvoltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC\nvoltage is well-established, the analysis of PDs under DC voltage remains an\nactive research field. 
A key focus of these investigations is the\nclassification of different PD sources to enable subsequent sophisticated\nanalysis.\n In this paper, we propose and analyze a neural network-based approach for\nclassifying PD signals caused by metallic protrusions and conductive particles\non the insulator of HVDC GIS, without relying on pulse sequence analysis\nfeatures. In contrast to previous approaches, our proposed model can\ndiscriminate the studied PD signals obtained at negative and positive\npotentials, while also generalizing to unseen operating voltage multiples.\nAdditionally, we compare the performance of time- and frequency-domain input\nsignals and explore the impact of different normalization schemes to mitigate\nthe influence of free-space path loss between the sensor and defect location.\n","authors":["Steffen Seitz","Thomas Götz","Christopher Lindenberg","Ronald Tetzlaff","Stephan Schlegel"],"pdf_url":"https://arxiv.org/pdf/2307.08466v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.08456v1","updated":"2023-07-17T12:57:02Z","published":"2023-07-17T12:57:02Z","title":"Domain Adaptation using Silver Standard Masks for Lateral Ventricle\n Segmentation in FLAIR MRI","summary":" Lateral ventricular volume (LVV) is an important biomarker for clinical\ninvestigation. We present the first transfer learning-based LVV segmentation\nmethod for fluid-attenuated inversion recovery (FLAIR) MRI. To mitigate\ncovariate shifts between source and target domains, this work proposes an\ndomain adaptation method that optimizes performance on three target datasets.\nSilver standard (SS) masks were generated from the target domain using a novel\nconventional image processing ventricular segmentation algorithm and used to\nsupplement the gold standard (GS) data from the source domain, Canadian\nAtherosclerosis Imaging Network (CAIN). Four models were tested on held-out\ntest sets from four datasets: 1) SS+GS: trained on target SS masks and\nfine-tuned on source GS masks, 2) GS+SS: trained on source GS masks and\nfine-tuned on target SS masks, 3) trained on source GS (GS CAIN Only) and 4)\ntrained on target SS masks (SS Only). The SS+GS model had the best and most\nconsistent performance (mean DSC = 0.89, CoV = 0.05) and showed significantly\n(p < 0.05) higher DSC compared to the GS-only model on three target domains.\nResults suggest pre-training with noisy labels from the target domain allows\nthe model to adapt to the dataset-specific characteristics and provides robust\nparameter initialization while fine-tuning with GS masks allows the model to\nlearn detailed features. This method has wide application to other medical\nimaging problems where labeled data is scarce, and can be used as a per-dataset\ncalibration method to accelerate wide-scale adoption.\n","authors":["Owen Crystal","Pejman J. Maralani","Sandra Black","Alan R. Moody","April Khademi"],"pdf_url":"https://arxiv.org/pdf/2307.08456v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.08448v1","updated":"2023-07-17T12:42:56Z","published":"2023-07-17T12:42:56Z","title":"Not All Steps are Created Equal: Selective Diffusion Distillation for\n Image Manipulation","summary":" Conditional diffusion models have demonstrated impressive performance in\nimage manipulation tasks. The general pipeline involves adding noise to the\nimage and then denoising it. However, this method faces a trade-off problem:\nadding too much noise affects the fidelity of the image while adding too little\naffects its editability. 
This largely limits their practical applicability. In\nthis paper, we propose a novel framework, Selective Diffusion Distillation\n(SDD), that ensures both the fidelity and editability of images. Instead of\ndirectly editing images with a diffusion model, we train a feedforward image\nmanipulation network under the guidance of the diffusion model. Besides, we\npropose an effective indicator to select the semantic-related timestep to\nobtain the correct semantic guidance from the diffusion model. This approach\nsuccessfully avoids the dilemma caused by the diffusion process. Our extensive\nexperiments demonstrate the advantages of our framework. Code is released at\nhttps://github.com/AndysonYs/Selective-Diffusion-Distillation.\n","authors":["Luozhou Wang","Shuai Yang","Shu Liu","Ying-cong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.08448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08436v1","updated":"2023-07-17T12:31:13Z","published":"2023-07-17T12:31:13Z","title":"DOT: A Distillation-Oriented Trainer","summary":" Knowledge distillation transfers knowledge from a large model to a small one\nvia task and distillation losses. In this paper, we observe a trade-off between\ntask and distillation losses, i.e., introducing distillation loss limits the\nconvergence of task loss. We believe that the trade-off results from the\ninsufficient optimization of distillation loss. The reason is: The teacher has\na lower task loss than the student, and a lower distillation loss drives the\nstudent more similar to the teacher, then a better-converged task loss could be\nobtained. To break the trade-off, we propose the Distillation-Oriented Trainer\n(DOT). DOT separately considers gradients of task and distillation losses, then\napplies a larger momentum to distillation loss to accelerate its optimization.\nWe empirically prove that DOT breaks the trade-off, i.e., both losses are\nsufficiently optimized. Extensive experiments validate the superiority of DOT.\nNotably, DOT achieves a +2.59% accuracy improvement on ImageNet-1k for the\nResNet50-MobileNetV1 pair. Conclusively, DOT greatly benefits the student's\noptimization properties in terms of loss convergence and model generalization.\nCode will be made publicly available.\n","authors":["Borui Zhao","Quan Cui","Renjie Song","Jiajun Liang"],"pdf_url":"https://arxiv.org/pdf/2307.08436v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08434v1","updated":"2023-07-17T12:27:15Z","published":"2023-07-17T12:27:15Z","title":"Dense Affinity Matching for Few-Shot Segmentation","summary":" Few-Shot Segmentation (FSS) aims to segment the novel class images with a few\nannotated samples. In this paper, we propose a dense affinity matching (DAM)\nframework to exploit the support-query interaction by densely capturing both\nthe pixel-to-pixel and pixel-to-patch relations in each support-query pair with\nthe bidirectional 3D convolutions. Different from the existing methods that\nremove the support background, we design a hysteretic spatial filtering module\n(HSFM) to filter the background-related query features and retain the\nforeground-related query features with the assistance of the support\nbackground, which is beneficial for eliminating interference objects in the\nquery background. We comprehensively evaluate our DAM on ten benchmarks under\ncross-category, cross-dataset, and cross-domain FSS tasks. 
Experimental results\ndemonstrate that DAM performs very competitively under different settings with\nonly 0.68M parameters, especially under cross-domain FSS tasks, showing its\neffectiveness and efficiency.\n","authors":["Hao Chen","Yonghan Dong","Zheming Lu","Yunlong Yu","Yingming Li","Jungong Han","Zhongfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13005v2","updated":"2023-07-17T12:22:21Z","published":"2023-03-23T02:59:36Z","title":"From Knowledge Distillation to Self-Knowledge Distillation: A Unified\n Approach with Normalized Loss and Customized Soft Labels","summary":" Knowledge Distillation (KD) uses the teacher's prediction logits as soft\nlabels to guide the student, while self-KD does not need a real teacher to\nrequire the soft labels. This work unifies the formulations of the two tasks by\ndecomposing and reorganizing the generic KD loss into a Normalized KD (NKD)\nloss and customized soft labels for both target class (image's category) and\nnon-target classes named Universal Self-Knowledge Distillation (USKD). We\ndecompose the KD loss and find the non-target loss from it forces the student's\nnon-target logits to match the teacher's, but the sum of the two non-target\nlogits is different, preventing them from being identical. NKD normalizes the\nnon-target logits to equalize their sum. It can be generally used for KD and\nself-KD to better use the soft labels for distillation loss. USKD generates\ncustomized soft labels for both target and non-target classes without a\nteacher. It smooths the target logit of the student as the soft target label\nand uses the rank of the intermediate feature to generate the soft non-target\nlabels with Zipf's law. For KD with teachers, our NKD achieves state-of-the-art\nperformance on CIFAR-100 and ImageNet datasets, boosting the ImageNet Top-1\naccuracy of ResNet18 from 69.90% to 71.96% with a ResNet-34 teacher. For\nself-KD without teachers, USKD is the first self-KD method that can be\neffectively applied to both CNN and ViT models with negligible additional time\nand memory cost, resulting in new state-of-the-art results, such as 1.17% and\n0.55% accuracy gains on ImageNet for MobileNet and DeiT-Tiny, respectively. Our\ncodes are available at https://github.com/yzd-v/cls_KD.\n","authors":["Zhendong Yang","Ailing Zeng","Zhe Li","Tianke Zhang","Chun Yuan","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2303.13005v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2302.10893v3","updated":"2023-07-17T12:13:46Z","published":"2023-02-07T18:25:28Z","title":"Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness","summary":" Generative AI models have recently achieved astonishing results in quality\nand are consequently employed in a fast-growing number of applications.\nHowever, since they are highly data-driven, relying on billion-sized datasets\nrandomly scraped from the internet, they also suffer from degenerated and\nbiased human behavior, as we demonstrate. In fact, they may even reinforce such\nbiases. To not only uncover but also combat these undesired effects, we present\na novel strategy, called Fair Diffusion, to attenuate biases after the\ndeployment of generative text-to-image models. Specifically, we demonstrate\nshifting a bias, based on human instructions, in any direction yielding\narbitrarily new proportions for, e.g., identity groups. 
As our empirical\nevaluation demonstrates, this introduced control enables instructing generative\nimage models on fairness, with no data filtering and additional training\nrequired.\n","authors":["Felix Friedrich","Manuel Brack","Lukas Struppek","Dominik Hintersdorf","Patrick Schramowski","Sasha Luccioni","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2302.10893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08417v1","updated":"2023-07-17T11:57:04Z","published":"2023-07-17T11:57:04Z","title":"Divide&Classify: Fine-Grained Classification for City-Wide Visual Place\n Recognition","summary":" Visual Place recognition is commonly addressed as an image retrieval problem.\nHowever, retrieval methods are impractical to scale to large datasets, densely\nsampled from city-wide maps, since their dimension impact negatively on the\ninference time. Using approximate nearest neighbour search for retrieval helps\nto mitigate this issue, at the cost of a performance drop. In this paper we\ninvestigate whether we can effectively approach this task as a classification\nproblem, thus bypassing the need for a similarity search. We find that existing\nclassification methods for coarse, planet-wide localization are not suitable\nfor the fine-grained and city-wide setting. This is largely due to how the\ndataset is split into classes, because these methods are designed to handle a\nsparse distribution of photos and as such do not consider the visual aliasing\nproblem across neighbouring classes that naturally arises in dense scenarios.\nThus, we propose a partitioning scheme that enables a fast and accurate\ninference, preserving a simple learning procedure, and a novel inference\npipeline based on an ensemble of novel classifiers that uses the prototypes\nlearned via an angular margin loss. Our method, Divide&Classify (D&C), enjoys\nthe fast inference of classification solutions and an accuracy competitive with\nretrieval methods on the fine-grained, city-wide setting. Moreover, we show\nthat D&C can be paired with existing retrieval pipelines to speed up\ncomputations by over 20 times while increasing their recall, leading to new\nstate-of-the-art results.\n","authors":["Gabriele Trivigno","Gabriele Berton","Carlo Masone","Juan Aragon","Barbara Caputo"],"pdf_url":"https://arxiv.org/pdf/2307.08417v1.pdf","comment":"Accepted to ICCV23"},{"id":"http://arxiv.org/abs/2307.08415v1","updated":"2023-07-17T11:55:27Z","published":"2023-07-17T11:55:27Z","title":"Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active\n Learning","summary":" We propose a novel semi-supervised active learning (SSAL) framework for\nmonocular 3D object detection with LiDAR guidance (MonoLiG), which leverages\nall modalities of collected data during model development. We utilize LiDAR to\nguide the data selection and training of monocular 3D detectors without\nintroducing any overhead in the inference phase. During training, we leverage\nthe LiDAR teacher, monocular student cross-modal framework from semi-supervised\nlearning to distill information from unlabeled data as pseudo-labels. To handle\nthe differences in sensor characteristics, we propose a data noise-based\nweighting mechanism to reduce the effect of propagating noise from LiDAR\nmodality to monocular. For selecting which samples to label to improve the\nmodel performance, we propose a sensor consistency-based selection score that\nis also coherent with the training objective. 
Extensive experimental results on\nKITTI and Waymo datasets verify the effectiveness of our proposed framework. In\nparticular, our selection strategy consistently outperforms state-of-the-art\nactive learning baselines, yielding up to 17% better saving rate in labeling\ncosts. Our training strategy attains the top place in KITTI 3D and\nbirds-eye-view (BEV) monocular object detection official benchmarks by\nimproving the BEV Average Precision (AP) by 2.02.\n","authors":["Aral Hekimoglu","Michael Schmidt","Alvaro Marcos-Ramiro"],"pdf_url":"https://arxiv.org/pdf/2307.08415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08414v1","updated":"2023-07-17T11:55:20Z","published":"2023-07-17T11:55:20Z","title":"Active Learning for Object Detection with Non-Redundant Informative\n Sampling","summary":" Curating an informative and representative dataset is essential for enhancing\nthe performance of 2D object detectors. We present a novel active learning\nsampling strategy that addresses both the informativeness and diversity of the\nselections. Our strategy integrates uncertainty and diversity-based selection\nprinciples into a joint selection objective by measuring the collective\ninformation score of the selected samples. Specifically, our proposed NORIS\nalgorithm quantifies the impact of training with a sample on the\ninformativeness of other similar samples. By exclusively selecting samples that\nare simultaneously informative and distant from other highly informative\nsamples, we effectively avoid redundancy while maintaining a high level of\ninformativeness. Moreover, instead of utilizing whole image features to\ncalculate distances between samples, we leverage features extracted from\ndetected object regions within images to define object features. This allows us\nto construct a dataset encompassing diverse object types, shapes, and angles.\nExtensive experiments on object detection and image classification tasks\ndemonstrate the effectiveness of our strategy over the state-of-the-art\nbaselines. Specifically, our selection strategy achieves a 20% and 30%\nreduction in labeling costs compared to random selection for PASCAL-VOC and\nKITTI, respectively.\n","authors":["Aral Hekimoglu","Adrian Brucker","Alper Kagan Kayali","Michael Schmidt","Alvaro Marcos-Ramiro"],"pdf_url":"https://arxiv.org/pdf/2307.08414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08397v1","updated":"2023-07-17T11:29:48Z","published":"2023-07-17T11:29:48Z","title":"CLIP-Guided StyleGAN Inversion for Text-Driven Real Image Editing","summary":" Researchers have recently begun exploring the use of StyleGAN-based models\nfor real image editing. One particularly interesting application is using\nnatural language descriptions to guide the editing process. Existing approaches\nfor editing images using language either resort to instance-level latent code\noptimization or map predefined text prompts to some editing directions in the\nlatent space. However, these approaches have inherent limitations. The former\nis not very efficient, while the latter often struggles to effectively handle\nmulti-attribute changes. To address these weaknesses, we present CLIPInverter,\na new text-driven image editing approach that is able to efficiently and\nreliably perform multi-attribute changes. The core of our method is the use of\nnovel, lightweight text-conditioned adapter layers integrated into pretrained\nGAN-inversion networks. 
We demonstrate that by conditioning the initial\ninversion step on the CLIP embedding of the target description, we are able to\nobtain more successful edit directions. Additionally, we use a CLIP-guided\nrefinement step to make corrections in the resulting residual latent codes,\nwhich further improves the alignment with the text prompt. Our method\noutperforms competing approaches in terms of manipulation accuracy and\nphoto-realism on various domains including human faces, cats, and birds, as\nshown by our qualitative and quantitative results.\n","authors":["Ahmet Canberk Baykal","Abdul Basit Annes","Duygu Ceylan","Erkut Erdem","Aykut Erdem","Deniz Yurt"],"pdf_url":"https://arxiv.org/pdf/2307.08397v1.pdf","comment":"Accepted for publication in ACM Transactions on Graphics"},{"id":"http://arxiv.org/abs/2307.08388v1","updated":"2023-07-17T10:55:58Z","published":"2023-07-17T10:55:58Z","title":"Dynamic Snake Convolution based on Topological Geometric Constraints for\n Tubular Structure Segmentation","summary":" Accurate segmentation of topological tubular structures, such as blood\nvessels and roads, is crucial in various fields, ensuring accuracy and\nefficiency in downstream tasks. However, many factors complicate the task,\nincluding thin local structures and variable global morphologies. In this work,\nwe note the specificity of tubular structures and use this knowledge to guide\nour DSCNet to simultaneously enhance perception in three stages: feature\nextraction, feature fusion, and loss constraint. First, we propose a dynamic\nsnake convolution to accurately capture the features of tubular structures by\nadaptively focusing on slender and tortuous local structures. Subsequently, we\npropose a multi-view feature fusion strategy to complement the attention to\nfeatures from multiple perspectives during feature fusion, ensuring the\nretention of important information from different global morphologies. Finally,\na continuity constraint loss function, based on persistent homology, is\nproposed to constrain the topological continuity of the segmentation better.\nExperiments on 2D and 3D datasets show that our DSCNet provides better accuracy\nand continuity on the tubular structure segmentation task compared with several\nmethods. Our codes will be publicly available.\n","authors":["Yaolei Qi","Yuting He","Xiaoming Qi","Yuan Zhang","Guanyu Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08388v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2209.05379v4","updated":"2023-07-17T10:53:03Z","published":"2022-09-12T16:31:34Z","title":"Action-based Early Autism Diagnosis Using Contrastive Feature Learning","summary":" Autism, also known as Autism Spectrum Disorder (or ASD), is a neurological\ndisorder. Its main symptoms include difficulty in (verbal and/or non-verbal)\ncommunication, and rigid/repetitive behavior. These symptoms are often\nindistinguishable from a normal (control) individual, due to which this\ndisorder remains undiagnosed in early childhood leading to delayed treatment.\nSince the learning curve is steep during the initial age, an early diagnosis of\nautism could allow to take adequate interventions at the right time, which\nmight positively affect the growth of an autistic child. Further, the\ntraditional methods of autism diagnosis require multiple visits to a\nspecialized psychiatrist, however this process can be time-consuming. 
In this\npaper, we present a learning based approach to automate autism diagnosis using\nsimple and small action video clips of subjects. This task is particularly\nchallenging because the amount of annotated data available is small, and the\nvariations among samples from the two categories (ASD and control) are\ngenerally indistinguishable. This is also evident from poor performance of a\nbinary classifier learned using the cross-entropy loss on top of a baseline\nencoder. To address this, we adopt contrastive feature learning in both self\nsupervised and supervised learning frameworks, and show that these can lead to\na significant increase in the prediction accuracy of a binary classifier on\nthis task. We further validate this by conducting thorough experimental\nanalyses under different set-ups on two publicly available datasets.\n","authors":["Asha Rani","Pankaj Yadav","Yashaswi Verma"],"pdf_url":"https://arxiv.org/pdf/2209.05379v4.pdf","comment":"This preprint has not undergone peer review (when applicable) or any\n postsubmission improvements or corrections. The Version of Record of this\n article is published in Multimedia Systems (2023), and is available online at\n https://doi.org/10.1007/s00530-023-01132-8"},{"id":"http://arxiv.org/abs/2307.08383v1","updated":"2023-07-17T10:43:54Z","published":"2023-07-17T10:43:54Z","title":"Distributed bundle adjustment with block-based sparse matrix compression\n for super large scale datasets","summary":" We propose a distributed bundle adjustment (DBA) method using the exact\nLevenberg-Marquardt (LM) algorithm for super large-scale datasets. Most of the\nexisting methods partition the global map to small ones and conduct bundle\nadjustment in the submaps. In order to fit the parallel framework, they use\napproximate solutions instead of the LM algorithm. However, those methods often\ngive sub-optimal results. Different from them, we utilize the exact LM\nalgorithm to conduct global bundle adjustment where the formation of the\nreduced camera system (RCS) is actually parallelized and executed in a\ndistributed way. To store the large RCS, we compress it with a block-based\nsparse matrix compression format (BSMC), which fully exploits its block\nfeature. The BSMC format also enables the distributed storage and updating of\nthe global RCS. The proposed method is extensively evaluated and compared with\nthe state-of-the-art pipelines using both synthetic and real datasets.\nPreliminary results demonstrate the efficient memory usage and vast scalability\nof the proposed method compared with the baselines. For the first time, we\nconducted parallel bundle adjustment using LM algorithm on a real datasets with\n1.18 million images and a synthetic dataset with 10 million images (about 500\ntimes that of the state-of-the-art LM-based BA) on a distributed computing\nsystem.\n","authors":["Maoteng Zheng","Nengcheng Chen","Junfeng Zhu","Xiaoru Zeng","Huanbin Qiu","Yuyao Jiang","Xingyue Lu","Hao Qu"],"pdf_url":"https://arxiv.org/pdf/2307.08383v1.pdf","comment":"accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.04628v2","updated":"2023-07-17T10:35:00Z","published":"2023-05-08T11:10:25Z","title":"Target-driven One-Shot Unsupervised Domain Adaptation","summary":" In this paper, we introduce a novel framework for the challenging problem of\nOne-Shot Unsupervised Domain Adaptation (OSUDA), which aims to adapt to a\ntarget domain with only a single unlabeled target sample. 
Unlike existing\napproaches that rely on large labeled source and unlabeled target data, our\nTarget-driven One-Shot UDA (TOS-UDA) approach employs a learnable augmentation\nstrategy guided by the target sample's style to align the source distribution\nwith the target distribution. Our method consists of three modules: an\naugmentation module, a style alignment module, and a classifier. Unlike\nexisting methods, our augmentation module allows for strong transformations of\nthe source samples, and the style of the single target sample available is\nexploited to guide the augmentation by ensuring perceptual similarity.\nFurthermore, our approach integrates augmentation with style alignment,\neliminating the need for separate pre-training on additional datasets. Our\nmethod outperforms or performs comparably to existing OS-UDA methods on the\nDigits and DomainNet benchmarks.\n","authors":["Julio Ivan Davila Carrazco","Suvarna Kishorkumar Kadam","Pietro Morerio","Alessio Del Bue","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2305.04628v2.pdf","comment":"Accepted to 22nd International Conference on IMAGE ANALYSIS AND\n PROCESSING (ICIAP) 2023"},{"id":"http://arxiv.org/abs/2303.04654v2","updated":"2023-07-17T10:04:35Z","published":"2023-03-08T15:21:33Z","title":"Aberration-Aware Depth-from-Focus","summary":" Computer vision methods for depth estimation usually use simple camera models\nwith idealized optics. For modern machine learning approaches, this creates an\nissue when attempting to train deep networks with simulated data, especially\nfor focus-sensitive tasks like Depth-from-Focus. In this work, we investigate\nthe domain gap caused by off-axis aberrations that will affect the decision of\nthe best-focused frame in a focal stack. We then explore bridging this domain\ngap through aberration-aware training (AAT). Our approach involves a\nlightweight network that models lens aberrations at different positions and\nfocus distances, which is then integrated into the conventional network\ntraining pipeline. We evaluate the generality of pretrained models on both\nsynthetic and real-world data. Our experimental results demonstrate that the\nproposed AAT scheme can improve depth estimation accuracy without fine-tuning\nthe model or modifying the network architecture.\n","authors":["Xinge Yang","Qiang Fu","Mohammed Elhoseiny","Wolfgang Heidrich"],"pdf_url":"https://arxiv.org/pdf/2303.04654v2.pdf","comment":"[ICCP & TPAMI 2023] Considering optical aberrations during network\n training can improve the generalizability"},{"id":"http://arxiv.org/abs/2307.08357v1","updated":"2023-07-17T09:50:03Z","published":"2023-07-17T09:50:03Z","title":"Self-supervised Monocular Depth Estimation: Let's Talk About The Weather","summary":" Current, self-supervised depth estimation architectures rely on clear and\nsunny weather scenes to train deep neural networks. However, in many locations,\nthis assumption is too strong. For example in the UK (2021), 149 days consisted\nof rain. For these architectures to be effective in real-world applications, we\nmust create models that can generalise to all weather conditions, times of the\nday and image qualities. Using a combination of computer graphics and\ngenerative models, one can augment existing sunny-weather data in a variety of\nways that simulate adverse weather effects. While it is tempting to use such\ndata augmentations for self-supervised depth, in the past this was shown to\ndegrade performance instead of improving it. 
In this paper, we put forward a\nmethod that uses augmentations to remedy this problem. By exploiting the\ncorrespondence between unaugmented and augmented data we introduce a\npseudo-supervised loss for both depth and pose estimation. This brings back\nsome of the benefits of supervised learning while still not requiring any\nlabels. We also make a series of practical recommendations which collectively\noffer a reliable, efficient framework for weather-related augmentation of\nself-supervised depth from monocular video. We present extensive testing to\nshow that our method, Robust-Depth, achieves SotA performance on the KITTI\ndataset while significantly surpassing SotA on challenging, adverse condition\ndata such as DrivingStereo, Foggy CityScape and NuScenes-Night. The project\nwebsite can be found here https://kieran514.github.io/Robust-Depth-Project/.\n","authors":["Kieran Saunders","George Vogiatzis","Luis Manso"],"pdf_url":"https://arxiv.org/pdf/2307.08357v1.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2307.07125v2","updated":"2023-07-17T09:47:49Z","published":"2023-07-14T02:26:05Z","title":"CeRF: Convolutional Neural Radiance Fields for New View Synthesis with\n Derivatives of Ray Modeling","summary":" In recent years, novel view synthesis has gained popularity in generating\nhigh-fidelity images. While demonstrating superior performance in the task of\nsynthesizing novel views, the majority of these methods are still based on the\nconventional multi-layer perceptron for scene embedding. Furthermore, light\nfield models suffer from geometric blurring during pixel rendering, while\nradiance field-based volume rendering methods have multiple solutions for a\ncertain target of density distribution integration. To address these issues, we\nintroduce the Convolutional Neural Radiance Fields to model the derivatives of\nradiance along rays. Based on 1D convolutional operations, our proposed method\neffectively extracts potential ray representations through a structured neural\nnetwork architecture. Besides, with the proposed ray modeling, a proposed\nrecurrent module is employed to solve geometric ambiguity in the fully neural\nrendering process. Extensive experiments demonstrate the promising results of\nour proposed model compared with existing state-of-the-art methods.\n","authors":["Xiaoyan Yang","Dingbo Lu","Yang Li","Chenhui Li","Changbo Wang"],"pdf_url":"https://arxiv.org/pdf/2307.07125v2.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.08353v1","updated":"2023-07-17T09:45:19Z","published":"2023-07-17T09:45:19Z","title":"Box-DETR: Understanding and Boxing Conditional Spatial Queries","summary":" Conditional spatial queries are recently introduced into DEtection\nTRansformer (DETR) to accelerate convergence. In DAB-DETR, such queries are\nmodulated by the so-called conditional linear projection at each decoder stage,\naiming to search for positions of interest such as the four extremities of the\nbox. Each decoder stage progressively updates the box by predicting the anchor\nbox offsets, while in cross-attention only the box center is informed as the\nreference point. The use of only box center, however, leaves the width and\nheight of the previous box unknown to the current stage, which hinders accurate\nprediction of offsets. We argue that the explicit use of the entire box\ninformation in cross-attention matters. In this work, we propose Box Agent to\ncondense the box into head-specific agent points. 
By replacing the box center\nwith the agent point as the reference point in each head, the conditional\ncross-attention can search for positions from a more reasonable starting point\nby considering the full scope of the previous box, rather than always from the\nprevious box center. This significantly reduces the burden of the conditional\nlinear projection. Experimental results show that the box agent leads to not\nonly faster convergence but also improved detection performance, e.g., our\nsingle-scale model achieves $44.2$ AP with ResNet-50 based on DAB-DETR. Our Box\nAgent requires minor modifications to the code and has negligible computational\nworkload. Code is available at https://github.com/tiny-smart/box-detr.\n","authors":["Wenze Liu","Hao Lu","Yuliang Liu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2307.08353v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2307.08351v1","updated":"2023-07-17T09:41:01Z","published":"2023-07-17T09:41:01Z","title":"Neural Modulation Fields for Conditional Cone Beam Neural Tomography","summary":" Conventional Computed Tomography (CT) methods require large numbers of\nnoise-free projections for accurate density reconstructions, limiting their\napplicability to the more complex class of Cone Beam Geometry CT (CBCT)\nreconstruction. Recently, deep learning methods have been proposed to overcome\nthese limitations, with methods based on neural fields (NF) showing strong\nperformance, by approximating the reconstructed density through a\ncontinuous-in-space coordinate based neural network. Our focus is on improving\nsuch methods, however, unlike previous work, which requires training an NF from\nscratch for each new set of projections, we instead propose to leverage\nanatomical consistencies over different scans by training a single conditional\nNF on a dataset of projections. We propose a novel conditioning method where\nlocal modulations are modeled per patient as a field over the input domain\nthrough a Neural Modulation Field (NMF). The resulting Conditional Cone Beam\nNeural Tomography (CondCBNT) shows improved performance for both high and low\nnumbers of available projections on noise-free and noisy data.\n","authors":["Samuele Papa","David M. Knigge","Riccardo Valperga","Nikita Moriakov","Miltos Kofinas","Jan-Jakob Sonke","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2307.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08348v1","updated":"2023-07-17T09:40:02Z","published":"2023-07-17T09:40:02Z","title":"Adaptive Local Basis Functions for Shape Completion","summary":" In this paper, we focus on the task of 3D shape completion from partial point\nclouds using deep implicit functions. Existing methods seek to use voxelized\nbasis functions or the ones from a certain family of functions (e.g.,\nGaussians), which leads to high computational costs or limited shape\nexpressivity. On the contrary, our method employs adaptive local basis\nfunctions, which are learned end-to-end and not restricted in certain forms.\nBased on those basis functions, a local-to-local shape completion framework is\npresented. Our algorithm learns sparse parameterization with a small number of\nbasis functions while preserving local geometric details during completion.\nQuantitative and qualitative experiments demonstrate that our method\noutperforms the state-of-the-art methods in shape completion, detail\npreservation, generalization to unseen geometries, and computational cost. 
Code\nand data are at https://github.com/yinghdb/Adaptive-Local-Basis-Functions.\n","authors":["Hui Ying","Tianjia Shao","He Wang","Yin Yang","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.08348v1.pdf","comment":"In SIGGRAPH 2023"},{"id":"http://arxiv.org/abs/2307.08347v1","updated":"2023-07-17T09:38:41Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\\%. Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06608v2","updated":"2023-07-17T09:34:34Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v2.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2307.08339v1","updated":"2023-07-17T09:26:13Z","published":"2023-07-17T09:26:13Z","title":"Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection","summary":" Accurate and robust object detection is critical for autonomous driving.\nImage-based detectors face difficulties caused by low visibility in adverse\nweather conditions. 
Thus, radar-camera fusion is of particular interest but\npresents challenges in optimally fusing heterogeneous data sources. To approach\nthis issue, we propose two new radar preprocessing techniques to better align\nradar and camera data. In addition, we introduce a Multi-Task Cross-Modality\nAttention-Fusion Network (MCAF-Net) for object detection, which includes two\nnew fusion blocks. These allow for exploiting information from the feature maps\nmore comprehensively. The proposed algorithm jointly detects objects and\nsegments free space, which guides the model to focus on the more relevant part\nof the scene, namely, the occupied space. Our approach outperforms current\nstate-of-the-art radar-camera fusion-based object detectors in the nuScenes\ndataset and achieves more robust results in adverse weather conditions and\nnighttime scenarios.\n","authors":["Huawei Sun","Hao Feng","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2307.08339v1.pdf","comment":"Accepted by ITSC 2023"},{"id":"http://arxiv.org/abs/2211.16198v3","updated":"2023-07-17T09:24:49Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v3.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2212.01450v2","updated":"2023-07-17T09:09:14Z","published":"2022-12-02T21:21:40Z","title":"Crowd Density Estimation using Imperfect Labels","summary":" Density estimation is one of the most widely used methods for crowd counting\nin which a deep learning model learns from head-annotated crowd images to\nestimate crowd density in unseen images. Typically, the learning performance of\nthe model is highly impacted by the accuracy of the annotations and inaccurate\nannotations may lead to localization and counting errors during prediction. A\nsignificant amount of works exist on crowd counting using perfectly labelled\ndatasets but none of these explore the impact of annotation errors on the model\naccuracy. In this paper, we investigate the impact of imperfect labels (both\nnoisy and missing labels) on crowd counting accuracy. 
We propose a system that\nautomatically generates imperfect labels using a deep learning model (called\nannotator) which are then used to train a new crowd counting model (target\nmodel). Our analysis on two crowd counting models and two benchmark datasets\nshows that the proposed scheme achieves accuracy closer to that of the model\ntrained with perfect labels showing the robustness of crowd models to\nannotation errors.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2212.01450v2.pdf","comment":"This paper has been accepted for presentation in 41st IEEE\n International Conference on Consumer Electronics (ICCE 2023), 6-8 January,\n 2023, Las Vegas, USA"},{"id":"http://arxiv.org/abs/2307.03903v2","updated":"2023-07-17T09:08:49Z","published":"2023-07-08T05:03:10Z","title":"Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for\n Visible-Infrared Video Person Re-Identification","summary":" In visible-infrared video person re-identification (re-ID), extracting\nfeatures not affected by complex scenes (such as modality, camera views,\npedestrian pose, background, etc.) changes, and mining and utilizing motion\ninformation are the keys to solving cross-modal pedestrian identity matching.\nTo this end, the paper proposes a new visible-infrared video person re-ID\nmethod from a novel perspective, i.e., adversarial self-attack defense and\nspatial-temporal relation mining. In this work, the changes of views, posture,\nbackground and modal discrepancy are considered as the main factors that cause\nthe perturbations of person identity features. Such interference information\ncontained in the training samples is used as an adversarial perturbation. It\nperforms adversarial attacks on the re-ID model during the training to make the\nmodel more robust to these unfavorable factors. The attack from the adversarial\nperturbation is introduced by activating the interference information contained\nin the input samples without generating adversarial samples, and it can be thus\ncalled adversarial self-attack. This design allows adversarial attack and\ndefense to be integrated into one framework. This paper further proposes a\nspatial-temporal information-guided feature representation network to use the\ninformation in video sequences. The network cannot only extract the information\ncontained in the video-frame sequences but also use the relation of the local\ninformation in space to guide the network to extract more robust features. The\nproposed method exhibits compelling performance on large-scale cross-modality\nvideo datasets. The source code of the proposed method will be released at\nhttps://github.com/lhf12278/xxx.\n","authors":["Huafeng Li","Le Xu","Yafei Zhang","Dapeng Tao","Zhengtao Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03903v2.pdf","comment":"11 pages,8 figures"},{"id":"http://arxiv.org/abs/2212.01452v2","updated":"2023-07-17T09:07:25Z","published":"2022-12-02T21:29:48Z","title":"CLIP: Train Faster with Less Data","summary":" Deep learning models require an enormous amount of data for training.\nHowever, recently there is a shift in machine learning from model-centric to\ndata-centric approaches. In data-centric approaches, the focus is to refine and\nimprove the quality of the data to improve the learning performance of the\nmodels rather than redesigning model architectures. In this paper, we propose\nCLIP i.e., Curriculum Learning with Iterative data Pruning. 
CLIP combines two\ndata-centric approaches i.e., curriculum learning and dataset pruning to\nimprove the model learning accuracy and convergence speed. The proposed scheme\napplies loss-aware dataset pruning to iteratively remove the least significant\nsamples and progressively reduces the size of the effective dataset in the\ncurriculum learning training. Extensive experiments performed on crowd density\nestimation models validate the notion behind combining the two approaches by\nreducing the convergence time and improving generalization. To our knowledge,\nthe idea of data pruning as an embedded process in curriculum learning is\nnovel.\n","authors":["Muhammad Asif Khan","Ridha Hamila","Hamid Menouar"],"pdf_url":"https://arxiv.org/pdf/2212.01452v2.pdf","comment":"This paper has been accepted for presentation in 2023 International\n Conference on Big Data and Smart Computing, February 13-16, 2023, Jeju, Korea"},{"id":"http://arxiv.org/abs/2301.01947v2","updated":"2023-07-17T09:06:53Z","published":"2023-01-05T08:02:30Z","title":"StitchNet: Composing Neural Networks from Pre-Trained Fragments","summary":" We propose StitchNet, a novel neural network creation paradigm that stitches\ntogether fragments (one or more consecutive network layers) from multiple\npre-trained neural networks. StitchNet allows the creation of high-performing\nneural networks without the large compute and data requirements needed under\ntraditional model creation processes via backpropagation training. We leverage\nCentered Kernel Alignment (CKA) as a compatibility measure to efficiently guide\nthe selection of these fragments in composing a network for a given task\ntailored to specific accuracy needs and computing resource constraints. We then\nshow that these fragments can be stitched together to create neural networks\nwith comparable accuracy to traditionally trained networks at a fraction of\ncomputing resource and data requirements. Finally, we explore a novel\non-the-fly personalized model creation and inference application enabled by\nthis new paradigm.\n","authors":["Surat Teerapittayanon","Marcus Comiter","Brad McDanel","H. T. Kung"],"pdf_url":"https://arxiv.org/pdf/2301.01947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07137v4","updated":"2023-07-17T09:05:16Z","published":"2022-11-14T06:32:18Z","title":"DroneNet: Crowd Density Estimation using Self-ONNs for Drones","summary":" Video surveillance using drones is both convenient and efficient due to the\nease of deployment and unobstructed movement of drones in many scenarios. An\ninteresting application of drone-based video surveillance is to estimate crowd\ndensities (both pedestrians and vehicles) in public places. Deep learning using\nconvolution neural networks (CNNs) is employed for automatic crowd counting and\ndensity estimation using images and videos. However, the performance and\naccuracy of such models typically depend upon the model architecture i.e.,\ndeeper CNN models improve accuracy at the cost of increased inference time. In\nthis paper, we propose a novel crowd density estimation model for drones\n(DroneNet) using Self-organized Operational Neural Networks (Self-ONN).\nSelf-ONN provides efficient learning capabilities with lower computational\ncomplexity as compared to CNN-based models. We tested our algorithm on two\ndrone-view public datasets. 
Our evaluation shows that the proposed DroneNet\nshows superior performance on an equivalent CNN-based model.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2211.07137v4.pdf","comment":"The paper has been accepted for presentation in 2023 IEEE Consumer\n Communications & Networking Conference (CCNC)"},{"id":"http://arxiv.org/abs/2307.05766v3","updated":"2023-07-17T08:48:22Z","published":"2023-07-11T19:47:05Z","title":"Rad-ReStruct: A Novel VQA Benchmark and Method for Structured Radiology\n Reporting","summary":" Radiology reporting is a crucial part of the communication between\nradiologists and other medical professionals, but it can be time-consuming and\nerror-prone. One approach to alleviate this is structured reporting, which\nsaves time and enables a more accurate evaluation than free-text reports.\nHowever, there is limited research on automating structured reporting, and no\npublic benchmark is available for evaluating and comparing different methods.\nTo close this gap, we introduce Rad-ReStruct, a new benchmark dataset that\nprovides fine-grained, hierarchically ordered annotations in the form of\nstructured reports for X-Ray images. We model the structured reporting task as\nhierarchical visual question answering (VQA) and propose hi-VQA, a novel method\nthat considers prior context in the form of previously asked questions and\nanswers for populating a structured radiology report. Our experiments show that\nhi-VQA achieves competitive performance to the state-of-the-art on the medical\nVQA benchmark VQARad while performing best among methods without\ndomain-specific vision-language pretraining and provides a strong baseline on\nRad-ReStruct. Our work represents a significant step towards the automated\npopulation of structured radiology reports and provides a valuable first\nbenchmark for future research in this area. We will make all annotations and\nour code for annotation generation, model evaluation, and training publicly\navailable upon acceptance. Our dataset and code is available at\nhttps://github.com/ChantalMP/Rad-ReStruct.\n","authors":["Chantal Pellegrini","Matthias Keicher","Ege Özsoy","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2307.05766v3.pdf","comment":"provisionally accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2305.04195v2","updated":"2023-07-17T08:38:53Z","published":"2023-05-07T05:40:48Z","title":"Cross-Modal Retrieval for Motion and Text via MildTriple Loss","summary":" Cross-modal retrieval has become a prominent research topic in computer\nvision and natural language processing with advances made in image-text and\nvideo-text retrieval technologies. However, cross-modal retrieval between human\nmotion sequences and text has not garnered sufficient attention despite the\nextensive application value it holds, such as aiding virtual reality\napplications in better understanding users' actions and language. This task\npresents several challenges, including joint modeling of the two modalities,\ndemanding the understanding of person-centered information from text, and\nlearning behavior features from 3D human motion sequences. Previous work on\nmotion data modeling mainly relied on autoregressive feature extractors that\nmay forget previous information, while we propose an innovative model that\nincludes simple yet powerful transformer-based motion and text encoders, which\ncan learn representations from the two different modalities and capture\nlong-term dependencies. 
Furthermore, the overlap of the same atomic actions of\ndifferent human motions can cause semantic conflicts, leading us to explore a\nnew triplet loss function, MildTriple Loss. It leverages the similarity between\nsamples in intra-modal space to guide soft-hard negative sample mining in the\njoint embedding space to train the triplet loss and reduce the violation caused\nby false negative samples. We evaluated our model and method on the latest\nHumanML3D and KIT Motion-Language datasets, achieving a 62.9\\% recall for\nmotion retrieval and a 71.5\\% recall for text retrieval (based on R@10) on the\nHumanML3D dataset. Our code is available at\nhttps://github.com/eanson023/rehamot.\n","authors":["Sheng Yan","Haoqiang Wang","Xin Du","Mengyuan Liu","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2305.04195v2.pdf","comment":"This research was rejected by the submitted journal and needs to be\n revised before submitting"},{"id":"http://arxiv.org/abs/2307.08319v1","updated":"2023-07-17T08:31:59Z","published":"2023-07-17T08:31:59Z","title":"Soft Curriculum for Learning Conditional GANs with Noisy-Labeled and\n Uncurated Unlabeled Data","summary":" Label-noise or curated unlabeled data is used to compensate for the\nassumption of clean labeled data in training the conditional generative\nadversarial network; however, satisfying such an extended assumption is\noccasionally laborious or impractical. As a step towards generative modeling\naccessible to everyone, we introduce a novel conditional image generation\nframework that accepts noisy-labeled and uncurated unlabeled data during\ntraining: (i) closed-set and open-set label noise in labeled data and (ii)\nclosed-set and open-set unlabeled data. To combat it, we propose soft\ncurriculum learning, which assigns instance-wise weights for adversarial\ntraining while assigning new labels for unlabeled data and correcting wrong\nlabels for labeled data. Unlike popular curriculum learning, which uses a\nthreshold to pick the training samples, our soft curriculum controls the effect\nof each training instance by using the weights predicted by the auxiliary\nclassifier, resulting in the preservation of useful samples while ignoring\nharmful ones. Our experiments show that our approach outperforms existing\nsemi-supervised and label-noise robust methods in terms of both quantitative\nand qualitative performance. In particular, the proposed approach is able to\nmatch the performance of (semi-) supervised GANs even with less than half the\nlabeled data.\n","authors":["Kai Katsumata","Duc Minh Vo","Tatsuya Harada","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2307.08319v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2307.08318v1","updated":"2023-07-17T08:26:36Z","published":"2023-07-17T08:26:36Z","title":"Airway Label Prediction in Video Bronchoscopy: Capturing Temporal\n Dependencies Utilizing Anatomical Knowledge","summary":" Purpose: Navigation guidance is a key requirement for a multitude of lung\ninterventions using video bronchoscopy. State-of-the-art solutions focus on\nlung biopsies using electromagnetic tracking and intraoperative image\nregistration w.r.t. preoperative CT scans for guidance. The requirement of\npatient-specific CT scans hampers the utilisation of navigation guidance for\nother applications such as intensive care units.\n Methods: This paper addresses navigation guidance solely incorporating\nbronchoscopy video data. 
In contrast to state-of-the-art approaches, we entirely\nomit the use of electromagnetic tracking and patient-specific CT scans.\nGuidance is enabled by means of topological bronchoscope localization w.r.t. an\ninterpatient airway model. Particularly, we take maximal advantage of\nanatomical constraints of airway trees being sequentially traversed. This is\nrealized by incorporating sequences of CNN-based airway likelihoods into a\nHidden Markov Model.\n Results: Our approach is evaluated based on multiple experiments inside a\nlung phantom model. With the consideration of temporal context and use of\nanatomical knowledge for regularization, we are able to improve the accuracy up\nto 0.98 compared to 0.81 (weighted F1: 0.98 compared to 0.81) for a\nclassification based on individual frames.\n Conclusion: We combine CNN-based single image classification of airway\nsegments with anatomical constraints and temporal HMM-based inference for the\nfirst time. Our approach renders vision-only guidance for bronchoscopy\ninterventions in the absence of electromagnetic tracking and patient-specific\nCT scans possible.\n","authors":["Ron Keuth","Mattias Heinrich","Martin Eichenlaub","Marian Himstedt"],"pdf_url":"https://arxiv.org/pdf/2307.08318v1.pdf","comment":"Submitted to International Journal of Computer Assisted Radiology and\n Surgery"},{"id":"http://arxiv.org/abs/2307.08317v1","updated":"2023-07-17T08:24:58Z","published":"2023-07-17T08:24:58Z","title":"AltFreezing for More General Video Face Forgery Detection","summary":" Existing face forgery detection models try to discriminate fake images by\ndetecting only spatial artifacts (e.g., generative artifacts, blending) or\nmainly temporal artifacts (e.g., flickering, discontinuity). They may\nexperience significant performance degradation when facing out-domain\nartifacts. In this paper, we propose to capture both spatial and temporal\nartifacts in one model for face forgery detection. A simple idea is to leverage\na spatiotemporal model (3D ConvNet). However, we find that it may easily rely\non one type of artifact and ignore the other. To address this issue, we present\na novel training strategy called AltFreezing for more general face forgery\ndetection. The AltFreezing aims to encourage the model to detect both spatial\nand temporal artifacts. It divides the weights of a spatiotemporal network into\ntwo groups: spatial-related and temporal-related. Then the two groups of\nweights are alternately frozen during the training process so that the model\ncan learn spatial and temporal features to distinguish real or fake videos.\nFurthermore, we introduce various video-level data augmentation methods to\nimprove the generalization capability of the forgery detection model. Extensive\nexperiments show that our framework outperforms existing methods in terms of\ngeneralization to unseen manipulations and datasets. 
Code is available at\nhttps: //github.com/ZhendongWang6/AltFreezing.\n","authors":["Zhendong Wang","Jianmin Bao","Wengang Zhou","Weilun Wang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2307.08317v1.pdf","comment":"Accepted by CVPR 2023 Highlight, code and models are available at\n https: //github.com/ZhendongWang6/AltFreezing"},{"id":"http://arxiv.org/abs/2307.08316v1","updated":"2023-07-17T08:24:05Z","published":"2023-07-17T08:24:05Z","title":"Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for\n Visible-Infrared Person Re-Identification","summary":" Visible-Infrared person Re-IDentification (VI-ReID) is a challenging\ncross-modality image retrieval task that aims to match pedestrians' images\nacross visible and infrared cameras. To solve the modality gap, existing\nmainstream methods adopt a learning paradigm converting the image retrieval\ntask into an image classification task with cross-entropy loss and auxiliary\nmetric learning losses. These losses follow the strategy of adjusting the\ndistribution of extracted embeddings to reduce the intra-class distance and\nincrease the inter-class distance. However, such objectives do not precisely\ncorrespond to the final test setting of the retrieval task, resulting in a new\ngap at the optimization level. By rethinking these keys of VI-ReID, we propose\na simple and effective method, the Multi-level Cross-modality Joint Alignment\n(MCJA), bridging both modality and objective-level gap. For the former, we\ndesign the Modality Alignment Augmentation, which consists of three novel\nstrategies, the weighted grayscale, cross-channel cutmix, and spectrum jitter\naugmentation, effectively reducing modality discrepancy in the image space. For\nthe latter, we introduce a new Cross-Modality Retrieval loss. It is the first\nwork to constrain from the perspective of the ranking list, aligning with the\ngoal of the testing stage. Moreover, based on the global feature only, our\nmethod exhibits good performance and can serve as a strong baseline method for\nthe VI-ReID community.\n","authors":["Tengfei Liang","Yi Jin","Wu Liu","Tao Wang","Songhe Feng","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08316v1.pdf","comment":"10 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.08308v1","updated":"2023-07-17T08:05:30Z","published":"2023-07-17T08:05:30Z","title":"A Novel Multi-Task Model Imitating Dermatologists for Accurate\n Differential Diagnosis of Skin Diseases in Clinical Images","summary":" Skin diseases are among the most prevalent health issues, and accurate\ncomputer-aided diagnosis methods are of importance for both dermatologists and\npatients. However, most of the existing methods overlook the essential domain\nknowledge required for skin disease diagnosis. A novel multi-task model, namely\nDermImitFormer, is proposed to fill this gap by imitating dermatologists'\ndiagnostic procedures and strategies. Through multi-task learning, the model\nsimultaneously predicts body parts and lesion attributes in addition to the\ndisease itself, enhancing diagnosis accuracy and improving diagnosis\ninterpretability. The designed lesion selection module mimics dermatologists'\nzoom-in action, effectively highlighting the local lesion features from noisy\nbackgrounds. Additionally, the presented cross-interaction module explicitly\nmodels the complicated diagnostic reasoning between body parts, lesion\nattributes, and diseases. 
To provide a more robust evaluation of the proposed\nmethod, a large-scale clinical image dataset of skin diseases with\nsignificantly more cases than existing datasets has been established. Extensive\nexperiments on three different datasets consistently demonstrate the\nstate-of-the-art recognition performance of the proposed approach.\n","authors":["Yan-Jie Zhou","Wei Liu","Yuan Gao","Jing Xu","Le Lu","Yuping Duan","Hao Cheng","Na Jin","Xiaoyong Man","Shuang Zhao","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08308v1.pdf","comment":"MICCAI 2023 early accept"},{"id":"http://arxiv.org/abs/2307.08300v1","updated":"2023-07-17T07:53:23Z","published":"2023-07-17T07:53:23Z","title":"ShiftNAS: Improving One-shot NAS via Probability Shift","summary":" One-shot Neural architecture search (One-shot NAS) has been proposed as a\ntime-efficient approach to obtain optimal subnet architectures and weights\nunder different complexity cases by training only once. However, the subnet\nperformance obtained by weight sharing is often inferior to the performance\nachieved by retraining. In this paper, we investigate the performance gap and\nattribute it to the use of uniform sampling, which is a common approach in\nsupernet training. Uniform sampling concentrates training resources on subnets\nwith intermediate computational resources, which are sampled with high\nprobability. However, subnets with different complexity regions require\ndifferent optimal training strategies for optimal performance. To address the\nproblem of uniform sampling, we propose ShiftNAS, a method that can adjust the\nsampling probability based on the complexity of subnets. We achieve this by\nevaluating the performance variation of subnets with different complexity and\ndesigning an architecture generator that can accurately and efficiently provide\nsubnets with the desired complexity. Both the sampling probability and the\narchitecture generator can be trained end-to-end in a gradient-based manner.\nWith ShiftNAS, we can directly obtain the optimal model architecture and\nparameters for a given computational complexity. We evaluate our approach on\nmultiple visual network models, including convolutional neural networks (CNNs)\nand vision transformers (ViTs), and demonstrate that ShiftNAS is\nmodel-agnostic. Experimental results on ImageNet show that ShiftNAS can improve\nthe performance of one-shot NAS without additional consumption. Source codes\nare available at https://github.com/bestfleer/ShiftNAS.\n","authors":["Mingyang Zhang","Xinyi Yu","Haodong Zhao","Linlin Ou"],"pdf_url":"https://arxiv.org/pdf/2307.08300v1.pdf","comment":"accepted by iccv 2023"},{"id":"http://arxiv.org/abs/2307.05508v2","updated":"2023-07-17T07:52:37Z","published":"2023-07-03T17:23:23Z","title":"Human in the AI loop via xAI and Active Learning for Visual Inspection","summary":" Industrial revolutions have historically disrupted manufacturing by\nintroducing automation into production. Increasing automation reshapes the role\nof the human worker. Advances in robotics and artificial intelligence open new\nfrontiers of human-machine collaboration. Such collaboration can be realized\nconsidering two sub-fields of artificial intelligence: active learning and\nexplainable artificial intelligence. Active learning aims to devise strategies\nthat help obtain data that allows machine learning algorithms to learn better.\nOn the other hand, explainable artificial intelligence aims to make the machine\nlearning models intelligible to the human person. 
The present work first\ndescribes Industry 5.0, human-machine collaboration, and state-of-the-art\nregarding quality inspection, emphasizing visual inspection. Then it outlines\nhow human-machine collaboration could be realized and enhanced in visual\ninspection. Finally, some of the results obtained in the EU H2020 STAR project\nregarding visual inspection are shared, considering artificial intelligence,\nhuman digital twins, and cybersecurity.\n","authors":["Jože M. Rožanec","Elias Montini","Vincenzo Cutrona","Dimitrios Papamartzivanos","Timotej Klemenčič","Blaž Fortuna","Dunja Mladenić","Entso Veliou","Thanassis Giannetsos","Christos Emmanouilidis"],"pdf_url":"https://arxiv.org/pdf/2307.05508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.12655v3","updated":"2023-07-17T07:28:26Z","published":"2021-07-27T08:08:02Z","title":"MKConv: Multidimensional Feature Representation for Point Cloud Analysis","summary":" Despite the remarkable success of deep learning, an optimal convolution\noperation on point clouds remains elusive owing to their irregular data\nstructure. Existing methods mainly focus on designing an effective continuous\nkernel function that can handle an arbitrary point in continuous space. Various\napproaches exhibiting high performance have been proposed, but we observe that\nthe standard pointwise feature is represented by 1D channels and can become\nmore informative when its representation involves additional spatial feature\ndimensions. In this paper, we present Multidimensional Kernel Convolution\n(MKConv), a novel convolution operator that learns to transform the point\nfeature representation from a vector to a multidimensional matrix. Unlike\nstandard point convolution, MKConv proceeds via two steps. (i) It first\nactivates the spatial dimensions of local feature representation by exploiting\nmultidimensional kernel weights. These spatially expanded features can\nrepresent their embedded information through spatial correlation as well as\nchannel correlation in feature space, carrying more detailed local structure\ninformation. (ii) Then, discrete convolutions are applied to the\nmultidimensional features which can be regarded as a grid-structured matrix. In\nthis way, we can utilize the discrete convolutions for point cloud data without\nvoxelization that suffers from information loss. Furthermore, we propose a\nspatial attention module, Multidimensional Local Attention (MLA), to provide\ncomprehensive structure awareness within the local point set by reweighting the\nspatial feature dimensions. We demonstrate that MKConv has excellent\napplicability to point cloud processing tasks including object classification,\nobject part segmentation, and scene semantic segmentation with superior\nresults.\n","authors":["Sungmin Woo","Dogyoon Lee","Sangwon Hwang","Woojin Kim","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2107.12655v3.pdf","comment":"Accepted by Pattern Recognition 2023"},{"id":"http://arxiv.org/abs/2303.10840v2","updated":"2023-07-17T07:24:30Z","published":"2023-03-20T03:08:22Z","title":"Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for\n Multi-View Reconstruction with Reflection","summary":" Neural implicit surface learning has shown significant progress in multi-view\n3D reconstruction, where an object is represented by multilayer perceptrons\nthat provide continuous implicit surface representation and view-dependent\nradiance. 
However, current methods often fail to accurately reconstruct\nreflective surfaces, leading to severe ambiguity. To overcome this issue, we\npropose Ref-NeuS, which aims to reduce ambiguity by attenuating the effect of\nreflective surfaces. Specifically, we utilize an anomaly detector to estimate\nan explicit reflection score with the guidance of multi-view context to\nlocalize reflective surfaces. Afterward, we design a reflection-aware\nphotometric loss that adaptively reduces ambiguity by modeling rendered color\nas a Gaussian distribution, with the reflection score representing the\nvariance. We show that together with a reflection direction-dependent radiance,\nour model achieves high-quality surface reconstruction on reflective surfaces\nand outperforms the state-of-the-arts by a large margin. Besides, our model is\nalso comparable on general surfaces.\n","authors":["Wenhang Ge","Tao Hu","Haoyu Zhao","Shu Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2303.10840v2.pdf","comment":"ICCV 2023, Project webpage: https://g3956.github.io/"},{"id":"http://arxiv.org/abs/2307.08286v1","updated":"2023-07-17T07:16:28Z","published":"2023-07-17T07:16:28Z","title":"Going Beyond Linear Mode Connectivity: The Layerwise Linear Feature\n Connectivity","summary":" Recent work has revealed many intriguing empirical phenomena in neural\nnetwork training, despite the poorly understood and highly complex loss\nlandscapes and training dynamics. One of these phenomena, Linear Mode\nConnectivity (LMC), has gained considerable attention due to the intriguing\nobservation that different solutions can be connected by a linear path in the\nparameter space while maintaining near-constant training and test losses. In\nthis work, we introduce a stronger notion of linear connectivity, Layerwise\nLinear Feature Connectivity (LLFC), which says that the feature maps of every\nlayer in different trained networks are also linearly connected. We provide\ncomprehensive empirical evidence for LLFC across a wide range of settings,\ndemonstrating that whenever two trained networks satisfy LMC (via either\nspawning or permutation methods), they also satisfy LLFC in nearly all the\nlayers. Furthermore, we delve deeper into the underlying factors contributing\nto LLFC, which reveal new insights into the spawning and permutation\napproaches. The study of LLFC transcends and advances our understanding of LMC\nby adopting a feature-learning perspective.\n","authors":["Zhanpeng Zhou","Yongyi Yang","Xiaojiang Yang","Junchi Yan","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2307.08286v1.pdf","comment":"25 pages, 23 figures"},{"id":"http://arxiv.org/abs/2307.08279v1","updated":"2023-07-17T07:01:58Z","published":"2023-07-17T07:01:58Z","title":"Combiner and HyperCombiner Networks: Rules to Combine Multimodality MR\n Images for Prostate Cancer Localisation","summary":" One of the distinct characteristics in radiologists' reading of\nmultiparametric prostate MR scans, using reporting systems such as PI-RADS\nv2.1, is to score individual types of MR modalities, T2-weighted,\ndiffusion-weighted, and dynamic contrast-enhanced, and then combine these\nimage-modality-specific scores using standardised decision rules to predict the\nlikelihood of clinically significant cancer. 
This work aims to demonstrate that\nit is feasible for low-dimensional parametric models to model such decision\nrules in the proposed Combiner networks, without compromising the accuracy of\npredicting radiologic labels: First, it is shown that either a linear mixture\nmodel or a nonlinear stacking model is sufficient to model PI-RADS decision\nrules for localising prostate cancer. Second, parameters of these (generalised)\nlinear models are proposed as hyperparameters, to weigh multiple networks that\nindependently represent individual image modalities in the Combiner network\ntraining, as opposed to end-to-end modality ensemble. A HyperCombiner network\nis developed to train a single image segmentation network that can be\nconditioned on these hyperparameters during inference, for much improved\nefficiency. Experimental results based on data from 850 patients, for the\napplication of automating radiologist labelling multi-parametric MR, compare\nthe proposed combiner networks with other commonly-adopted end-to-end networks.\nUsing the added advantages of obtaining and interpreting the modality combining\nrules, in terms of the linear weights or odds-ratios on individual image\nmodalities, three clinical applications are presented for prostate cancer\nsegmentation, including modality availability assessment, importance\nquantification and rule discovery.\n","authors":["Wen Yan","Bernard Chiu","Ziyi Shen","Qianye Yang","Tom Syer","Zhe Min","Shonit Punwani","Mark Emberton","David Atkinson","Dean C. Barratt","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2307.08279v1.pdf","comment":"48 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.08278v1","updated":"2023-07-17T06:58:22Z","published":"2023-07-17T06:58:22Z","title":"Adversarial Attacks on Traffic Sign Recognition: A Survey","summary":" Traffic sign recognition is an essential component of perception in\nautonomous vehicles, which is currently performed almost exclusively with deep\nneural networks (DNNs). However, DNNs are known to be vulnerable to adversarial\nattacks. Several previous works have demonstrated the feasibility of\nadversarial attacks on traffic sign recognition models. Traffic signs are\nparticularly promising for adversarial attack research due to the ease of\nperforming real-world attacks using printed signs or stickers. In this work, we\nsurvey existing works performing either digital or real-world attacks on\ntraffic sign detection and classification models. We provide an overview of the\nlatest advancements and highlight the existing research areas that require\nfurther investigation.\n","authors":["Svetlana Pavlitska","Nico Lambing","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2307.08278v1.pdf","comment":"Accepted for publication at ICECCME2023"},{"id":"http://arxiv.org/abs/2307.08268v1","updated":"2023-07-17T06:21:45Z","published":"2023-07-17T06:21:45Z","title":"Liver Tumor Screening and Diagnosis in CT with Pixel-Lesion-Patient\n Network","summary":" Liver tumor segmentation and classification are important tasks in computer\naided diagnosis. We aim to address three problems: liver tumor screening and\npreliminary diagnosis in non-contrast computed tomography (CT), and\ndifferential diagnosis in dynamic contrast-enhanced CT. A novel framework named\nPixel-Lesion-pAtient Network (PLAN) is proposed. It uses a mask transformer to\njointly segment and classify each lesion with improved anchor queries and a\nforeground-enhanced sampling loss. 
It also has an image-wise classifier to\neffectively aggregate global information and predict patient-level diagnosis. A\nlarge-scale multi-phase dataset is collected containing 939 tumor patients and\n810 normal subjects. 4010 tumor instances of eight types are extensively\nannotated. On the non-contrast tumor screening task, PLAN achieves 95% and 96%\nin patient-level sensitivity and specificity. On contrast-enhanced CT, our\nlesion-level detection precision, recall, and classification accuracy are 92%,\n89%, and 86%, outperforming widely used CNN and transformers for lesion\nsegmentation. We also conduct a reader study on a holdout set of 250 cases.\nPLAN is on par with a senior human radiologist, showing the clinical\nsignificance of our results.\n","authors":["Ke Yan","Xiaoli Yin","Yingda Xia","Fakai Wang","Shu Wang","Yuan Gao","Jiawen Yao","Chunli Li","Xiaoyu Bai","Jingren Zhou","Ling Zhang","Le Lu","Yu Shi"],"pdf_url":"https://arxiv.org/pdf/2307.08268v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08265v1","updated":"2023-07-17T06:14:19Z","published":"2023-07-17T06:14:19Z","title":"Extreme Image Compression using Fine-tuned VQGAN Models","summary":" Recent advances in generative compression methods have demonstrated\nremarkable progress in enhancing the perceptual quality of compressed data,\nespecially in scenarios with low bitrates. Nevertheless, their efficacy and\napplicability in achieving extreme compression ratios ($<0.1$ bpp) still remain\nconstrained. In this work, we propose a simple yet effective coding framework\nby introducing vector quantization (VQ)-based generative models into the image\ncompression domain. The main insight is that the codebook learned by the VQGAN\nmodel yields strong expressive capacity, facilitating efficient compression of\ncontinuous information in the latent space while maintaining reconstruction\nquality. Specifically, an image can be represented as VQ-indices by finding the\nnearest codeword, which can be encoded using lossless compression methods into\nbitstreams. We then propose clustering a pre-trained large-scale codebook into\nsmaller codebooks using the K-means algorithm. This enables images to be\nrepresented as diverse ranges of VQ-indices maps, resulting in variable\nbitrates and different levels of reconstruction quality. Extensive qualitative\nand quantitative experiments on various datasets demonstrate that the proposed\nframework outperforms the state-of-the-art codecs in terms of perceptual\nquality-oriented metrics and human perception under extremely low bitrates.\n","authors":["Qi Mao","Tinghan Yang","Yinuo Zhang","Shuyin Pan","Meng Wang","Shiqi Wang","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2307.08265v1.pdf","comment":"Generative Compression, Extreme Compression, VQGANs, Low Bitrate"},{"id":"http://arxiv.org/abs/2307.08263v1","updated":"2023-07-17T06:12:26Z","published":"2023-07-17T06:12:26Z","title":"Hierarchical Spatiotemporal Transformers for Video Object Segmentation","summary":" This paper presents a novel framework called HST for semi-supervised video\nobject segmentation (VOS). HST extracts image and video features using the\nlatest Swin Transformer and Video Swin Transformer to inherit their inductive\nbias for the spatiotemporal locality, which is essential for temporally\ncoherent VOS. To take full advantage of the image and video features, HST casts\nimage and video features as a query and memory, respectively. 
By applying\nefficient memory read operations at multiple scales, HST produces hierarchical\nfeatures for the precise reconstruction of object masks. HST shows\neffectiveness and robustness in handling challenging scenarios with occluded\nand fast-moving objects under cluttered backgrounds. In particular, HST-B\noutperforms the state-of-the-art competitors on multiple popular benchmarks,\ni.e., YouTube-VOS (85.0%), DAVIS 2017 (85.9%), and DAVIS 2016 (94.0%).\n","authors":["Jun-Sang Yoo","Hongjae Lee","Seung-Won Jung"],"pdf_url":"https://arxiv.org/pdf/2307.08263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03323v3","updated":"2023-07-17T06:03:16Z","published":"2023-03-06T17:48:32Z","title":"CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive\n Learning","summary":" Multimodal contrastive pretraining has been used to train multimodal\nrepresentation models, such as CLIP, on large amounts of paired image-text\ndata. However, previous studies have revealed that such models are vulnerable\nto backdoor attacks. Specifically, when trained on backdoored examples, CLIP\nlearns spurious correlations between the embedded backdoor trigger and the\ntarget label, aligning their representations in the joint embedding space.\nInjecting even a small number of poisoned examples, such as 75 examples in 3\nmillion pretraining data, can significantly manipulate the model's behavior,\nmaking it difficult to detect or unlearn such correlations. To address this\nissue, we propose CleanCLIP, a finetuning framework that weakens the learned\nspurious associations introduced by backdoor attacks by independently\nre-aligning the representations for individual modalities. We demonstrate that\nunsupervised finetuning using a combination of multimodal contrastive and\nunimodal self-supervised objectives for individual modalities can significantly\nreduce the impact of the backdoor attack. Additionally, we show that supervised\nfinetuning on task-specific labeled image data removes the backdoor trigger\nfrom the CLIP vision encoder. We show empirically that CleanCLIP maintains\nmodel performance on benign examples while erasing a range of backdoor attacks\non multimodal contrastive learning. The code and checkpoints are available at\nhttps://github.com/nishadsinghi/CleanCLIP.\n","authors":["Hritik Bansal","Nishad Singhi","Yu Yang","Fan Yin","Aditya Grover","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2303.03323v3.pdf","comment":"22 pages. Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2212.08071v2","updated":"2023-07-17T05:44:35Z","published":"2022-12-15T18:59:59Z","title":"MAViL: Masked Audio-Video Learners","summary":" We present Masked Audio-Video Learners (MAViL) to train audio-visual\nrepresentations. Our approach learns with three complementary forms of\nself-supervision: (1) reconstruction of masked audio and video input data, (2)\nintra- and inter-modal contrastive learning with masking, and (3) self-training\nby reconstructing joint audio-video contextualized features learned from the\nfirst two objectives. Pre-training with MAViL not only enables the model to\nperform well in audio-visual classification and retrieval tasks but also\nimproves representations of each modality in isolation, without using\ninformation from the other modality for fine-tuning or inference. Empirically,\nMAViL sets a new state-of-the-art on AudioSet (53.1 mAP) and VGGSound (67.1%\naccuracy). 
For the first time, a self-supervised audio-visual model outperforms\nones that use external supervision on these benchmarks.\n","authors":["Po-Yao Huang","Vasu Sharma","Hu Xu","Chaitanya Ryali","Haoqi Fan","Yanghao Li","Shang-Wen Li","Gargi Ghosh","Jitendra Malik","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2212.08071v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2307.08252v1","updated":"2023-07-17T05:36:01Z","published":"2023-07-17T05:36:01Z","title":"Large-Scale Person Detection and Localization using Overhead Fisheye\n Cameras","summary":" Location determination finds wide applications in daily life. Instead of\nexisting efforts devoted to localizing tourist photos captured by perspective\ncameras, in this article, we focus on devising person positioning solutions\nusing overhead fisheye cameras. Such solutions are advantageous in large field\nof view (FOV), low cost, anti-occlusion, and unaggressive work mode (without\nthe necessity of cameras carried by persons). However, related studies are\nquite scarce, due to the paucity of data. To stimulate research in this\nexciting area, we present LOAF, the first large-scale overhead fisheye dataset\nfor person detection and localization. LOAF is built with many essential\nfeatures, e.g., i) the data cover abundant diversities in scenes, human pose,\ndensity, and location; ii) it contains currently the largest number of\nannotated pedestrian, i.e., 457K bounding boxes with groundtruth location\ninformation; iii) the body-boxes are labeled as radius-aligned so as to fully\naddress the positioning challenge. To approach localization, we build a fisheye\nperson detection network, which exploits the fisheye distortions by a\nrotation-equivariant training strategy and predict radius-aligned human boxes\nend-to-end. Then, the actual locations of the detected persons are calculated\nby a numerical solution on the fisheye model and camera altitude data.\nExtensive experiments on LOAF validate the superiority of our fisheye detector\nw.r.t. previous methods, and show that our whole fisheye positioning solution\nis able to locate all persons in FOV with an accuracy of 0.5 m, within 0.1 s.\n","authors":["Lu Yang","Liulei Li","Xueshi Xin","Yifan Sun","Qing Song","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08252v1.pdf","comment":"ICCV 2023. Project page: https://LOAFisheye.github.io"},{"id":"http://arxiv.org/abs/2307.08249v1","updated":"2023-07-17T05:08:32Z","published":"2023-07-17T05:08:32Z","title":"Random Boxes Are Open-world Object Detectors","summary":" We show that classifiers trained with random region proposals achieve\nstate-of-the-art Open-world Object Detection (OWOD): they can not only maintain\nthe accuracy of the known objects (w/ training labels), but also considerably\nimprove the recall of unknown ones (w/o training labels). Specifically, we\npropose RandBox, a Fast R-CNN based architecture trained on random proposals at\neach training iteration, surpassing existing Faster R-CNN and Transformer based\nOWOD. Its effectiveness stems from the following two benefits introduced by\nrandomness. 
First, as the randomization is independent of the distribution of\nthe limited known objects, the random proposals become the instrumental\nvariable that prevents the training from being confounded by the known objects.\nSecond, the unbiased training encourages more proposal explorations by using\nour proposed matching score that does not penalize the random proposals whose\nprediction scores do not match the known objects. On two benchmarks:\nPascal-VOC/MS-COCO and LVIS, RandBox significantly outperforms the previous\nstate-of-the-art in all metrics. We also detail the ablations on randomization\nand loss designs. Codes are available at https://github.com/scuwyh2000/RandBox.\n","authors":["Yanghao Wang","Zhongqi Yue","Xian-Sheng Hua","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08249v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08243v1","updated":"2023-07-17T04:55:02Z","published":"2023-07-17T04:55:02Z","title":"Uncertainty-aware State Space Transformer for Egocentric 3D Hand\n Trajectory Forecasting","summary":" Hand trajectory forecasting from egocentric views is crucial for enabling a\nprompt understanding of human intentions when interacting with AR/VR systems.\nHowever, existing methods handle this problem in a 2D image space which is\ninadequate for 3D real-world applications. In this paper, we set up an\negocentric 3D hand trajectory forecasting task that aims to predict hand\ntrajectories in a 3D space from early observed RGB videos in a first-person\nview. To fulfill this goal, we propose an uncertainty-aware state space\nTransformer (USST) that takes the merits of the attention mechanism and\naleatoric uncertainty within the framework of the classical state-space model.\nThe model can be further enhanced by the velocity constraint and visual prompt\ntuning (VPT) on large vision transformers. Moreover, we develop an annotation\nworkflow to collect 3D hand trajectories with high quality. Experimental\nresults on H2O and EgoPAT3D datasets demonstrate the superiority of USST for\nboth 2D and 3D trajectory forecasting. The code and datasets are publicly\nreleased: https://github.com/Cogito2012/USST.\n","authors":["Wentao Bao","Lele Chen","Libing Zeng","Zhong Li","Yi Xu","Junsong Yuan","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2307.08243v1.pdf","comment":"ICCV 2023 Accepted"},{"id":"http://arxiv.org/abs/2307.08238v1","updated":"2023-07-17T04:39:18Z","published":"2023-07-17T04:39:18Z","title":"Unified Open-Vocabulary Dense Visual Prediction","summary":" In recent years, open-vocabulary (OV) dense visual prediction (such as OV\nobject detection, semantic, instance and panoptic segmentations) has attracted\nincreasing research attention. However, most of existing approaches are\ntask-specific and individually tackle each task. In this paper, we propose a\nUnified Open-Vocabulary Network (UOVN) to jointly address four common dense\nprediction tasks. Compared with separate models, a unified network is more\ndesirable for diverse industrial applications. Moreover, OV dense prediction\ntraining data is relatively less. Separate networks can only leverage\ntask-relevant training data, while a unified approach can integrate diverse\ntraining data to boost individual tasks. We address two major challenges in\nunified OV prediction. Firstly, unlike unified methods for fixed-set\npredictions, OV networks are usually trained with multi-modal data. 
Therefore,\nwe propose a multi-modal, multi-scale and multi-task (MMM) decoding mechanism\nto better leverage multi-modal data. Secondly, because UOVN uses data from\ndifferent tasks for training, there are significant domain and task gaps. We\npresent a UOVN training mechanism to reduce such gaps. Experiments on four\ndatasets demonstrate the effectiveness of our UOVN.\n","authors":["Hengcan Shi","Munawar Hayat","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2307.08238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08233v1","updated":"2023-07-17T04:25:46Z","published":"2023-07-17T04:25:46Z","title":"ROFusion: Efficient Object Detection using Hybrid Point-wise\n Radar-Optical Fusion","summary":" Radars, due to their robustness to adverse weather conditions and ability to\nmeasure object motions, have served in autonomous driving and intelligent\nagents for years. However, Radar-based perception suffers from its unintuitive\nsensing data, which lack of semantic and structural information of scenes. To\ntackle this problem, camera and Radar sensor fusion has been investigated as a\ntrending strategy with low cost, high reliability and strong maintenance. While\nmost recent works explore how to explore Radar point clouds and images, rich\ncontextual information within Radar observation are discarded. In this paper,\nwe propose a hybrid point-wise Radar-Optical fusion approach for object\ndetection in autonomous driving scenarios. The framework benefits from dense\ncontextual information from both the range-doppler spectrum and images which\nare integrated to learn a multi-modal feature representation. Furthermore, we\npropose a novel local coordinate formulation, tackling the object detection\ntask in an object-centric coordinate. Extensive results show that with the\ninformation gained from optical images, we could achieve leading performance in\nobject detection (97.69\\% recall) compared to recent state-of-the-art methods\nFFT-RadNet (82.86\\% recall). Ablation studies verify the key design choices and\npracticability of our approach given machine generated imperfect detections.\nThe code will be available at https://github.com/LiuLiu-55/ROFusion.\n","authors":["Liu Liu","Shuaifeng Zhi","Zhenhua Du","Li Liu","Xinyu Zhang","Kai Huo","Weidong Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.08233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08228v1","updated":"2023-07-17T04:02:00Z","published":"2023-07-17T04:02:00Z","title":"Video Frame Interpolation with Stereo Event and Intensity Camera","summary":" The stereo event-intensity camera setup is widely applied to leverage the\nadvantages of both event cameras with low latency and intensity cameras that\ncapture accurate brightness and texture information. However, such a setup\ncommonly encounters cross-modality parallax that is difficult to be eliminated\nsolely with stereo rectification especially for real-world scenes with complex\nmotions and varying depths, posing artifacts and distortion for existing\nEvent-based Video Frame Interpolation (E-VFI) approaches. To tackle this\nproblem, we propose a novel Stereo Event-based VFI (SE-VFI) network (SEVFI-Net)\nto generate high-quality intermediate frames and corresponding disparities from\nmisaligned inputs consisting of two consecutive keyframes and event streams\nemitted between them. Specifically, we propose a Feature Aggregation Module\n(FAM) to alleviate the parallax and achieve spatial alignment in the feature\ndomain. 
We then exploit the fused features accomplishing accurate optical flow\nand disparity estimation, and achieving better interpolated results through\nflow-based and synthesis-based ways. We also build a stereo visual acquisition\nsystem composed of an event camera and an RGB-D camera to collect a new Stereo\nEvent-Intensity Dataset (SEID) containing diverse scenes with complex motions\nand varying depths. Experiments on public real-world stereo datasets, i.e.,\nDSEC and MVSEC, and our SEID dataset demonstrate that our proposed SEVFI-Net\noutperforms state-of-the-art methods by a large margin.\n","authors":["Chao Ding","Mingyuan Lin","Haijian Zhang","Jianzhuang Liu","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2307.08228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07679v2","updated":"2023-07-17T03:41:26Z","published":"2023-03-14T07:42:02Z","title":"Feature representations useful for predicting image memorability","summary":" Prediction of image memorability has attracted interest in various fields.\nConsequently, the prediction accuracy of convolutional neural network (CNN)\nmodels has been approaching the empirical upper bound estimated based on human\nconsistency. However, identifying which feature representations embedded in CNN\nmodels are responsible for the high memorability prediction accuracy remains an\nopen question. To tackle this problem, we sought to identify\nmemorability-related feature representations in CNN models using brain\nsimilarity. Specifically, memorability prediction accuracy and brain similarity\nwere examined across 16,860 layers in 64 CNN models pretrained for object\nrecognition. A clear tendency was observed in this comprehensive analysis that\nlayers with high memorability prediction accuracy had higher brain similarity\nwith the inferior temporal (IT) cortex, which is the highest stage in the\nventral visual pathway. Furthermore, fine-tuning of the 64 CNN models for\nmemorability prediction revealed that brain similarity with the IT cortex at\nthe penultimate layer positively correlated with the memorability prediction\naccuracy of the models. This analysis also showed that the best fine-tuned\nmodel provided accuracy comparable to state-of-the-art CNN models developed for\nmemorability prediction. Overall, the results of this study indicated that the\nCNN models' great success in predicting memorability relies on feature\nrepresentation acquisition, similar to the IT cortex. This study advances our\nunderstanding of feature representations and their use in predicting image\nmemorability.\n","authors":["Takumi Harada","Hiroyuki Sakai"],"pdf_url":"https://arxiv.org/pdf/2303.07679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11629v3","updated":"2023-07-17T03:33:14Z","published":"2022-11-21T16:43:33Z","title":"PVT++: A Simple End-to-End Latency-Aware Visual Tracking Framework","summary":" Visual object tracking is essential to intelligent robots. Most existing\napproaches have ignored the online latency that can cause severe performance\ndegradation during real-world processing. Especially for unmanned aerial\nvehicles (UAVs), where robust tracking is more challenging and onboard\ncomputation is limited, the latency issue can be fatal. In this work, we\npresent a simple framework for end-to-end latency-aware tracking, i.e.,\nend-to-end predictive visual tracking (PVT++). 
Unlike existing solutions that\nnaively append Kalman Filters after trackers, PVT++ can be jointly optimized,\nso that it takes not only motion information but can also leverage the rich\nvisual knowledge in most pre-trained tracker models for robust prediction.\nBesides, to bridge the training-evaluation domain gap, we propose a relative\nmotion factor, empowering PVT++ to generalize to the challenging and complex\nUAV tracking scenes. These careful designs have made the small-capacity\nlightweight PVT++ a widely effective solution. Additionally, this work presents\nan extended latency-aware evaluation benchmark for assessing an any-speed\ntracker in the online setting. Empirical results on a robotic platform from the\naerial perspective show that PVT++ can achieve significant performance gain on\nvarious trackers and exhibit higher accuracy than prior solutions, largely\nmitigating the degradation brought by latency.\n","authors":["Bowen Li","Ziyuan Huang","Junjie Ye","Yiming Li","Sebastian Scherer","Hang Zhao","Changhong Fu"],"pdf_url":"https://arxiv.org/pdf/2211.11629v3.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2301.08125v2","updated":"2023-07-17T03:00:47Z","published":"2023-01-19T15:38:43Z","title":"Diagnose Like a Pathologist: Transformer-Enabled Hierarchical\n Attention-Guided Multiple Instance Learning for Whole Slide Image\n Classification","summary":" Multiple Instance Learning (MIL) and transformers are increasingly popular in\nhistopathology Whole Slide Image (WSI) classification. However, unlike human\npathologists who selectively observe specific regions of histopathology tissues\nunder different magnifications, most methods do not incorporate multiple\nresolutions of the WSIs, hierarchically and attentively, thereby leading to a\nloss of focus on the WSIs and information from other resolutions. To resolve\nthis issue, we propose a Hierarchical Attention-Guided Multiple Instance\nLearning framework to fully exploit the WSIs. This framework can dynamically\nand attentively discover the discriminative regions across multiple resolutions\nof the WSIs. Within this framework, an Integrated Attention Transformer is\nproposed to further enhance the performance of the transformer and obtain a\nmore holistic WSI (bag) representation. This transformer consists of multiple\nIntegrated Attention Modules, which is the combination of a transformer layer\nand an aggregation module that produces a bag representation based on every\ninstance representation in that bag. The experimental results show that our\nmethod achieved state-of-the-art performances on multiple datasets, including\nCamelyon16, TCGA-RCC, TCGA-NSCLC, and an in-house IMGC dataset. The code is\navailable at https://github.com/BearCleverProud/HAG-MIL.\n","authors":["Conghao Xiong","Hao Chen","Joseph J. Y. Sung","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2301.08125v2.pdf","comment":"Accepted to IJCAI2023"},{"id":"http://arxiv.org/abs/2307.08209v1","updated":"2023-07-17T02:58:51Z","published":"2023-07-17T02:58:51Z","title":"Ada3D : Exploiting the Spatial Redundancy with Adaptive Inference for\n Efficient 3D Object Detection","summary":" Voxel-based methods have achieved state-of-the-art performance for 3D object\ndetection in autonomous driving. However, their significant computational and\nmemory costs pose a challenge for their application to resource-constrained\nvehicles. 
One reason for this high resource consumption is the presence of a\nlarge number of redundant background points in Lidar point clouds, resulting in\nspatial redundancy in both 3D voxel and dense BEV map representations. To\naddress this issue, we propose an adaptive inference framework called Ada3D,\nwhich focuses on exploiting the input-level spatial redundancy. Ada3D\nadaptively filters the redundant input, guided by a lightweight importance\npredictor and the unique properties of the Lidar point cloud. Additionally, we\nutilize the BEV features' intrinsic sparsity by introducing the Sparsity\nPreserving Batch Normalization. With Ada3D, we achieve 40% reduction for 3D\nvoxels and decrease the density of 2D BEV feature maps from 100% to 20% without\nsacrificing accuracy. Ada3D reduces the model computational and memory cost by\n5x, and achieves 1.52x/1.45x end-to-end GPU latency and 1.5x/4.5x GPU peak\nmemory optimization for the 3D and 2D backbone respectively.\n","authors":["Tianchen Zhao","Xuefei Ning","Ke Hong","Zhongyuan Qiu","Pu Lu","Yali Zhao","Linfeng Zhang","Lipu Zhou","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08209v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2301.06719v4","updated":"2023-07-17T02:40:42Z","published":"2023-01-17T06:24:08Z","title":"FemtoDet: An Object Detection Baseline for Energy Versus Performance\n Tradeoffs","summary":" Efficient detectors for edge devices are often optimized for parameters or\nspeed count metrics, which remain in weak correlation with the energy of\ndetectors.\n However, some vision applications of convolutional neural networks, such as\nalways-on surveillance cameras, are critical for energy constraints.\n This paper aims to serve as a baseline by designing detectors to reach\ntradeoffs between energy and performance from two perspectives:\n 1) We extensively analyze various CNNs to identify low-energy architectures,\nincluding selecting activation functions, convolutions operators, and feature\nfusion structures on necks. 
These underappreciated details in past work\nseriously affect the energy consumption of detectors;\n 2) To break through the dilemmatic energy-performance problem, we propose a\nbalanced detector driven by energy using discovered low-energy components named\n\\textit{FemtoDet}.\n In addition to the novel construction, we improve FemtoDet by considering\nconvolutions and training strategy optimizations.\n Specifically, we develop a new instance boundary enhancement (IBE) module for\nconvolution optimization to overcome the contradiction between the limited\ncapacity of CNNs and detection tasks in diverse spatial representations, and\npropose a recursive warm-restart (RecWR) for optimizing training strategy to\nescape the sub-optimization of light-weight detectors by considering the data\nshift produced in popular augmentations.\n As a result, FemtoDet with only 68.77k parameters achieves a competitive\nscore of 46.3 AP50 on PASCAL VOC and 1.11 W $\\&$ 64.47 FPS on Qualcomm\nSnapdragon 865 CPU platforms.\n Extensive experiments on COCO and TJU-DHD datasets indicate that the proposed\nmethod achieves competitive results in diverse scenes.\n","authors":["Peng Tu","Xu Xie","Guo AI","Yuexiang Li","Yawen Huang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2301.06719v4.pdf","comment":"15 pages, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08199v1","updated":"2023-07-17T02:03:17Z","published":"2023-07-17T02:03:17Z","title":"Manifold-Guided Sampling in Diffusion Models for Unbiased Image\n Generation","summary":" Diffusion models are a powerful class of generative models that can produce\nhigh-quality images, but they may suffer from data bias. Data bias occurs when\nthe training data does not reflect the true distribution of the data domain,\nbut rather exhibits some skewed or imbalanced patterns. For example, the CelebA\ndataset contains more female images than male images, which can lead to biased\ngeneration results and affect downstream applications. In this paper, we\npropose a novel method to mitigate data bias in diffusion models by applying\nmanifold guidance. Our key idea is to estimate the manifold of the training\ndata using a learnable information-theoretic approach, and then use it to guide\nthe sampling process of diffusion models. In this way, we can encourage the\ngenerated images to be uniformly distributed on the data manifold, without\nchanging the model architecture or requiring labels or retraining. We provide\ntheoretical analysis and empirical evidence to show that our method can improve\nthe quality and unbiasedness of image generation compared to standard diffusion\nmodels.\n","authors":["Xingzhe Su","Wenwen Qiang","Zeen Song","Hang Gao","Fengge Wu","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08198v1","updated":"2023-07-17T01:59:14Z","published":"2023-07-17T01:59:14Z","title":"On Point Affiliation in Feature Upsampling","summary":" We introduce the notion of point affiliation into feature upsampling. By\nabstracting a feature map into non-overlapped semantic clusters formed by\npoints of identical semantic meaning, feature upsampling can be viewed as point\naffiliation -- designating a semantic cluster for each upsampled point. In the\nframework of kernel-based dynamic upsampling, we show that an upsampled point\ncan resort to its low-res decoder neighbors and high-res encoder point to\nreason the affiliation, conditioned on the mutual similarity between them. 
We\ntherefore present a generic formulation for generating similarity-aware\nupsampling kernels and prove that such kernels encourage not only semantic\nsmoothness but also boundary sharpness. This formulation constitutes a novel,\nlightweight, and universal upsampling solution, Similarity-Aware Point\nAffiliation (SAPA). We show its working mechanism via our preliminary designs\nwith window-shape kernel. After probing the limitations of the designs on\nobject detection, we reveal additional insights for upsampling, leading to SAPA\nwith the dynamic kernel shape. Extensive experiments demonstrate that SAPA\noutperforms prior upsamplers and invites consistent performance improvements on\na number of dense prediction tasks, including semantic segmentation, object\ndetection, instance segmentation, panoptic segmentation, image matting, and\ndepth estimation. Code is made available at: https://github.com/tiny-smart/sapa\n","authors":["Wenze Liu","Hao Lu","Yuliang Liu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2307.08198v1.pdf","comment":"17 pages. Extended version of NeurIPS 2022 paper \"SAPA:\n Similarity-Aware Point Affiliation for Feature Upsampling\" at\n arXiv:2209.12866v1. arXiv admin note: text overlap with arXiv:2209.12866"},{"id":"http://arxiv.org/abs/2307.08182v1","updated":"2023-07-17T00:56:21Z","published":"2023-07-17T00:56:21Z","title":"Zero-Shot Image Harmonization with Generative Model Prior","summary":" Recent image harmonization methods have demonstrated promising results.\nHowever, due to their heavy reliance on a large number of composite images,\nthese works are expensive in the training phase and often fail to generalize to\nunseen images. In this paper, we draw lessons from human behavior and come up\nwith a zero-shot image harmonization method. Specifically, in the harmonization\nprocess, a human mainly utilizes his long-term prior on harmonious images and\nmakes a composite image close to that prior. To imitate that, we resort to\npretrained generative models for the prior of natural images. For the guidance\nof the harmonization direction, we propose an Attention-Constraint Text which\nis optimized to well illustrate the image environments. Some further designs\nare introduced for preserving the foreground content structure. The resulting\nframework, highly consistent with human behavior, can achieve harmonious\nresults without burdensome training. Extensive experiments have demonstrated\nthe effectiveness of our approach, and we have also explored some interesting\napplications.\n","authors":["Jianqi Chen","Zhengxia Zou","Yilan Zhang","Keyan Chen","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2307.08182v1.pdf","comment":"Code Page: https://github.com/WindVChen/Diff-Harmonization"},{"id":"http://arxiv.org/abs/2011.04408v7","updated":"2023-07-17T23:30:43Z","published":"2020-11-09T13:24:45Z","title":"SeasonDepth: Cross-Season Monocular Depth Prediction Dataset and\n Benchmark under Multiple Environments","summary":" Different environments pose a great challenge to the outdoor robust visual\nperception for long-term autonomous driving, and the generalization of\nlearning-based algorithms on different environments is still an open problem.\nAlthough monocular depth prediction has been well studied recently, few works\nfocus on the robustness of learning-based depth prediction across different\nenvironments, e.g. changing illumination and seasons, owing to the lack of such\na multi-environment real-world dataset and benchmark. 
To this end, the first\ncross-season monocular depth prediction dataset and benchmark, SeasonDepth, is\nintroduced to benchmark the depth estimation performance under different\nenvironments. We investigate several state-of-the-art representative\nopen-source supervised and self-supervised depth prediction methods using\nnewly-formulated metrics. Through extensive experimental evaluation on the\nproposed dataset and cross-dataset evaluation with current autonomous driving\ndatasets, the performance and robustness against the influence of multiple\nenvironments are analyzed qualitatively and quantitatively. We show that\nlong-term monocular depth prediction is still challenging and believe our work\ncan boost further research on the long-term robustness and generalization for\noutdoor visual perception. The dataset is available on\nhttps://seasondepth.github.io, and the benchmark toolkit is available on\nhttps://github.com/ SeasonDepth/SeasonDepth.\n","authors":["Hanjiang Hu","Baoquan Yang","Zhijian Qiao","Shiqi Liu","Jiacheng Zhu","Zuxin Liu","Wenhao Ding","Ding Zhao","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2011.04408v7.pdf","comment":"Accepted by IROS 2023, 23 pages, 13 figures, 10 tables"},{"id":"http://arxiv.org/abs/2305.13399v4","updated":"2023-07-17T22:31:33Z","published":"2023-05-22T18:25:03Z","title":"Efficient Large-Scale Visual Representation Learning And Evaluation","summary":" In this article, we present our approach to single-modality visual\nrepresentation learning. Understanding visual representations of items is vital\nfor fashion recommendations in e-commerce. We detail and contrast techniques\nused to finetune large-scale visual representation learning models in an\nefficient manner under low-resource settings, including several pretrained\nbackbone architectures, both in the convolutional neural network as well as the\nvision transformer family. We describe the challenges for e-commerce\napplications at-scale and highlight the efforts to more efficiently train,\nevaluate, and serve visual representations. We present ablation studies\nevaluating the representation offline performance for several downstream tasks,\nincluding visually similar ad recommendations on mobile devices. To this end,\nwe present a novel multilingual text-to-image generative offline evaluation\nmethod for visually similar recommendation systems. Finally, we include online\nresults from deployed machine learning systems in production at Etsy.\n","authors":["Eden Dolev","Alaa Awad","Denisa Roberts","Zahra Ebrahimzadeh","Marcin Mejran","Vaibhav Malpani","Mahir Yavuz"],"pdf_url":"https://arxiv.org/pdf/2305.13399v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08880v1","updated":"2023-07-17T22:28:16Z","published":"2023-07-17T22:28:16Z","title":"Modular Neural Network Approaches for Surgical Image Recognition","summary":" Deep learning-based applications have seen a lot of success in recent years.\nText, audio, image, and video have all been explored with great success using\ndeep learning approaches. The use of convolutional neural networks (CNN) in\ncomputer vision, in particular, has yielded reliable results. In order to\nachieve these results, a large amount of data is required. However, the dataset\ncannot always be accessible. Moreover, annotating data can be difficult and\ntime-consuming. Self-training is a semi-supervised approach that managed to\nalleviate this problem and achieve state-of-the-art performances. 
Theoretical\nanalysis even proved that it may result in a better generalization than a\nnormal classifier. Another problem neural networks can face is the increasing\ncomplexity of modern problems, requiring a high computational and storage cost.\nOne way to mitigate this issue, a strategy that has been inspired by human\ncognition known as modular learning, can be employed. The principle of the\napproach is to decompose a complex problem into simpler sub-tasks. This\napproach has several advantages, including faster learning, better\ngeneralization, and enables interpretability.\n In the first part of this paper, we introduce and evaluate different\narchitectures of modular learning for Dorsal Capsulo-Scapholunate Septum (DCSS)\ninstability classification. Our experiments have shown that modular learning\nimproves performances compared to non-modular systems. Moreover, we found that\nweighted modular, that is to weight the output using the probabilities from the\ngating module, achieved an almost perfect classification. In the second part,\nwe present our approach for data labeling and segmentation with self-training\napplied on shoulder arthroscopy images.\n","authors":["Nosseiba Ben Salem","Younes Bennani","Joseph Karkazan","Abir Barbara","Charles Dacheux","Thomas Gregory"],"pdf_url":"https://arxiv.org/pdf/2307.08880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08850v1","updated":"2023-07-17T21:22:17Z","published":"2023-07-17T21:22:17Z","title":"LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception\n Network for Autonomous Driving","summary":" LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR\nperception has the largest body of literature after camera perception. However,\nmulti-task learning across tasks like detection, segmentation, and motion\nestimation using LiDAR remains relatively unexplored, especially on\nautomotive-grade embedded platforms. We present a real-time multi-task\nconvolutional neural network for LiDAR-based object detection, semantics, and\nmotion segmentation. The unified architecture comprises a shared encoder and\ntask-specific decoders, enabling joint representation learning. We propose a\nnovel Semantic Weighting and Guidance (SWAG) module to transfer semantic\nfeatures for improved object detection selectively. Our heterogeneous training\nscheme combines diverse datasets and exploits complementary cues between tasks.\nThe work provides the first embedded implementation unifying these key\nperception tasks from LiDAR point clouds achieving 3ms latency on the embedded\nNVIDIA Xavier platform. We achieve state-of-the-art results for two tasks,\nsemantic and motion segmentation, and close to state-of-the-art performance for\n3D object detection. By maximizing hardware efficiency and leveraging\nmulti-task synergies, our method delivers an accurate and efficient solution\ntailored for real-world automated driving deployment. 
Qualitative results can\nbe seen at https://youtu.be/H-hWRzv2lIY.\n","authors":["Sambit Mohapatra","Senthil Yogamani","Varun Ravi Kumar","Stefan Milz","Heinrich Gotzig","Patrick Mäder"],"pdf_url":"https://arxiv.org/pdf/2307.08850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12306v2","updated":"2023-07-17T21:10:31Z","published":"2023-04-24T17:56:12Z","title":"Segment Anything in Medical Images","summary":" Medical image segmentation is a critical component in clinical practice,\nfacilitating accurate diagnosis, treatment planning, and disease monitoring.\nHowever, current methods predominantly rely on customized models, which exhibit\nlimited generality across diverse tasks. In this study, we present MedSAM, the\ninaugural foundation model designed for universal medical image segmentation.\nHarnessing the power of a meticulously curated dataset comprising over one\nmillion images, MedSAM not only outperforms existing state-of-the-art\nsegmentation foundation models, but also exhibits comparable or even superior\nperformance to specialist models. Moreover, MedSAM enables the precise\nextraction of essential biomarkers for tumor burden quantification. By\ndelivering accurate and efficient segmentation across a wide spectrum of tasks,\nMedSAM holds significant potential to expedite the evolution of diagnostic\ntools and the personalization of treatment plans.\n","authors":["Jun Ma","Yuting He","Feifei Li","Lin Han","Chenyu You","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2304.12306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08837v1","updated":"2023-07-17T20:57:16Z","published":"2023-07-17T20:57:16Z","title":"DARTS: Double Attention Reference-based Transformer for Super-resolution","summary":" We present DARTS, a transformer model for reference-based image\nsuper-resolution. DARTS learns joint representations of two image distributions\nto enhance the content of low-resolution input images through matching\ncorrespondences learned from high-resolution reference images. Current\nstate-of-the-art techniques in reference-based image super-resolution are based\non a multi-network, multi-stage architecture. In this work, we adapt the double\nattention block from the GAN literature, processing the two visual streams\nseparately and combining self-attention and cross-attention blocks through a\ngating attention strategy. Our work demonstrates how the attention mechanism\ncan be adapted for the particular requirements of reference-based image\nsuper-resolution, significantly simplifying the architecture and training\npipeline. We show that our transformer-based model performs competitively with\nstate-of-the-art models, while maintaining a simpler overall architecture and\ntraining process. In particular, we obtain state-of-the-art on the SUN80\ndataset, with a PSNR/SSIM of 29.83 / .809. These results show that attention\nalone is sufficient for the RSR task, without multiple purpose-built\nsubnetworks, knowledge distillation, or multi-stage training.\n","authors":["Masoomeh Aslahishahri","Jordan Ubbens","Ian Stavness"],"pdf_url":"https://arxiv.org/pdf/2307.08837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13108v2","updated":"2023-07-17T20:29:17Z","published":"2022-10-24T10:48:53Z","title":"Heat Demand Forecasting with Multi-Resolutional Representation of\n Heterogeneous Temporal Ensemble","summary":" One of the primal challenges faced by utility companies is ensuring efficient\nsupply with minimal greenhouse gas emissions. 
The advent of smart meters and\nsmart grids provide an unprecedented advantage in realizing an optimised supply\nof thermal energies through proactive techniques such as load forecasting. In\nthis paper, we propose a forecasting framework for heat demand based on neural\nnetworks where the time series are encoded as scalograms equipped with the\ncapacity of embedding exogenous variables such as weather, and\nholiday/non-holiday. Subsequently, CNNs are utilized to predict the heat load\nmulti-step ahead. Finally, the proposed framework is compared with other\nstate-of-the-art methods, such as SARIMAX and LSTM. The quantitative results\nfrom retrospective experiments show that the proposed framework consistently\noutperforms the state-of-the-art baseline method with real-world data acquired\nfrom Denmark. A minimal mean error of 7.54% for MAPE and 417kW for RMSE is\nachieved with the proposed framework in comparison to all other methods.\n","authors":["Adithya Ramachandran","Satyaki Chatterjee","Siming Bayer","Andreas Maier","Thorkil Flensmark"],"pdf_url":"https://arxiv.org/pdf/2210.13108v2.pdf","comment":"https://www.climatechange.ai/papers/neurips2022/46"},{"id":"http://arxiv.org/abs/2305.04422v2","updated":"2023-07-17T20:18:38Z","published":"2023-05-08T02:28:45Z","title":"Performance Gaps of Artificial Intelligence Models Screening Mammography\n -- Towards Fair and Interpretable Models","summary":" Even though deep learning models for abnormality classification can perform\nwell in screening mammography, the demographic and imaging characteristics\nassociated with increased risk of failure for abnormality classification in\nscreening mammograms remain unclear. This retrospective study used data from\nthe Emory BrEast Imaging Dataset (EMBED) including mammograms from 115,931\npatients imaged at Emory University Healthcare between 2013 to 2020. Clinical\nand imaging data includes Breast Imaging Reporting and Data System (BI-RADS)\nassessment, region of interest coordinates for abnormalities, imaging features,\npathologic outcomes, and patient demographics. Deep learning models including\nInceptionV3, VGG16, ResNet50V2, and ResNet152V2 were developed to distinguish\nbetween patches of abnormal tissue and randomly selected patches of normal\ntissue from the screening mammograms. The distributions of the training,\nvalidation and test sets are 29,144 (55.6%) patches of 10,678 (54.2%) patients,\n9,910 (18.9%) patches of 3,609 (18.3%) patients, and 13,390 (25.5%) patches of\n5,404 (27.5%) patients. We assessed model performance overall and within\nsubgroups defined by age, race, pathologic outcome, and imaging characteristics\nto evaluate reasons for misclassifications. On the test set, a ResNet152V2\nmodel trained to classify normal versus abnormal tissue patches achieved an\naccuracy of 92.6% (95%CI=92.0-93.2%), and area under the receiver operative\ncharacteristics curve 0.975 (95%CI=0.972-0.978). Imaging characteristics\nassociated with higher misclassifications of images include higher tissue\ndensities (risk ratio [RR]=1.649; p=.010, BI-RADS density C and RR=2.026;\np=.003, BI-RADS density D), and presence of architectural distortion (RR=1.026;\np<.001). 
Small but statistically significant differences in performance were\nobserved by age, race, pathologic outcome, and other imaging features (p<.001).\n","authors":["Linglin Zhang","Beatrice Brown-Mulry","Vineela Nalla","InChan Hwang","Judy Wawira Gichoya","Aimilia Gastounioti","Imon Banerjee","Laleh Seyyed-Kalantari","MinJae Woo","Hari Trivedi"],"pdf_url":"https://arxiv.org/pdf/2305.04422v2.pdf","comment":"21 pages, 4 tables, 5 figures, 2 supplemental table and 1\n supplemental figure"},{"id":"http://arxiv.org/abs/2304.03209v2","updated":"2023-07-17T19:52:20Z","published":"2023-04-06T16:44:03Z","title":"Implicit Anatomical Rendering for Medical Image Segmentation with\n Stochastic Experts","summary":" Integrating high-level semantically correlated contents and low-level\nanatomical features is of central importance in medical image segmentation.\nTowards this end, recent deep learning-based medical segmentation methods have\nshown great promise in better modeling such information. However, convolution\noperators for medical segmentation typically operate on regular grids, which\ninherently blur the high-frequency regions, i.e., boundary regions. In this\nwork, we propose MORSE, a generic implicit neural rendering framework designed\nat an anatomical level to assist learning in medical image segmentation. Our\nmethod is motivated by the fact that implicit neural representation has been\nshown to be more effective in fitting complex signals and solving computer\ngraphics problems than discrete grid-based representation. The core of our\napproach is to formulate medical image segmentation as a rendering problem in\nan end-to-end manner. Specifically, we continuously align the coarse\nsegmentation prediction with the ambiguous coordinate-based point\nrepresentations and aggregate these features to adaptively refine the boundary\nregion. To parallelly optimize multi-scale pixel-level features, we leverage\nthe idea from Mixture-of-Expert (MoE) to design and train our MORSE with a\nstochastic gating mechanism. Our experiments demonstrate that MORSE can work\nwell with different medical segmentation backbones, consistently achieving\ncompetitive performance improvements in both 2D and 3D supervised medical\nsegmentation methods. We also theoretically analyze the superiority of MORSE.\n","authors":["Chenyu You","Weicheng Dai","Yifei Min","Lawrence Staib","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2304.03209v2.pdf","comment":"Accepted at International Conference on Medical Image Computing and\n Computer-Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2304.02689v3","updated":"2023-07-17T19:51:31Z","published":"2023-04-05T18:33:18Z","title":"ACTION++: Improving Semi-supervised Medical Image Segmentation with\n Adaptive Anatomical Contrast","summary":" Medical data often exhibits long-tail distributions with heavy class\nimbalance, which naturally leads to difficulty in classifying the minority\nclasses (i.e., boundary regions or rare objects). Recent work has significantly\nimproved semi-supervised medical image segmentation in long-tailed scenarios by\nequipping them with unsupervised contrastive criteria. However, it remains\nunclear how well they will perform in the labeled portion of data where class\ndistribution is also highly imbalanced. In this work, we present ACTION++, an\nimproved contrastive learning framework with adaptive anatomical contrast for\nsemi-supervised medical segmentation. 
Specifically, we propose an adaptive\nsupervised contrastive loss, where we first compute the optimal locations of\nclass centers uniformly distributed on the embedding space (i.e., off-line),\nand then perform online contrastive matching training by encouraging different\nclass features to adaptively match these distinct and uniformly distributed\nclass centers. Moreover, we argue that blindly adopting a constant temperature\n$\\tau$ in the contrastive loss on long-tailed medical data is not optimal, and\npropose to use a dynamic $\\tau$ via a simple cosine schedule to yield better\nseparation between majority and minority classes. Empirically, we evaluate\nACTION++ on ACDC and LA benchmarks and show that it achieves state-of-the-art\nacross two semi-supervised settings. Theoretically, we analyze the performance\nof adaptive anatomical contrast and confirm its superiority in label\nefficiency.\n","authors":["Chenyu You","Weicheng Dai","Yifei Min","Lawrence Staib","Jasjeet S. Sekhon","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2304.02689v3.pdf","comment":"Accepted by International Conference on Medical Image Computing and\n Computer-Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2307.08809v1","updated":"2023-07-17T19:50:37Z","published":"2023-07-17T19:50:37Z","title":"Local or Global: Selective Knowledge Assimilation for Federated Learning\n with Limited Labels","summary":" Many existing FL methods assume clients with fully-labeled data, while in\nrealistic settings, clients have limited labels due to the expensive and\nlaborious process of labeling. Limited labeled local data of the clients often\nleads to their local model having poor generalization abilities to their larger\nunlabeled local data, such as having class-distribution mismatch with the\nunlabeled data. As a result, clients may instead look to benefit from the\nglobal model trained across clients to leverage their unlabeled data, but this\nalso becomes difficult due to data heterogeneity across clients. In our work,\nwe propose FedLabel where clients selectively choose the local or global model\nto pseudo-label their unlabeled data depending on which is more of an expert of\nthe data. We further utilize both the local and global models' knowledge via\nglobal-local consistency regularization which minimizes the divergence between\nthe two models' outputs when they have identical pseudo-labels for the\nunlabeled data. Unlike other semi-supervised FL baselines, our method does not\nrequire additional experts other than the local or global model, nor require\nadditional parameters to be communicated. We also do not assume any\nserver-labeled data or fully labeled clients. 
For both cross-device and\ncross-silo settings, we show that FedLabel outperforms other semi-supervised FL\nbaselines by $8$-$24\\%$, and even outperforms standard fully supervised FL\nbaselines ($100\\%$ labeled data) with only $5$-$20\\%$ of labeled data.\n","authors":["Yae Jee Cho","Gauri Joshi","Dimitrios Dimitriadis"],"pdf_url":"https://arxiv.org/pdf/2307.08809v1.pdf","comment":"To appear in the proceedings of ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08789v1","updated":"2023-07-17T19:17:10Z","published":"2023-07-17T19:17:10Z","title":"Harnessing the Power of AI based Image Generation Model DALLE 2 in\n Agricultural Settings","summary":" This study investigates the potential impact of artificial intelligence (AI)\non the enhancement of visualization processes in the agricultural sector, using\nthe advanced AI image generator, DALLE 2, developed by OpenAI. By\nsynergistically utilizing the natural language processing proficiency of\nchatGPT and the generative prowess of the DALLE 2 model, which employs a\nGenerative Adversarial Networks (GANs) framework, our research offers an\ninnovative method to transform textual descriptors into realistic visual\ncontent. Our rigorously assembled datasets include a broad spectrum of\nagricultural elements such as fruits, plants, and scenarios differentiating\ncrops from weeds, maintained for AI-generated versus original images. The\nquality and accuracy of the AI-generated images were evaluated via established\nmetrics including mean squared error (MSE), peak signal-to-noise ratio (PSNR),\nand feature similarity index (FSIM). The results underline the significant role\nof the DALLE 2 model in enhancing visualization processes in agriculture,\naiding in more informed decision-making, and improving resource distribution.\nThe outcomes of this research highlight the imminent rise of an AI-led\ntransformation in the realm of precision agriculture.\n","authors":["Ranjan Sapkota"],"pdf_url":"https://arxiv.org/pdf/2307.08789v1.pdf","comment":"22 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.08781v1","updated":"2023-07-17T18:50:53Z","published":"2023-07-17T18:50:53Z","title":"The FathomNet2023 Competition Dataset","summary":" Ocean scientists have been collecting visual data to study marine organisms\nfor decades. These images and videos are extremely valuable both for basic\nscience and environmental monitoring tasks. There are tools for automatically\nprocessing these data, but none that are capable of handling the extreme\nvariability in sample populations, image quality, and habitat characteristics\nthat are common in visual sampling of the ocean. Such distribution shifts can\noccur over very short physical distances and in narrow time windows. Creating\nmodels that are able to recognize when an image or video sequence contains a\nnew organism, an unusual collection of animals, or is otherwise out-of-sample\nis critical to fully leverage visual data in the ocean. The FathomNet2023\ncompetition dataset presents a realistic scenario where the set of animals in\nthe target data differs from the training data. 
The challenge is both to\nidentify the organisms in a target image and assess whether it is\nout-of-sample.\n","authors":["Eric Orenstein","Kevin Barnard","Lonny Lundsten","Geneviève Patterson","Benjamin Woodward","Kakani Katija"],"pdf_url":"https://arxiv.org/pdf/2307.08781v1.pdf","comment":"Competition was presented as part of the 10th Fine Grained Visual\n Categorization workshop at the 2023 Computer Vision and Pattern Recognition\n conference. 4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.08779v1","updated":"2023-07-17T18:50:15Z","published":"2023-07-17T18:50:15Z","title":"Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation","summary":" Low-light conditions not only hamper human visual experience but also degrade\nthe model's performance on downstream vision tasks. While existing works make\nremarkable progress on day-night domain adaptation, they rely heavily on domain\nknowledge derived from the task-specific nighttime dataset. This paper\nchallenges a more complicated scenario with broader applicability, i.e.,\nzero-shot day-night domain adaptation, which eliminates reliance on any\nnighttime data. Unlike prior zero-shot adaptation approaches emphasizing either\nimage-level translation or model-level adaptation, we propose a similarity\nmin-max paradigm that considers them under a unified framework. On the image\nlevel, we darken images towards minimum feature similarity to enlarge the\ndomain gap. Then on the model level, we maximize the feature similarity between\nthe darkened images and their normal-light counterparts for better model\nadaptation. To the best of our knowledge, this work represents the pioneering\neffort in jointly optimizing both aspects, resulting in a significant\nimprovement of model generalizability. Extensive experiments demonstrate our\nmethod's effectiveness and broad applicability on various nighttime vision\ntasks, including classification, semantic segmentation, visual place\nrecognition, and video action recognition. Code and pre-trained models are\navailable at https://red-fairy.github.io/ZeroShotDayNightDA-Webpage/.\n","authors":["Rundong Luo","Wenjing Wang","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08779v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08771v1","updated":"2023-07-17T18:31:25Z","published":"2023-07-17T18:31:25Z","title":"UPSCALE: Unconstrained Channel Pruning","summary":" As neural networks grow in size and complexity, inference speeds decline. To\ncombat this, one of the most effective compression techniques -- channel\npruning -- removes channels from weights. However, for multi-branch segments of\na model, channel removal can introduce inference-time memory copies. In turn,\nthese copies increase inference latency -- so much so that the pruned model can\nbe slower than the unpruned model. As a workaround, pruners conventionally\nconstrain certain channels to be pruned together. This fully eliminates memory\ncopies but, as we show, significantly impairs accuracy. We now have a dilemma:\nRemove constraints but increase latency, or add constraints and impair\naccuracy. In response, our insight is to reorder channels at export time, (1)\nreducing latency by reducing memory copies and (2) improving accuracy by\nremoving constraints. Using this insight, we design a generic algorithm UPSCALE\nto prune models with any pruning pattern. 
By removing constraints from existing\npruners, we improve ImageNet accuracy for post-training pruned models by 2.1\npoints on average -- benefiting DenseNet (+16.9), EfficientNetV2 (+7.9), and\nResNet (+6.2). Furthermore, by reordering channels, UPSCALE improves inference\nspeeds by up to 2x over a baseline export.\n","authors":["Alvin Wan","Hanxiang Hao","Kaushik Patnaik","Yueyang Xu","Omer Hadad","David Güera","Zhile Ren","Qi Shan"],"pdf_url":"https://arxiv.org/pdf/2307.08771v1.pdf","comment":"29 pages, 26 figures, accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2307.08763v1","updated":"2023-07-17T18:19:36Z","published":"2023-07-17T18:19:36Z","title":"Video-Mined Task Graphs for Keystep Recognition in Instructional Videos","summary":" Procedural activity understanding requires perceiving human actions in terms\nof a broader task, where multiple keysteps are performed in sequence across a\nlong video to reach a final goal state -- such as the steps of a recipe or a\nDIY fix-it task. Prior work largely treats keystep recognition in isolation of\nthis broader structure, or else rigidly confines keysteps to align with a\npredefined sequential script. We propose discovering a task graph automatically\nfrom how-to videos to represent probabilistically how people tend to execute\nkeysteps, and then leverage this graph to regularize keystep recognition in\nnovel videos. On multiple datasets of real-world instructional videos, we show\nthe impact: more reliable zero-shot keystep localization and improved video\nrepresentation learning, exceeding the state of the art.\n","authors":["Kumar Ashutosh","Santhosh Kumar Ramakrishnan","Triantafyllos Afouras","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.08763v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.08727v1","updated":"2023-07-17T17:48:06Z","published":"2023-07-17T17:48:06Z","title":"Semantic Counting from Self-Collages","summary":" While recent supervised methods for reference-based object counting continue\nto improve the performance on benchmark datasets, they have to rely on small\ndatasets due to the cost associated with manually annotating dozens of objects\nin images. We propose Unsupervised Counter (UnCo), a model that can learn this\ntask without requiring any manual annotations. To this end, we construct\n\"SelfCollages\", images with various pasted objects as training samples, that\nprovide a rich learning signal covering arbitrary object types and counts. Our\nmethod builds on existing unsupervised representations and segmentation\ntechniques to successfully demonstrate the ability to count objects without\nmanual supervision. Our experiments show that our method not only outperforms\nsimple baselines and generic models such as FasterRCNN, but also matches the\nperformance of supervised counting models in some domains.\n","authors":["Lukas Knobel","Tengda Han","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2307.08727v1.pdf","comment":"24 pages. Code available at\n https://github.com/lukasknobel/SelfCollages"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.08669v1","updated":"2023-07-17T17:32:30Z","published":"2023-07-17T17:32:30Z","title":"Leveraging Recommender Systems to Reduce Content Gaps on Peer Production\n Platforms","summary":" Peer production platforms like Wikipedia commonly suffer from content gaps.\nPrior research suggests recommender systems can help solve this problem, by\nguiding editors towards underrepresented topics. 
However, it remains unclear\nwhether this approach would result in less relevant recommendations, leading to\nreduced overall engagement with recommended items. To answer this question, we\nfirst conducted offline analyses (Study 1) on SuggestBot, a task-routing\nrecommender system for Wikipedia, then did a three-month controlled experiment\n(Study 2). Our results show that presenting users with articles from\nunderrepresented topics increased the proportion of work done on those articles\nwithout significantly reducing overall recommendation uptake. We discuss the\nimplications of our results, including how ignoring the article discovery\nprocess can artificially narrow recommendations. We draw parallels between this\nphenomenon and the common issue of ``filter bubbles'' to show how any platform\nthat employs recommender systems is susceptible to it.\n","authors":["Mo Houtti","Isaac Johnson","Loren Terveen"],"pdf_url":"https://arxiv.org/pdf/2307.08669v1.pdf","comment":"To appear at the 18th International AAAI Conference on Web and Social\n Media (ICWSM 2024)"},{"id":"http://arxiv.org/abs/2304.09603v3","updated":"2023-07-17T13:13:02Z","published":"2023-04-19T12:17:46Z","title":"Visualising Personal Data Flows: Insights from a Case Study of\n Booking.com","summary":" Commercial organisations are holding and processing an ever-increasing amount\nof personal data. Policies and laws are continually changing to require these\ncompanies to be more transparent regarding the collection, storage, processing\nand sharing of this data. This paper reports our work of taking Booking.com as\na case study to visualise personal data flows extracted from their privacy\npolicy. By showcasing how the company shares its consumers' personal data, we\nraise questions and extend discussions on the challenges and limitations of\nusing privacy policies to inform online users about the true scale and the\nlandscape of personal data flows. This case study can inform us about future\nresearch on more data flow-oriented privacy policy analysis and on the\nconstruction of a more comprehensive ontology on personal data flows in\ncomplicated business ecosystems.\n","authors":["Haiyue Yuan","Matthew Boakes","Xiao Ma","Dongmei Cao","Shujun Li"],"pdf_url":"https://arxiv.org/pdf/2304.09603v3.pdf","comment":"This is the full edition of a paper published in Intelligent\n Information Systems: CAiSE Forum 2023, Zaragoza, Spain, June 12-16, 2023,\n Proceedings, Lecture Notes in Business Information Processing (LNBIP), Volume\n 477, pp. 52-60, 2023, Springer Nature,\n https://link.springer.com/book/10.1007/978-3-031-34674-3"},{"id":"http://arxiv.org/abs/2307.06699v2","updated":"2023-07-17T12:21:55Z","published":"2023-07-13T11:55:03Z","title":"Parmesan: mathematical concept extraction for education","summary":" Mathematics is a highly specialized domain with its own unique set of\nchallenges that has seen limited study in natural language processing. However,\nmathematics is used in a wide variety of fields and multidisciplinary research\nin many different domains often relies on an understanding of mathematical\nconcepts. To aid researchers coming from other fields, we develop a prototype\nsystem for searching for and defining mathematical concepts in context,\nfocusing on the field of category theory. This system, Parmesan, depends on\nnatural language processing components including concept extraction, relation\nextraction, definition extraction, and entity linking. 
In developing this\nsystem, we show that existing techniques cannot be applied directly to the\ncategory theory domain, and suggest hybrid techniques that do perform well,\nthough we expect the system to evolve over time. We also provide two cleaned\nmathematical corpora that power the prototype system, which are based on\njournal articles and wiki pages, respectively. The corpora have been annotated\nwith dependency trees, lemmas, and part-of-speech tags.\n","authors":["Jacob Collard","Valeria de Paiva","Eswaran Subrahmanian"],"pdf_url":"https://arxiv.org/pdf/2307.06699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08303v1","updated":"2023-07-17T07:55:47Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08259v1","updated":"2023-07-17T06:03:51Z","published":"2023-07-17T06:03:51Z","title":"Measuring Item Global Residual Value for Fair Recommendation","summary":" In the era of information explosion, numerous items emerge every day,\nespecially in feed scenarios. Due to the limited system display slots and user\nbrowsing attention, various recommendation systems are designed not only to\nsatisfy users' personalized information needs but also to allocate items'\nexposure. However, recent recommendation studies mainly focus on modeling user\npreferences to present satisfying results and maximize user interactions, while\npaying little attention to developing item-side fair exposure mechanisms for\nrational information delivery. This may lead to serious resource allocation\nproblems on the item side, such as the Snowball Effect. Furthermore, unfair\nexposure mechanisms may hurt recommendation performance. In this paper, we call\nfor a shift of attention from modeling user preferences to developing fair\nexposure mechanisms for items. 
We first conduct empirical analyses of feed\nscenarios to explore exposure problems between items with distinct uploaded\ntimes. This points out that unfair exposure caused by the time factor may be\nthe major cause of the Snowball Effect. Then, we propose to explicitly model\nitem-level customized timeliness distribution, Global Residual Value (GRV), for\nfair resource allocation. This GRV module is introduced into recommendations\nwith the designed Timeliness-aware Fair Recommendation Framework (TaFR).\nExtensive experiments on two datasets demonstrate that TaFR achieves consistent\nimprovements with various backbone recommendation models. By modeling item-side\ncustomized Global Residual Value, we achieve a fairer distribution of resources\nand, at the same time, improve recommendation performance.\n","authors":["Jiayin Wang","Weizhi Ma","Chumeng Jiang","Min Zhang","Yuan Zhang","Biao Li","Peng Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.12781v3","updated":"2023-07-17T02:59:36Z","published":"2022-06-26T03:59:41Z","title":"Efficiently Leveraging Multi-level User Intent for Session-based\n Recommendation via Atten-Mixer Network","summary":" Session-based recommendation (SBR) aims to predict the user's next action\nbased on short and dynamic sessions. Recently, there has been an increasing\ninterest in utilizing various elaborately designed graph neural networks (GNNs)\nto capture the pair-wise relationships among items, seemingly suggesting the\ndesign of more complicated models is the panacea for improving the empirical\nperformance. However, these models achieve relatively marginal improvements\nwith exponential growth in model complexity. In this paper, we dissect the\nclassical GNN-based SBR models and empirically find that some sophisticated GNN\npropagations are redundant, given the readout module plays a significant role\nin GNN-based models. Based on this observation, we intuitively propose to\nremove the GNN propagation part, while the readout module will take on more\nresponsibility in the model reasoning process. To this end, we propose the\nMulti-Level Attention Mixture Network (Atten-Mixer), which leverages both\nconcept-view and instance-view readouts to achieve multi-level reasoning over\nitem transitions. As simply enumerating all possible high-level concepts is\ninfeasible for large real-world recommender systems, we further incorporate\nSBR-related inductive biases, i.e., local invariance and inherent priority to\nprune the search space. Experiments on three benchmarks demonstrate the\neffectiveness and efficiency of our proposal. We also have already launched the\nproposed techniques to a large-scale e-commercial online service since April\n2021, with significant improvements of top-tier business metrics demonstrated\nin the online experiments on live traffic.\n","authors":["Peiyan Zhang","Jiayan Guo","Chaozhuo Li","Yueqi Xie","Jaeboum Kim","Yan Zhang","Xing Xie","Haohan Wang","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2206.12781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08857v1","updated":"2023-07-17T21:32:51Z","published":"2023-07-17T21:32:51Z","title":"An Admissible Shift-Consistent Method for Recommender Systems","summary":" In this paper, we propose a new constraint, called shift-consistency, for\nsolving matrix/tensor completion problems in the context of recommender\nsystems. 
Our method provably guarantees several key mathematical properties:\n(1) satisfies a recently established admissibility criterion for recommender\nsystems; (2) satisfies a definition of fairness that eliminates a specific\nclass of potential opportunities for users to maliciously influence system\nrecommendations; and (3) offers robustness by exploiting provable uniqueness of\nmissing-value imputation. We provide a rigorous mathematical description of the\nmethod, including its generalization from matrix to tensor form to permit\nrepresentation and exploitation of complex structural relationships among sets\nof user and product attributes. We argue that our analysis suggests a\nstructured means for defining latent-space projections that can permit provable\nperformance properties to be established for machine learning methods.\n","authors":["Tung Nguyen","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2307.08857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08803v1","updated":"2023-07-17T19:38:40Z","published":"2023-07-17T19:38:40Z","title":"An Exploration Study of Mixed-initiative Query Reformulation in\n Conversational Passage Retrieval","summary":" In this paper, we report our methods and experiments for the TREC\nConversational Assistance Track (CAsT) 2022. In this work, we aim to reproduce\nmulti-stage retrieval pipelines and explore one of the potential benefits of\ninvolving mixed-initiative interaction in conversational passage retrieval\nscenarios: reformulating raw queries. Before the first ranking stage of a\nmulti-stage retrieval pipeline, we propose a mixed-initiative query\nreformulation module, which achieves query reformulation based on the\nmixed-initiative interaction between the users and the system, as the\nreplacement for the neural reformulation method. Specifically, we design an\nalgorithm to generate appropriate questions related to the ambiguities in raw\nqueries, and another algorithm to reformulate raw queries by parsing users'\nfeedback and incorporating it into the raw query. For the first ranking stage\nof our multi-stage pipelines, we adopt a sparse ranking function: BM25, and a\ndense retrieval method: TCT-ColBERT. For the second-ranking step, we adopt a\npointwise reranker: MonoT5, and a pairwise reranker: DuoT5. Experiments on both\nTREC CAsT 2021 and TREC CAsT 2022 datasets show the effectiveness of our\nmixed-initiative-based query reformulation method on improving retrieval\nperformance compared with two popular reformulators: a neural reformulator:\nCANARD-T5 and a rule-based reformulator: historical query reformulator(HQE).\n","authors":["Dayu Yang","Yue Zhang","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08803v1.pdf","comment":"The Thirty-First Text REtrieval Conference (TREC 2022) Proceedings"},{"id":"http://arxiv.org/abs/2307.08760v1","updated":"2023-07-17T18:07:26Z","published":"2023-07-17T18:07:26Z","title":"Imposing Consistency Properties on Blackbox Systems with Applications to\n SVD-Based Recommender Systems","summary":" In this paper we discuss pre- and post-processing methods to induce desired\nconsistency and/or invariance properties in blackbox systems, e.g., AI-based.\nWe demonstrate our approach in the context of blackbox SVD-based\nmatrix-completion methods commonly used in recommender system (RS)\napplications. 
We provide empirical results showing that enforcement of\nunit-consistency and shift-consistency, which have provable RS-relevant\nproperties relating to robustness and fairness, also lead to improved\nperformance according to generic RMSE and MAE performance metrics, irrespective\nof the initial chosen hyperparameter.\n","authors":["Tung Nguyen","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2307.08760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10247v1","updated":"2023-07-17T07:04:31Z","published":"2023-07-17T07:04:31Z","title":"Automated Action Model Acquisition from Narrative Texts","summary":" Action models, which take the form of precondition/effect axioms, facilitate\ncausal and motivational connections between actions for AI agents. Action model\nacquisition has been identified as a bottleneck in the application of planning\ntechnology, especially within narrative planning. Acquiring action models from\nnarrative texts in an automated way is essential, but challenging because of\nthe inherent complexities of such texts. We present NaRuto, a system that\nextracts structured events from narrative text and subsequently generates\nplanning-language-style action models based on predictions of commonsense event\nrelations, as well as textual contradictions and similarities, in an\nunsupervised manner. Experimental results in classical narrative planning\ndomains show that NaRuto can generate action models of significantly better\nquality than existing fully automated methods, and even on par with those of\nsemi-automated methods.\n","authors":["Ruiqi Li","Leyang Cui","Songtuan Lin","Patrik Haslum"],"pdf_url":"https://arxiv.org/pdf/2307.10247v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.10244v1","updated":"2023-07-17T05:35:30Z","published":"2023-07-17T05:35:30Z","title":"Evaluating and Enhancing Robustness of Deep Recommendation Systems\n Against Hardware Errors","summary":" Deep recommendation systems (DRS) heavily depend on specialized HPC hardware\nand accelerators to optimize energy, efficiency, and recommendation quality.\nDespite the growing number of hardware errors observed in large-scale fleet\nsystems where DRS are deployed, the robustness of DRS has been largely\noverlooked. This paper presents the first systematic study of DRS robustness\nagainst hardware errors. We develop Terrorch, a user-friendly, efficient and\nflexible error injection framework on top of the widely-used PyTorch. We\nevaluate a wide range of models and datasets and observe that the DRS\nrobustness against hardware errors is influenced by various factors from model\nparameters to input characteristics. We also explore 3 error mitigation methods\nincluding algorithm based fault tolerance (ABFT), activation clipping and\nselective bit protection (SBP). We find that applying activation clipping can\nrecover up to 30% of the degraded AUC-ROC score, making it a promising\nmitigation method.\n","authors":["Dongning Ma","Xun Jiao","Fred Lin","Mengshi Zhang","Alban Desmaison","Thomas Sellinger","Daniel Moore","Sriram Sankar"],"pdf_url":"https://arxiv.org/pdf/2307.10244v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.08698v1","updated":"2023-07-17T17:57:56Z","published":"2023-07-17T17:57:56Z","title":"Flow Matching in Latent Space","summary":" Flow matching is a recent framework to train generative models that exhibits\nimpressive empirical performance while being relatively easier to train\ncompared with diffusion-based models. 
Despite its advantageous properties,\nprior methods still face the challenges of expensive computing and a large\nnumber of function evaluations of off-the-shelf solvers in the pixel space.\nFurthermore, although latent-based generative methods have shown great success\nin recent years, this particular model type remains underexplored in this area.\nIn this work, we propose to apply flow matching in the latent spaces of\npretrained autoencoders, which offers improved computational efficiency and\nscalability for high-resolution image synthesis. This enables flow-matching\ntraining on constrained computational resources while maintaining their quality\nand flexibility. Additionally, our work stands as a pioneering contribution in\nthe integration of various conditions into flow matching for conditional\ngeneration tasks, including label-conditioned image generation, image\ninpainting, and semantic-to-image generation. Through extensive experiments,\nour approach demonstrates its effectiveness in both quantitative and\nqualitative results on various datasets, such as CelebA-HQ, FFHQ, LSUN Church &\nBedroom, and ImageNet. We also provide a theoretical control of the\nWasserstein-2 distance between the reconstructed latent flow distribution and\ntrue data distribution, showing it is upper-bounded by the latent flow matching\nobjective. Our code will be available at\nhttps://github.com/VinAIResearch/LFM.git.\n","authors":["Quan Dao","Hao Phung","Binh Nguyen","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2307.08698v1.pdf","comment":"Project Page: https://vinairesearch.github.io/LFM/"},{"id":"http://arxiv.org/abs/2307.08692v1","updated":"2023-07-17T17:52:57Z","published":"2023-07-17T17:52:57Z","title":"A Multiobjective Reinforcement Learning Framework for Microgrid Energy\n Management","summary":" The emergence of microgrids (MGs) has provided a promising solution for\ndecarbonizing and decentralizing the power grid, mitigating the challenges\nposed by climate change. However, MG operations often involve considering\nmultiple objectives that represent the interests of different stakeholders,\nleading to potentially complex conflicts. To tackle this issue, we propose a\nnovel multi-objective reinforcement learning framework that explores the\nhigh-dimensional objective space and uncovers the tradeoffs between conflicting\nobjectives. This framework leverages exogenous information and capitalizes on\nthe data-driven nature of reinforcement learning, enabling the training of a\nparametric policy without the need for long-term forecasts or knowledge of the\nunderlying uncertainty distribution. The trained policies exhibit diverse,\nadaptive, and coordinative behaviors with the added benefit of providing\ninterpretable insights on the dynamics of their information use. We employ this\nframework on the Cornell University MG (CU-MG), which is a combined heat and\npower MG, to evaluate its effectiveness. The results demonstrate performance\nimprovements in all objectives considered compared to the status quo operations\nand offer more flexibility in navigating complex operational tradeoffs.\n","authors":["M. Vivienne Liu","Patrick M. Reed","David Gold","Garret Quist","C. Lindsay Anderson"],"pdf_url":"https://arxiv.org/pdf/2307.08692v1.pdf","comment":"This work will be submitted to the IEEE Transactions on Smart Grid\n for possible publication. 
Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2307.08691v1","updated":"2023-07-17T17:50:36Z","published":"2023-07-17T17:50:36Z","title":"FlashAttention-2: Faster Attention with Better Parallelism and Work\n Partitioning","summary":" Scaling Transformers to longer sequence lengths has been a major problem in\nthe last several years, promising to improve performance in language modeling\nand high-resolution image understanding, as well as to unlock new applications\nin code, audio, and video generation. The attention layer is the main\nbottleneck in scaling to longer sequences, as its runtime and memory increase\nquadratically in the sequence length. FlashAttention exploits the asymmetric\nGPU memory hierarchy to bring significant memory saving (linear instead of\nquadratic) and runtime speedup (2-4$\\times$ compared to optimized baselines),\nwith no approximation. However, FlashAttention is still not nearly as fast as\noptimized matrix-multiply (GEMM) operations, reaching only 25-40\\% of the\ntheoretical maximum FLOPs/s. We observe that the inefficiency is due to\nsuboptimal work partitioning between different thread blocks and warps on the\nGPU, causing either low-occupancy or unnecessary shared memory reads/writes. We\npropose FlashAttention-2, with better work partitioning to address these\nissues. In particular, we (1) tweak the algorithm to reduce the number of\nnon-matmul FLOPs (2) parallelize the attention computation, even for a single\nhead, across different thread blocks to increase occupancy, and (3) within each\nthread block, distribute the work between warps to reduce communication through\nshared memory. These yield around 2$\\times$ speedup compared to FlashAttention,\nreaching 50-73\\% of the theoretical maximum FLOPs/s on A100 and getting close\nto the efficiency of GEMM operations. We empirically validate that when used\nend-to-end to train GPT-style models, FlashAttention-2 reaches training speed\nof up to 225 TFLOPs/s per A100 GPU (72\\% model FLOPs utilization).\n","authors":["Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2307.08691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08689v1","updated":"2023-07-17T17:48:51Z","published":"2023-07-17T17:48:51Z","title":"COLLIE: Systematic Construction of Constrained Text Generation Tasks","summary":" Text generation under constraints have seen increasing interests in natural\nlanguage processing, especially with the rapidly improving capabilities of\nlarge language models. However, existing benchmarks for constrained generation\nusually focus on fixed constraint types (e.g.,generate a sentence containing\ncertain words) that have proved to be easy for state-of-the-art models like\nGPT-4. We present COLLIE, a grammar-based framework that allows the\nspecification of rich, compositional constraints with diverse generation levels\n(word, sentence, paragraph, passage) and modeling challenges (e.g.,language\nunderstanding, logical reasoning, counting, semantic planning). We also develop\ntools for automatic extraction of task instances given a constraint structure\nand a raw text corpus. Using COLLIE, we compile the COLLIE-v1 dataset with 2080\ninstances comprising 13 constraint structures. We perform systematic\nexperiments across five state-of-the-art instruction-tuned language models and\nanalyze their performances to reveal shortcomings. 
COLLIE is designed to be\nextensible and lightweight, and we hope the community finds it useful to\ndevelop more complex constraints and evaluations in the future.\n","authors":["Shunyu Yao","Howard Chen","Austin W. Hanjie","Runzhe Yang","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2307.08689v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2307.08686v1","updated":"2023-07-17T17:47:50Z","published":"2023-07-17T17:47:50Z","title":"An R package for parametric estimation of causal effects","summary":" This article explains the usage of R package CausalModels, which is publicly\navailable on the Comprehensive R Archive Network. While packages are available\nfor sufficiently estimating causal effects, there is no package that provides\na collection of structural models using the conventional statistical approach\ndeveloped by Hern\\'an and Robins (2020). CausalModels addresses this deficiency\nof software in R concerning causal inference by offering tools for methods that\naccount for biases in observational data without requiring extensive\nstatistical knowledge. These methods should not be ignored and may be more\nappropriate or efficient in solving particular problems. While implementations\nof these statistical models are distributed among a number of causal packages,\nCausalModels introduces a simple and accessible framework for a consistent\nmodeling pipeline among a variety of statistical methods for estimating causal\neffects in a single R package. It consists of common methods including\nstandardization, IP weighting, G-estimation, outcome regression, instrumental\nvariables and propensity matching.\n","authors":["Joshua Wolff Anderson","Cyril Rakovsk"],"pdf_url":"https://arxiv.org/pdf/2307.08686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08684v1","updated":"2023-07-17T17:46:08Z","published":"2023-07-17T17:46:08Z","title":"A Rubik's Cube inspired approach to Clifford synthesis","summary":" The problem of decomposing an arbitrary Clifford element into a sequence of\nClifford gates is known as Clifford synthesis. Drawing inspiration from\nsimilarities between this and the famous Rubik's Cube problem, we develop a\nmachine learning approach for Clifford synthesis based on learning an\napproximation to the distance to the identity. This approach is probabilistic\nand computationally intensive. However, when a decomposition is successfully\nfound, it often involves fewer gates than existing synthesis algorithms.\nAdditionally, our approach is much more flexible than existing algorithms in\nthat arbitrary gate sets, device topologies, and gate fidelities may be\nincorporated, thus allowing for the approach to be tailored to a specific\ndevice.\n","authors":["Ning Bao","Gavin S. Hartnett"],"pdf_url":"https://arxiv.org/pdf/2307.08684v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.08678v1","updated":"2023-07-17T17:41:47Z","published":"2023-07-17T17:41:47Z","title":"Do Models Explain Themselves? Counterfactual Simulatability of Natural\n Language Explanations","summary":" Large language models (LLMs) are trained to imitate humans to explain human\ndecisions. However, do LLMs explain themselves? Can they help humans build\nmental models of how LLMs process different inputs? 
To answer these questions,\nwe propose to evaluate $\\textbf{counterfactual simulatability}$ of natural\nlanguage explanations: whether an explanation can enable humans to precisely\ninfer the model's outputs on diverse counterfactuals of the explained input.\nFor example, if a model answers \"yes\" to the input question \"Can eagles fly?\"\nwith the explanation \"all birds can fly\", then humans would infer from the\nexplanation that it would also answer \"yes\" to the counterfactual input \"Can\npenguins fly?\". If the explanation is precise, then the model's answer should\nmatch humans' expectations.\n We implemented two metrics based on counterfactual simulatability: precision\nand generality. We generated diverse counterfactuals automatically using LLMs.\nWe then used these metrics to evaluate state-of-the-art LLMs (e.g., GPT-4) on\ntwo tasks: multi-hop factual reasoning and reward modeling. We found that LLM's\nexplanations have low precision and that precision does not correlate with\nplausibility. Therefore, naively optimizing human approvals (e.g., RLHF) may\nnot be a sufficient solution.\n","authors":["Yanda Chen","Ruiqi Zhong","Narutatsu Ri","Chen Zhao","He He","Jacob Steinhardt","Zhou Yu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2307.08678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08674v1","updated":"2023-07-17T17:36:09Z","published":"2023-07-17T17:36:09Z","title":"TableGPT: Towards Unifying Tables, Nature Language and Commands into One\n GPT","summary":" Tables are prevalent in real-world databases, requiring significant time and\neffort for humans to analyze and manipulate. The advancements in large language\nmodels (LLMs) have made it possible to interact with tables using natural\nlanguage input, bringing this capability closer to reality. In this paper, we\npresent TableGPT, a unified fine-tuned framework that enables LLMs to\nunderstand and operate on tables using external functional commands. It\nintroduces the capability to seamlessly interact with tables, enabling a wide\nrange of functionalities such as question answering, data manipulation (e.g.,\ninsert, delete, query, and modify operations), data visualization, analysis\nreport generation, and automated prediction. TableGPT aims to provide\nconvenience and accessibility to users by empowering them to effortlessly\nleverage tabular data. At the core of TableGPT lies the novel concept of global\ntabular representations, which empowers LLMs to gain a comprehensive\nunderstanding of the entire table beyond meta-information. By jointly training\nLLMs on both table and text modalities, TableGPT achieves a deep understanding\nof tabular data and the ability to perform complex operations on tables through\nchain-of-command instructions. 
Importantly, TableGPT offers the advantage of\nbeing a self-contained system rather than relying on external API interfaces.\nMoreover, it supports efficient data process flow, query rejection (when\nappropriate) and private deployment, enabling faster domain data fine-tuning\nand ensuring data privacy, which enhances the framework's adaptability to\nspecific use cases.\n","authors":["Liangyu Zha","Junlin Zhou","Liyao Li","Rui Wang","Qingyi Huang","Saisai Yang","Jing Yuan","Changbao Su","Xiang Li","Aofeng Su","Tao Zhang","Chen Zhou","Kaizhe Shou","Miao Wang","Wufang Zhu","Guoshan Lu","Chao Ye","Yali Ye","Wentao Ye","Yiming Zhang","Xinglong Deng","Jie Xu","Haobo Wang","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.08674v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.08673v1","updated":"2023-07-17T17:34:32Z","published":"2023-07-17T17:34:32Z","title":"CohortFinder: an open-source tool for data-driven partitioning of\n biomedical image cohorts to yield robust machine learning models","summary":" Batch effects (BEs) refer to systematic technical differences in data\ncollection unrelated to biological variations whose noise is shown to\nnegatively impact machine learning (ML) model generalizability. Here we release\nCohortFinder, an open-source tool aimed at mitigating BEs via data-driven\ncohort partitioning. We demonstrate CohortFinder improves ML model performance\nin downstream medical image processing tasks. CohortFinder is freely available\nfor download at cohortfinder.com.\n","authors":["Fan Fan","Georgia Martinez","Thomas Desilvio","John Shin","Yijiang Chen","Bangchen Wang","Takaya Ozeki","Maxime W. Lafarge","Viktor H. Koelzer","Laura Barisoni","Anant Madabhushi","Satish E. Viswanath","Andrew Janowczyk"],"pdf_url":"https://arxiv.org/pdf/2307.08673v1.pdf","comment":"26 pages, 9 figures, 4 tables. Abstract was accepted by European\n Society of Digital and Integrative Pathology (ESDIP), Germany, 2022"},{"id":"http://arxiv.org/abs/2307.08657v1","updated":"2023-07-17T17:14:17Z","published":"2023-07-17T17:14:17Z","title":"Neural Image Compression: Generalization, Robustness, and Spectral\n Biases","summary":" Recent neural image compression (NIC) advances have produced models which are\nstarting to outperform traditional codecs. While this has led to growing\nexcitement about using NIC in real-world applications, the successful adoption\nof any machine learning system in the wild requires it to generalize (and be\nrobust) to unseen distribution shifts at deployment. Unfortunately, current\nresearch lacks comprehensive datasets and informative tools to evaluate and\nunderstand NIC performance in real-world settings. To bridge this crucial gap,\nfirst, this paper presents a comprehensive benchmark suite to evaluate the\nout-of-distribution (OOD) performance of image compression methods.\nSpecifically, we provide CLIC-C and Kodak-C by introducing 15 corruptions to\npopular CLIC and Kodak benchmarks. Next, we propose spectrally inspired\ninspection tools to gain deeper insight into errors introduced by image\ncompression methods as well as their OOD performance. We then carry out a\ndetailed performance comparison of a classical codec with several NIC variants,\nrevealing intriguing findings that challenge our current understanding of the\nstrengths and limitations of NIC. 
Finally, we corroborate our empirical\nfindings with theoretical analysis, providing an in-depth view of the OOD\nperformance of NIC and its dependence on the spectral properties of the data.\nOur benchmarks, spectral inspection tools, and findings provide a crucial\nbridge to the real-world adoption of NIC. We hope that our work will propel\nfuture efforts in designing robust and generalizable NIC methods. Code and data\nwill be made available at https://github.com/klieberman/ood_nic.\n","authors":["Kelsey Lieberman","James Diffenderfer","Charles Godfrey","Bhavya Kailkhura"],"pdf_url":"https://arxiv.org/pdf/2307.08657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13253v2","updated":"2023-07-17T17:05:23Z","published":"2023-06-23T00:44:26Z","title":"Predicting Grokking Long Before it Happens: A look into the loss\n landscape of models which grok","summary":" This paper focuses on predicting the occurrence of grokking in neural\nnetworks, a phenomenon in which perfect generalization emerges long after signs\nof overfitting or memorization are observed. It has been reported that grokking\ncan only be observed with certain hyper-parameters. This makes it critical to\nidentify the parameters that lead to grokking. However, since grokking occurs\nafter a large number of epochs, searching for the hyper-parameters that lead to\nit is time-consuming. In this paper, we propose a low-cost method to predict\ngrokking without training for a large number of epochs. In essence, by studying\nthe learning curve of the first few epochs, we show that one can predict\nwhether grokking will occur later on. Specifically, if certain oscillations\noccur in the early epochs, one can expect grokking to occur if the model is\ntrained for a much longer period of time. We propose using the spectral\nsignature of a learning curve derived by applying the Fourier transform to\nquantify the amplitude of low-frequency components to detect the presence of\nsuch oscillations. We also present additional experiments aimed at explaining\nthe cause of these oscillations and characterizing the loss landscape.\n","authors":["Pascal Jr. Tikeng Notsawo","Hattie Zhou","Mohammad Pezeshki","Irina Rish","Guillaume Dumas"],"pdf_url":"https://arxiv.org/pdf/2306.13253v2.pdf","comment":"26 pages, 31 figures"},{"id":"http://arxiv.org/abs/2307.08643v1","updated":"2023-07-17T16:57:01Z","published":"2023-07-17T16:57:01Z","title":"A General Framework for Learning under Corruption: Label Noise,\n Attribute Noise, and Beyond","summary":" Corruption is frequently observed in collected data and has been extensively\nstudied in machine learning under different corruption models. Despite this,\nthere remains a limited understanding of how these models relate such that a\nunified view of corruptions and their consequences on learning is still\nlacking. In this work, we formally analyze corruption models at the\ndistribution level through a general, exhaustive framework based on Markov\nkernels. We highlight the existence of intricate joint and dependent\ncorruptions on both labels and attributes, which are rarely touched by existing\nresearch. Further, we show how these corruptions affect standard supervised\nlearning by analyzing the resulting changes in Bayes Risk. 
Our findings offer\nqualitative insights into the consequences of \"more complex\" corruptions on the\nlearning problem, and provide a foundation for future quantitative comparisons.\nApplications of the framework include corruption-corrected learning, a subcase\nof which we study in this paper by theoretically analyzing loss correction with\nrespect to different corruption instances.\n","authors":["Laura Iacovissi","Nan Lu","Robert C. Williamson"],"pdf_url":"https://arxiv.org/pdf/2307.08643v1.pdf","comment":"42 pages"},{"id":"http://arxiv.org/abs/2307.08637v1","updated":"2023-07-17T16:53:22Z","published":"2023-07-17T16:53:22Z","title":"LearnedSort as a learning-augmented SampleSort: Analysis and\n Parallelization","summary":" This work analyzes and parallelizes LearnedSort, the novel algorithm that\nsorts using machine learning models based on the cumulative distribution\nfunction. LearnedSort is analyzed under the lens of algorithms with\npredictions, and it is argued that LearnedSort is a learning-augmented\nSampleSort. A parallel LearnedSort algorithm is developed combining LearnedSort\nwith the state-of-the-art SampleSort implementation, IPS4o. Benchmarks on\nsynthetic and real-world datasets demonstrate improved parallel performance for\nparallel LearnedSort compared to IPS4o and other sorting algorithms.\n","authors":["Ivan Carvalho","Ramon Lawrence"],"pdf_url":"https://arxiv.org/pdf/2307.08637v1.pdf","comment":"Published in SSDBM 2023"},{"id":"http://arxiv.org/abs/2307.08621v1","updated":"2023-07-17T16:40:01Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08617v1","updated":"2023-07-17T16:32:49Z","published":"2023-07-17T16:32:49Z","title":"Understanding the impacts of crop diversification in the context of\n climate change: a machine learning approach","summary":" The concept of sustainable intensification in agriculture necessitates the\nimplementation of management practices that prioritize sustainability without\ncompromising productivity. 
However, the effects of such practices are known to\ndepend on environmental conditions, and are therefore expected to change as a\nresult of a changing climate. We study the impact of crop diversification on\nproductivity in the context of climate change. We leverage heterogeneous Earth\nObservation data and contribute a data-driven approach based on causal machine\nlearning for understanding how crop diversification impacts may change in the\nfuture. We apply this method to the country of Cyprus throughout a 4-year\nperiod. We find that, on average, crop diversification significantly benefited\nthe net primary productivity of crops, increasing it by 2.8%. The effect\ngenerally synergized well with higher maximum temperatures and lower soil\nmoistures. In a warmer and more drought-prone climate, we conclude that crop\ndiversification exhibits promising adaptation potential and is thus a sensible\npolicy choice with regards to agricultural productivity for present and future.\n","authors":["Georgios Giannarakis","Ilias Tsoumas","Stelios Neophytides","Christiana Papoutsa","Charalampos Kontoes","Diofantos Hadjimitsis"],"pdf_url":"https://arxiv.org/pdf/2307.08617v1.pdf","comment":"Accepted for oral presentation at ISPRS Geospatial Week 2023"},{"id":"http://arxiv.org/abs/2307.08616v1","updated":"2023-07-17T16:30:56Z","published":"2023-07-17T16:30:56Z","title":"Temporal and Geographical Analysis of Real Economic Activities in the\n Bitcoin Blockchain","summary":" We study the real economic activity in the Bitcoin blockchain that involves\ntransactions from/to retail users rather than between organizations such as\nmarketplaces, exchanges, or other services. We first introduce a heuristic\nmethod to classify Bitcoin players into three main categories: Frequent\nReceivers (FR), Neighbors of FR, and Others. We show that most real\ntransactions involve Frequent Receivers, representing a small fraction of the\ntotal value exchanged according to the blockchain, but a significant fraction\nof all payments, raising concerns about the centralization of the Bitcoin\necosystem. We also conduct a weekly pattern analysis of activity, providing\ninsights into the geographical location of Bitcoin users and allowing us to\nquantify the bias of a well-known dataset for actor identification.\n","authors":["Rafael Ramos Tubino","Remy Cazabet","Natkamon Tovanich","Celine Robardet"],"pdf_url":"https://arxiv.org/pdf/2307.08616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13192v2","updated":"2023-07-17T16:27:52Z","published":"2023-01-30T18:54:54Z","title":"Robust empirical risk minimization via Newton's method","summary":" A new variant of Newton's method for empirical risk minimization is studied,\nwhere at each iteration of the optimization algorithm, the gradient and Hessian\nof the objective function are replaced by robust estimators taken from existing\nliterature on robust mean estimation for multivariate data. After proving a\ngeneral theorem about the convergence of successive iterates to a small ball\naround the population-level minimizer, consequences of the theory in\ngeneralized linear models are studied when data are generated from Huber's\nepsilon-contamination model and/or heavytailed distributions. 
An algorithm for\nobtaining robust Newton directions based on the conjugate gradient method is\nalso proposed, which may be more appropriate for high-dimensional settings, and\nconjectures about the convergence of the resulting algorithm are offered.\nCompared to robust gradient descent, the proposed algorithm enjoys the faster\nrates of convergence for successive iterates often achieved by second-order\nalgorithms for convex problems, i.e., quadratic convergence in a neighborhood\nof the optimum, with a stepsize that may be chosen adaptively via backtracking\nlinesearch.\n","authors":["Eirini Ioannou","Muni Sreenivas Pydi","Po-Ling Loh"],"pdf_url":"https://arxiv.org/pdf/2301.13192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10773v3","updated":"2023-07-17T16:13:03Z","published":"2022-11-19T19:06:36Z","title":"A Two-Stage Active Learning Algorithm for $k$-Nearest Neighbors","summary":" $k$-nearest neighbor classification is a popular non-parametric method\nbecause of desirable properties like automatic adaption to distributional scale\nchanges. Unfortunately, it has thus far proved difficult to design active\nlearning strategies for the training of local voting-based classifiers that\nnaturally retain these desirable properties, and hence active learning\nstrategies for $k$-nearest neighbor classification have been conspicuously\nmissing from the literature. In this work, we introduce a simple and intuitive\nactive learning algorithm for the training of $k$-nearest neighbor classifiers,\nthe first in the literature which retains the concept of the $k$-nearest\nneighbor vote at prediction time. We provide consistency guarantees for a\nmodified $k$-nearest neighbors classifier trained on samples acquired via our\nscheme, and show that when the conditional probability function\n$\\mathbb{P}(Y=y|X=x)$ is sufficiently smooth and the Tsybakov noise condition\nholds, our actively trained classifiers converge to the Bayes optimal\nclassifier at a faster asymptotic rate than passively trained $k$-nearest\nneighbor classifiers.\n","authors":["Nick Rittler","Kamalika Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2211.10773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03403v4","updated":"2023-07-17T16:12:20Z","published":"2023-05-05T09:58:40Z","title":"LLMs for Semi-Automated Data Science: Introducing CAAFE for\n Context-Aware Automated Feature Engineering","summary":" As the field of automated machine learning (AutoML) advances, it becomes\nincreasingly important to incorporate domain knowledge into these systems. We\npresent an approach for doing so by harnessing the power of large language\nmodels (LLMs). Specifically, we introduce Context-Aware Automated Feature\nEngineering (CAAFE), a feature engineering method for tabular datasets that\nutilizes an LLM to iteratively generate additional semantically meaningful\nfeatures for tabular datasets based on the description of the dataset. The\nmethod produces both Python code for creating new features and explanations for\nthe utility of the generated features.\n Despite being methodologically simple, CAAFE improves performance on 11 out\nof 14 datasets - boosting mean ROC AUC performance from 0.798 to 0.822 across\nall dataset - similar to the improvement achieved by using a random forest\ninstead of logistic regression on our datasets.\n Furthermore, CAAFE is interpretable by providing a textual explanation for\neach generated feature. 
CAAFE paves the way for more extensive semi-automation\nin data science tasks and emphasizes the significance of context-aware\nsolutions that can extend the scope of AutoML systems to semantic AutoML. We\nrelease our $\\href{https://github.com/automl/CAAFE}{code}$, a simple\n$\\href{https://colab.research.google.com/drive/1mCA8xOAJZ4MaB_alZvyARTMjhl6RZf0a}{demo}$\nand a $\\href{https://pypi.org/project/caafe/}{python\\ package}$.\n","authors":["Noah Hollmann","Samuel Müller","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2305.03403v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08593v1","updated":"2023-07-17T16:03:35Z","published":"2023-07-17T16:03:35Z","title":"Artificial Intelligence for the Electron Ion Collider (AI4EIC)","summary":" The Electron-Ion Collider (EIC), a state-of-the-art facility for studying the\nstrong force, is expected to begin commissioning its first experiments in 2028.\nThis is an opportune time for artificial intelligence (AI) to be included from\nthe start at this facility and in all phases that lead up to the experiments.\nThe second annual workshop organized by the AI4EIC working group, which\nrecently took place, centered on exploring all current and prospective\napplication areas of AI for the EIC. This workshop is not only beneficial for\nthe EIC, but also provides valuable insights for the newly established ePIC\ncollaboration at EIC. This paper summarizes the different activities and R&D\nprojects covered across the sessions of the workshop and provides an overview\nof the goals, approaches and strategies regarding AI/ML in the EIC community,\nas well as cutting-edge techniques currently studied in other experiments.\n","authors":["C. Allaire","R. Ammendola","E. -C. Aschenauer","M. Balandat","M. Battaglieri","J. Bernauer","M. Bondì","N. Branson","T. Britton","A. Butter","I. Chahrour","P. Chatagnon","E. Cisbani","E. W. Cline","S. Dash","C. Dean","W. Deconinck","A. Deshpande","M. Diefenthaler","R. Ent","C. Fanelli","M. Finger","M. Finger, Jr.","E. Fol","S. Furletov","Y. Gao","J. Giroux","N. C. Gunawardhana Waduge","R. Harish","O. Hassan","P. L. Hegde","R. J. Hernández-Pinto","A. Hiller Blin","T. Horn","J. Huang","D. Jayakodige","B. Joo","M. Junaid","P. Karande","B. Kriesten","R. Kunnawalkam Elayavalli","M. Lin","F. Liu","S. Liuti","G. Matousek","M. McEneaney","D. McSpadden","T. Menzo","T. Miceli","V. Mikuni","R. Montgomery","B. Nachman","R. R. Nair","J. Niestroy","S. A. Ochoa Oregon","J. Oleniacz","J. D. Osborn","C. Paudel","C. Pecar","C. Peng","G. N. Perdue","W. Phelps","M. L. Purschke","K. Rajput","Y. Ren","D. F. Renteria-Estrada","D. Richford","B. J. Roy","D. Roy","N. Sato","T. Satogata","G. Sborlini","M. Schram","D. Shih","J. Singh","R. Singh","A. Siodmok","P. Stone","J. Stevens","L. Suarez","K. Suresh","A. -N. Tawfik","F. Torales Acosta","N. Tran","R. Trotta","F. J. Twagirayezu","R. Tyson","S. Volkova","A. Vossen","E. Walter","D. Whiteson","M. Williams","S. Wu","N. Zachariou","P. Zurita"],"pdf_url":"https://arxiv.org/pdf/2307.08593v1.pdf","comment":"27 pages, 11 figures, AI4EIC workshop, tutorials and hackathon"},{"id":"http://arxiv.org/abs/2307.06324v3","updated":"2023-07-17T16:03:26Z","published":"2023-07-12T17:41:07Z","title":"Provably Faster Gradient Descent via Long Steps","summary":" This work establishes provably faster convergence rates for gradient descent\nvia a computer-assisted analysis technique. 
Our theory allows nonconstant\nstepsize policies with frequent long steps potentially violating descent by\nanalyzing the overall effect of many iterations at once rather than the typical\none-iteration inductions used in most first-order method analyses. We show that\nlong steps, which may increase the objective value in the short term, lead to\nprovably faster convergence in the long term. A conjecture towards proving a\nfaster $O(1/T\\log T)$ rate for gradient descent is also motivated along with\nsimple numerical validation.\n","authors":["Benjamin Grimmer"],"pdf_url":"https://arxiv.org/pdf/2307.06324v3.pdf","comment":"14pages plus references and appendix. Recent updates added more\n references"},{"id":"http://arxiv.org/abs/2307.08591v1","updated":"2023-07-17T16:01:22Z","published":"2023-07-17T16:01:22Z","title":"Snapshot Spectral Clustering -- a costless approach to deep clustering\n ensembles generation","summary":" Despite tremendous advancements in Artificial Intelligence, learning from\nlarge sets of data in an unsupervised manner remains a significant challenge.\nClassical clustering algorithms often fail to discover complex dependencies in\nlarge datasets, especially considering sparse, high-dimensional spaces.\nHowever, deep learning techniques proved to be successful when dealing with\nlarge quantities of data, efficiently reducing their dimensionality without\nlosing track of underlying information. Several interesting advancements have\nalready been made to combine deep learning and clustering. Still, the idea of\nenhancing the clustering results by combining multiple views of the data\ngenerated by deep neural networks appears to be insufficiently explored yet.\nThis paper aims to investigate this direction and bridge the gap between deep\nneural networks, clustering techniques and ensemble learning methods. To\nachieve this goal, we propose a novel deep clustering ensemble method -\nSnapshot Spectral Clustering, designed to maximize the gain from combining\nmultiple data views while minimizing the computational costs of creating the\nensemble. Comparative analysis and experiments described in this paper prove\nthe proposed concept, while the conducted hyperparameter study provides a\nvaluable intuition to follow when selecting proper values.\n","authors":["Adam Piróg","Halina Kwaśnicka"],"pdf_url":"https://arxiv.org/pdf/2307.08591v1.pdf","comment":"In proceedings of the International Joint Conference on Neural\n Networks 2023"},{"id":"http://arxiv.org/abs/2212.07158v2","updated":"2023-07-17T15:45:19Z","published":"2022-12-14T11:20:24Z","title":"Establishing a stronger baseline for lightweight contrastive models","summary":" Recent research has reported a performance degradation in self-supervised\ncontrastive learning for specially designed efficient networks, such as\nMobileNet and EfficientNet. A common practice to address this problem is to\nintroduce a pretrained contrastive teacher model and train the lightweight\nnetworks with distillation signals generated by the teacher. However, it is\ntime and resource consuming to pretrain a teacher model when it is not\navailable. In this work, we aim to establish a stronger baseline for\nlightweight contrastive models without using a pretrained teacher model.\nSpecifically, we show that the optimal recipe for efficient models is different\nfrom that of larger models, and using the same training settings as ResNet50,\nas previous research does, is inappropriate. 
Additionally, we observe a common\nissue in contrastive learning where either the positive or negative views can\nbe noisy, and propose a smoothed version of InfoNCE loss to alleviate this\nproblem. As a result, we successfully improve the linear evaluation results\nfrom 36.3\\% to 62.3\\% for MobileNet-V3-Large and from 42.2\\% to 65.8\\% for\nEfficientNet-B0 on ImageNet, closing the accuracy gap to ResNet50 with\n$5\\times$ fewer parameters. We hope our research will facilitate the usage of\nlightweight contrastive models.\n","authors":["Wenye Lin","Yifeng Ding","Zhixiong Cao","Hai-tao Zheng"],"pdf_url":"https://arxiv.org/pdf/2212.07158v2.pdf","comment":"ICME 2023 oral"},{"id":"http://arxiv.org/abs/2307.08576v1","updated":"2023-07-17T15:44:13Z","published":"2023-07-17T15:44:13Z","title":"A Study on the Performance of Generative Pre-trained Transformer (GPT)\n in Simulating Depressed Individuals on the Standardized Depressive Symptom\n Scale","summary":" Background: Depression is a common mental disorder with societal and economic\nburden. Current diagnosis relies on self-reports and assessment scales, which\nhave reliability issues. Objective approaches are needed for diagnosing\ndepression. Objective: Evaluate the potential of GPT technology in diagnosing\ndepression. Assess its ability to simulate individuals with depression and\ninvestigate the influence of depression scales. Methods: Three\ndepression-related assessment tools (HAMD-17, SDS, GDS-15) were used. Two\nexperiments simulated GPT responses to normal individuals and individuals with\ndepression. Compare GPT's responses with expected results, assess its\nunderstanding of depressive symptoms, and performance differences under\ndifferent conditions. Results: GPT's performance in depression assessment was\nevaluated. It aligned with scoring criteria for both individuals with\ndepression and normal individuals. Some performance differences were observed\nbased on depression severity. GPT performed better on scales with higher\nsensitivity. Conclusion: GPT accurately simulates individuals with depression\nand normal individuals during depression-related assessments. Deviations occur\nwhen simulating different degrees of depression, limiting understanding of mild\nand moderate cases. GPT performs better on scales with higher sensitivity,\nindicating potential for developing more effective depression scales. GPT has\nimportant potential in depression assessment, supporting clinicians and\npatients.\n","authors":["Sijin Cai","Nanfeng Zhang","Jiaying Zhu","Yanjie Liu","Yongjin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.08576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08574v1","updated":"2023-07-17T15:40:45Z","published":"2023-07-17T15:40:45Z","title":"FedCME: Client Matching and Classifier Exchanging to Handle Data\n Heterogeneity in Federated Learning","summary":" Data heterogeneity across clients is one of the key challenges in Federated\nLearning (FL), which may slow down the global model convergence and even weaken\nglobal model performance. Most existing approaches tackle the heterogeneity by\nconstraining local model updates through reference to global information\nprovided by the server. This can alleviate the performance degradation on the\naggregated global model. Different from existing methods, we focus on the\ninformation exchange between clients, which could also enhance the\neffectiveness of local training and lead to a high-performance global\nmodel. 
Concretely, we propose a novel FL framework named FedCME by client\nmatching and classifier exchanging. In FedCME, clients with large differences\nin data distribution will be matched in pairs, and then the corresponding pair\nof clients will exchange their classifiers at the stage of local training in an\nintermediate moment. Since the local data determines the local model training\ndirection, our method can correct update direction of classifiers and\neffectively alleviate local update divergence. Besides, we propose feature\nalignment to enhance the training of the feature extractor. Experimental\nresults demonstrate that FedCME performs better than FedAvg, FedProx, MOON and\nFedRS on popular federated learning benchmarks including FMNIST and CIFAR10, in\nthe case where data are heterogeneous.\n","authors":["Jun Nie","Danyang Xiao","Lei Yang","Weigang Wu"],"pdf_url":"https://arxiv.org/pdf/2307.08574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08572v1","updated":"2023-07-17T15:38:11Z","published":"2023-07-17T15:38:11Z","title":"Revisiting the Robustness of the Minimum Error Entropy Criterion: A\n Transfer Learning Case Study","summary":" Coping with distributional shifts is an important part of transfer learning\nmethods in order to perform well in real-life tasks. However, most of the\nexisting approaches in this area either focus on an ideal scenario in which the\ndata does not contain noises or employ a complicated training paradigm or model\ndesign to deal with distributional shifts. In this paper, we revisit the\nrobustness of the minimum error entropy (MEE) criterion, a widely used\nobjective in statistical signal processing to deal with non-Gaussian noises,\nand investigate its feasibility and usefulness in real-life transfer learning\nregression tasks, where distributional shifts are common. Specifically, we put\nforward a new theoretical result showing the robustness of MEE against\ncovariate shift. We also show that by simply replacing the mean squared error\n(MSE) loss with the MEE on basic transfer learning algorithms such as\nfine-tuning and linear probing, we can achieve competitive performance with\nrespect to state-of-the-art transfer learning algorithms. We justify our\narguments on both synthetic data and 5 real-world time-series data.\n","authors":["Luis Pedro Silvestrin","Shujian Yu","Mark Hoogendoorn"],"pdf_url":"https://arxiv.org/pdf/2307.08572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.02093v3","updated":"2023-07-17T15:16:56Z","published":"2022-07-05T14:55:16Z","title":"Predicting Out-of-Domain Generalization with Neighborhood Invariance","summary":" Developing and deploying machine learning models safely depends on the\nability to characterize and compare their abilities to generalize to new\nenvironments. Although recent work has proposed a variety of methods that can\ndirectly predict or theoretically bound the generalization capacity of a model,\nthey rely on strong assumptions such as matching train/test distributions and\naccess to model gradients. In order to characterize generalization when these\nassumptions are not satisfied, we propose neighborhood invariance, a measure of\na classifier's output invariance in a local transformation neighborhood.\nSpecifically, we sample a set of transformations and given an input test point,\ncalculate the invariance as the largest fraction of transformed points\nclassified into the same class. 
Crucially, our measure is simple to calculate,\ndoes not depend on the test point's true label, makes no assumptions about the\ndata distribution or model, and can be applied even in out-of-domain (OOD)\nsettings where existing methods cannot, requiring only selecting a set of\nappropriate data transformations. In experiments on robustness benchmarks in\nimage classification, sentiment analysis, and natural language inference, we\ndemonstrate a strong and robust correlation between our neighborhood invariance\nmeasure and actual OOD generalization on over 4,600 models evaluated on over\n100 unique train/test domain pairs.\n","authors":["Nathan Ng","Neha Hulkund","Kyunghyun Cho","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2207.02093v3.pdf","comment":"38 pages, 5 figures, 28 tables"},{"id":"http://arxiv.org/abs/2307.08558v1","updated":"2023-07-17T15:15:47Z","published":"2023-07-17T15:15:47Z","title":"Deep Learning with Passive Optical Nonlinear Mapping","summary":" Deep learning has fundamentally transformed artificial intelligence, but the\never-increasing complexity in deep learning models calls for specialized\nhardware accelerators. Optical accelerators can potentially offer enhanced\nperformance, scalability, and energy efficiency. However, achieving nonlinear\nmapping, a critical component of neural networks, remains challenging\noptically. Here, we introduce a design that leverages multiple scattering in a\nreverberating cavity to passively induce optical nonlinear random mapping,\nwithout the need for additional laser power. A key advantage emerging from our\nwork is that we show we can perform optical data compression, facilitated by\nmultiple scattering in the cavity, to efficiently compress and retain vital\ninformation while also decreasing data dimensionality. This allows rapid\noptical information processing and generation of low dimensional mixtures of\nhighly nonlinear features. These are particularly useful for applications\ndemanding high-speed analysis and responses such as in edge computing devices.\nUtilizing rapid optical information processing capabilities, our optical\nplatforms could potentially offer more efficient and real-time processing\nsolutions for a broad range of applications. We demonstrate the efficacy of our\ndesign in improving computational performance across tasks, including\nclassification, image reconstruction, key-point detection, and object\ndetection, all achieved through optical data compression combined with a\ndigital decoder. Notably, we observed high performance, at an extreme\ncompression ratio, for real-time pedestrian detection. Our findings pave the\nway for novel algorithms and architectural designs for optical computing.\n","authors":["Fei Xia","Kyungduk Kim","Yaniv Eliezer","Liam Shaughnessy","Sylvain Gigan","Hui Cao"],"pdf_url":"https://arxiv.org/pdf/2307.08558v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.08556v1","updated":"2023-07-17T15:15:26Z","published":"2023-07-17T15:15:26Z","title":"Machine-Learning-based Colorectal Tissue Classification via Acoustic\n Resolution Photoacoustic Microscopy","summary":" Colorectal cancer is a deadly disease that has become increasingly prevalent\nin recent years. Early detection is crucial for saving lives, but traditional\ndiagnostic methods such as colonoscopy and biopsy have limitations. Colonoscopy\ncannot provide detailed information within the tissues affected by cancer,\nwhile biopsy involves tissue removal, which can be painful and invasive. 
In\norder to improve diagnostic efficiency and reduce patient suffering, we studied a\nmachine-learning-based approach for colorectal tissue classification that uses\nacoustic resolution photoacoustic microscopy (ARPAM). With this tool, we were\nable to classify benign and malignant tissue using multiple machine learning\nmethods. Our results were analyzed both quantitatively and qualitatively to\nevaluate the effectiveness of our approach.\n","authors":["Shangqing Tong","Peng Ge","Yanan Jiao","Zhaofu Ma","Ziye Li","Longhai Liu","Feng Gao","Xiaohui Du","Fei Gao"],"pdf_url":"https://arxiv.org/pdf/2307.08556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07246v2","updated":"2023-07-17T15:02:26Z","published":"2023-07-14T09:38:22Z","title":"Knowledge Boosting: Rethinking Medical Contrastive Vision-Language\n Pre-Training","summary":" The foundation models based on pre-training technology have significantly\nadvanced artificial intelligence from theoretical to practical applications.\nThese models have facilitated the feasibility of computer-aided diagnosis for\nwidespread use. Medical contrastive vision-language pre-training, which does\nnot require human annotations, is an effective approach for guiding\nrepresentation learning using description information in diagnostic reports.\nHowever, the effectiveness of pre-training is limited by the large-scale\nsemantic overlap and shifting problems in medical field. To address these\nissues, we propose the Knowledge-Boosting Contrastive Vision-Language\nPre-training framework (KoBo), which integrates clinical knowledge into the\nlearning of vision-language semantic consistency. The framework uses an\nunbiased, open-set sample-wise knowledge representation to measure negative\nsample noise and supplement the correspondence between vision-language mutual\ninformation and clinical knowledge. Extensive experiments validate the effect\nof our framework on eight tasks including classification, segmentation,\nretrieval, and semantic relatedness, achieving comparable or better performance\nwith the zero-shot or few-shot settings. Our code is open on\nhttps://github.com/ChenXiaoFei-CS/KoBo.\n","authors":["Xiaofei Chen","Yuting He","Cheng Xue","Rongjun Ge","Shuo Li","Guanyu Yang"],"pdf_url":"https://arxiv.org/pdf/2307.07246v2.pdf","comment":"accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08535v1","updated":"2023-07-17T14:52:52Z","published":"2023-07-17T14:52:52Z","title":"Multi-class point cloud completion networks for 3D cardiac anatomy\n reconstruction from cine magnetic resonance images","summary":" Cine magnetic resonance imaging (MRI) is the current gold standard for the\nassessment of cardiac anatomy and function. However, it typically only acquires\na set of two-dimensional (2D) slices of the underlying three-dimensional (3D)\nanatomy of the heart, thus limiting the understanding and analysis of both\nhealthy and pathological cardiac morphology and physiology. In this paper, we\npropose a novel fully automatic surface reconstruction pipeline capable of\nreconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI\nacquisitions. Its key component is a multi-class point cloud completion network\n(PCCN) capable of correcting both the sparsity and misalignment issues of the\n3D reconstruction task in a unified model. 
We first evaluate the PCCN on a\nlarge synthetic dataset of biventricular anatomies and observe Chamfer\ndistances between reconstructed and gold standard anatomies below or similar to\nthe underlying image resolution for multiple levels of slice misalignment.\nFurthermore, we find a reduction in reconstruction error compared to a\nbenchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean\nsurface distance, respectively. We then apply the PCCN as part of our automated\nreconstruction pipeline to 1000 subjects from the UK Biobank study in a\ncross-domain transfer setting and demonstrate its ability to reconstruct\naccurate and topologically plausible biventricular heart meshes with clinical\nmetrics comparable to the previous literature. Finally, we investigate the\nrobustness of our proposed approach and observe its capacity to successfully\nhandle multiple common outlier conditions.\n","authors":["Marcel Beetz","Abhirup Banerjee","Julius Ossenberg-Engels","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08533v1","updated":"2023-07-17T14:49:06Z","published":"2023-07-17T14:49:06Z","title":"Nonlinear Processing with Linear Optics","summary":" Deep neural networks have achieved remarkable breakthroughs by leveraging\nmultiple layers of data processing to extract hidden representations, albeit at\nthe cost of large electronic computing power. To enhance energy efficiency and\nspeed, the optical implementation of neural networks aims to harness the\nadvantages of optical bandwidth and the energy efficiency of optical\ninterconnections. In the absence of low-power optical nonlinearities, the\nchallenge in the implementation of multilayer optical networks lies in\nrealizing multiple optical layers without resorting to electronic components.\nIn this study, we present a novel framework that uses multiple scattering that\nis capable of synthesizing programmable linear and nonlinear transformations\nconcurrently at low optical power by leveraging the nonlinear relationship\nbetween the scattering potential, represented by data, and the scattered field.\nTheoretical and experimental investigations show that repeating the data by\nmultiple scattering enables non-linear optical computing at low power\ncontinuous wave light.\n","authors":["Mustafa Yildirim","Niyazi Ulas Dinc","Ilker Oguz","Demetri Psaltis","Christophe Moser"],"pdf_url":"https://arxiv.org/pdf/2307.08533v1.pdf","comment":"20 pages, 9 figures and 1 table"},{"id":"http://arxiv.org/abs/2307.08532v1","updated":"2023-07-17T14:46:59Z","published":"2023-07-17T14:46:59Z","title":"LuckyMera: a Modular AI Framework for Building Hybrid NetHack Agents","summary":" In the last few decades we have witnessed a significant development in\nArtificial Intelligence (AI) thanks to the availability of a variety of\ntestbeds, mostly based on simulated environments and video games. Among those,\nroguelike games offer a very good trade-off in terms of complexity of the\nenvironment and computational costs, which makes them perfectly suited to test\nAI agents generalization capabilities. In this work, we present LuckyMera, a\nflexible, modular, extensible and configurable AI framework built around\nNetHack, a popular terminal-based, single-player roguelike video game. This\nlibrary is aimed at simplifying and speeding up the development of AI agents\ncapable of successfully playing the game and offering a high-level interface\nfor designing game strategies. 
LuckyMera comes with a set of off-the-shelf\nsymbolic and neural modules (called \"skills\"): these modules can be either\nhard-coded behaviors, or neural Reinforcement Learning approaches, with the\npossibility of creating compositional hybrid solutions. Additionally, LuckyMera\ncomes with a set of utility features to save its experiences in the form of\ntrajectories for further analysis and to use them as datasets to train neural\nmodules, with a direct interface to the NetHack Learning Environment and\nMiniHack. Through an empirical evaluation we validate our skills implementation\nand propose a strong baseline agent that can reach state-of-the-art\nperformances in the complete NetHack game. LuckyMera is open-source and\navailable at https://github.com/Pervasive-AI-Lab/LuckyMera.\n","authors":["Luigi Quarantiello","Simone Marzeddu","Antonio Guzzi","Vincenzo Lomonaco"],"pdf_url":"https://arxiv.org/pdf/2307.08532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08528v1","updated":"2023-07-17T14:40:16Z","published":"2023-07-17T14:40:16Z","title":"Multi-Domain Learning with Modulation Adapters","summary":" Deep convolutional networks are ubiquitous in computer vision, due to their\nexcellent performance across different tasks for various domains. Models are,\nhowever, often trained in isolation for each task, failing to exploit\nrelatedness between tasks and domains to learn more compact models that\ngeneralise better in low-data regimes. Multi-domain learning aims to handle\nrelated tasks, such as image classification across multiple domains,\nsimultaneously. Previous work on this problem explored the use of a pre-trained\nand fixed domain-agnostic base network, in combination with smaller learnable\ndomain-specific adaptation modules. In this paper, we introduce Modulation\nAdapters, which update the convolutional filter weights of the model in a\nmultiplicative manner for each task. Parameterising these adaptation weights in\na factored manner allows us to scale the number of per-task parameters in a\nflexible manner, and to strike different parameter-accuracy trade-offs. We\nevaluate our approach on the Visual Decathlon challenge, composed of ten image\nclassification tasks across different domains, and on the ImageNet-to-Sketch\nbenchmark, which consists of six image classification tasks. Our approach\nyields excellent results, with accuracies that are comparable to or better than\nthose of existing state-of-the-art approaches.\n","authors":["Ekaterina Iakovleva","Karteek Alahari","Jakob Verbeek"],"pdf_url":"https://arxiv.org/pdf/2307.08528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08526v1","updated":"2023-07-17T14:38:11Z","published":"2023-07-17T14:38:11Z","title":"Image Captions are Natural Prompts for Text-to-Image Models","summary":" With the rapid development of Artificial Intelligence Generated Content\n(AIGC), it has become common practice in many learning tasks to train or\nfine-tune large models on synthetic data due to the data-scarcity and privacy\nleakage problems. Albeit promising with unlimited data generation, owing to\nmassive and diverse information conveyed in real images, it is challenging for\ntext-to-image generative models to synthesize informative training data with\nhand-crafted prompts, which usually leads to inferior generalization\nperformance when training downstream models. In this paper, we theoretically\nanalyze the relationship between the training effect of synthetic data and the\nsynthetic data distribution induced by prompts. 
Then we correspondingly propose\na simple yet effective method that prompts text-to-image generative models to\nsynthesize more informative and diverse training data. Specifically, we caption\neach real image with the advanced captioning model to obtain informative and\nfaithful prompts that extract class-relevant information and clarify the\npolysemy of class names. The image captions and class names are concatenated to\nprompt generative models for training image synthesis. Extensive experiments on\nImageNette, ImageNet-100, and ImageNet-1K verify that our method significantly\nimproves the performance of models trained on synthetic training data, i.e.,\n10% classification accuracy improvements on average.\n","authors":["Shiye Lei","Hao Chen","Sen Zhang","Bo Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.08526v1.pdf","comment":"20 pages, 1 figure, 10 tables"},{"id":"http://arxiv.org/abs/2307.08519v1","updated":"2023-07-17T14:27:32Z","published":"2023-07-17T14:27:32Z","title":"Results on Counterfactual Invariance","summary":" In this paper we provide a theoretical analysis of counterfactual invariance.\nWe present a variety of existing definitions, study how they relate to each\nother and what their graphical implications are. We then turn to the current\nmajor question surrounding counterfactual invariance, how does it relate to\nconditional independence? We show that whilst counterfactual invariance implies\nconditional independence, conditional independence does not give any\nimplications about the degree or likelihood of satisfying counterfactual\ninvariance. Furthermore, we show that for discrete causal models\ncounterfactually invariant functions are often constrained to be functions of\nparticular variables, or even constant.\n","authors":["Jake Fawkes","Robin J. Evans"],"pdf_url":"https://arxiv.org/pdf/2307.08519v1.pdf","comment":"5 pages with 6 pages of supplementary. Accepted at the ICML 2023\n workshop on Spurious Correlations, Invariance and Stability"},{"id":"http://arxiv.org/abs/2307.07119v2","updated":"2023-07-17T14:16:05Z","published":"2023-07-14T01:50:53Z","title":"DataAssist: A Machine Learning Approach to Data Cleaning and Preparation","summary":" Current automated machine learning (ML) tools are model-centric, focusing on\nmodel selection and parameter optimization. However, the majority of the time\nin data analysis is devoted to data cleaning and wrangling, for which limited\ntools are available. Here we present DataAssist, an automated data preparation\nand cleaning platform that enhances dataset quality using ML-informed methods.\nWe show that DataAssist provides a pipeline for exploratory data analysis and\ndata cleaning, including generating visualization for user-selected variables,\nunifying data annotation, suggesting anomaly removal, and preprocessing data.\nThe exported dataset can be readily integrated with other autoML tools or\nuser-specified model for downstream analysis. 
Our data-centric tool is\napplicable to a variety of fields, including economics, business, and\nforecasting applications, saving over 50% of the time spent on data\ncleansing and preparation.\n","authors":["Kartikay Goyle","Quin Xie","Vakul Goyle"],"pdf_url":"https://arxiv.org/pdf/2307.07119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08509v1","updated":"2023-07-17T14:10:01Z","published":"2023-07-17T14:10:01Z","title":"Kernel-Based Testing for Single-Cell Differential Analysis","summary":" Single-cell technologies have provided valuable insights into the\ndistribution of molecular features, such as gene expression and epigenomic\nmodifications. However, comparing these complex distributions in a controlled\nand powerful manner poses methodological challenges. Here we propose to benefit\nfrom the kernel-testing framework to compare the complex cell-wise\ndistributions of molecular features in a non-linear manner based on their\nkernel embedding. Our framework not only allows for feature-wise analyses but\nalso enables global comparisons of transcriptomes or epigenomes, considering\ntheir intricate dependencies. By using a classifier to discriminate cells based\non the variability of their embedding, our method uncovers heterogeneities in\ncell populations that would otherwise go undetected. We show that kernel\ntesting overcomes the limitations of differential analysis methods dedicated to\nsingle-cell. Kernel testing is applied to investigate the reversion process of\ndifferentiating cells, successfully identifying cells in transition between\nreversion and differentiation stages. Additionally, we analyze single-cell\nChIP-Seq data and identify a subpopulation of untreated breast cancer cells\nthat exhibit an epigenomic profile similar to persister cells.\n","authors":["Anthony Ozier-Lafontaine","Camille Fourneaux","Ghislain Durif","Céline Vallot","Olivier Gandrillon","Sandrine Giraud","Bertrand Michel","Franck Picard"],"pdf_url":"https://arxiv.org/pdf/2307.08509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08507v1","updated":"2023-07-17T14:09:43Z","published":"2023-07-17T14:09:43Z","title":"Efficient and Accurate Optimal Transport with Mirror Descent and\n Conjugate Gradients","summary":" We design a novel algorithm for optimal transport by drawing from the\nentropic optimal transport, mirror descent and conjugate gradients literatures.\nOur algorithm is able to compute optimal transport costs with arbitrary\naccuracy without running into numerical stability issues. The algorithm is\nimplemented efficiently on GPUs and is shown empirically to converge more\nquickly than traditional algorithms such as Sinkhorn's Algorithm both in terms\nof number of iterations and wall-clock time in many cases. We pay particular\nattention to the entropy of marginal distributions and show that high entropy\nmarginals make for harder optimal transport problems, for which our algorithm\nis a good fit. We provide a careful ablation analysis with respect to algorithm\nand problem parameters, and present benchmarking over the MNIST dataset. The\nresults suggest that our algorithm can be a useful addition to the\npractitioner's optimal transport toolkit. Our code is open-sourced at\nhttps://github.com/adaptive-agents-lab/MDOT-PNCG .\n","authors":["Mete Kemertas","Allan D. 
Jepson","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2307.08507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08506v1","updated":"2023-07-17T14:08:38Z","published":"2023-07-17T14:08:38Z","title":"Does Visual Pretraining Help End-to-End Reasoning?","summary":" We aim to investigate whether end-to-end learning of visual reasoning can be\nachieved with general-purpose neural networks, with the help of visual\npretraining. A positive result would refute the common belief that explicit\nvisual abstraction (e.g. object detection) is essential for compositional\ngeneralization on visual reasoning, and confirm the feasibility of a neural\nnetwork \"generalist\" to solve visual recognition and reasoning tasks. We\npropose a simple and general self-supervised framework which \"compresses\" each\nvideo frame into a small set of tokens with a transformer network, and\nreconstructs the remaining frames based on the compressed temporal context. To\nminimize the reconstruction loss, the network must learn a compact\nrepresentation for each image, as well as capture temporal dynamics and object\npermanence from temporal context. We perform evaluation on two visual reasoning\nbenchmarks, CATER and ACRE. We observe that pretraining is essential to achieve\ncompositional generalization for end-to-end visual reasoning. Our proposed\nframework outperforms traditional supervised pretraining, including image\nclassification and explicit object detection, by large margins.\n","authors":["Chen Sun","Calvin Luo","Xingyi Zhou","Anurag Arnab","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2307.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08496v1","updated":"2023-07-17T13:59:07Z","published":"2023-07-17T13:59:07Z","title":"Can We Trust Race Prediction?","summary":" In the absence of sensitive race and ethnicity data, researchers, regulators,\nand firms alike turn to proxies. In this paper, I train a Bidirectional Long\nShort-Term Memory (BiLSTM) model on a novel dataset of voter registration data\nfrom all 50 US states and create an ensemble that achieves up to 36.8% higher\nout of sample (OOS) F1 scores than the best performing machine learning models\nin the literature. Additionally, I construct the most comprehensive database of\nfirst and surname distributions in the US in order to improve the coverage and\naccuracy of Bayesian Improved Surname Geocoding (BISG) and Bayesian Improved\nFirstname Surname Geocoding (BIFSG). Finally, I provide the first high-quality\nbenchmark dataset in order to fairly compare existing models and aid future\nmodel developers.\n","authors":["Cangyuan Li"],"pdf_url":"https://arxiv.org/pdf/2307.08496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05812v2","updated":"2023-07-17T13:56:40Z","published":"2023-05-09T23:51:24Z","title":"Assessment of Reinforcement Learning Algorithms for Nuclear Power Plant\n Fuel Optimization","summary":" The nuclear fuel loading pattern optimization problem belongs to the class of\nlarge-scale combinatorial optimization. It is also characterized by multiple\nobjectives and constraints, which makes it impossible to solve explicitly.\nStochastic optimization methodologies including Genetic Algorithms and\nSimulated Annealing are used by different nuclear utilities and vendors, but\nhand-designed solutions continue to be the prevalent method in the industry. To\nimprove the state-of-the-art, Deep Reinforcement Learning (RL), in particular,\nProximal Policy Optimization is leveraged. 
This work presents a first-of-a-kind\napproach to utilize deep RL to solve the loading pattern problem and could be\nleveraged for any engineering design optimization. This paper is also to our\nknowledge the first to propose a study of the behavior of several\nhyper-parameters that influence the RL algorithm. The algorithm is highly\ndependent on multiple factors such as the shape of the objective function\nderived for the core design that behaves as a fudge factor that affects the\nstability of the learning. But also, an exploration/exploitation trade-off that\nmanifests through different parameters such as the number of loading patterns\nseen by the agents per episode, the number of samples collected before a policy\nupdate nsteps, and an entropy factor ent_coef that increases the randomness of\nthe policy during training. We found that RL must be applied similarly to a\nGaussian Process in which the acquisition function is replaced by a\nparametrized policy. Then, once an initial set of hyper-parameters is found,\nreducing nsteps and ent_coef until no more learning is observed will result in\nthe highest sample efficiency robustly and stably. This resulted in an economic\nbenefit of 535,000- 642,000 $/year/plant.\n","authors":["Paul Seurin","Koroush Shirvan"],"pdf_url":"https://arxiv.org/pdf/2305.05812v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08486v1","updated":"2023-07-17T13:48:27Z","published":"2023-07-17T13:48:27Z","title":"Fairness in KI-Systemen","summary":" The more AI-assisted decisions affect people's lives, the more important the\nfairness of such decisions becomes. In this chapter, we provide an introduction\nto research on fairness in machine learning. We explain the main fairness\ndefinitions and strategies for achieving fairness using concrete examples and\nplace fairness research in the European context. Our contribution is aimed at\nan interdisciplinary audience and therefore avoids mathematical formulation but\nemphasizes visualizations and examples.\n --\n Je mehr KI-gest\\\"utzte Entscheidungen das Leben von Menschen betreffen, desto\nwichtiger ist die Fairness solcher Entscheidungen. In diesem Kapitel geben wir\neine Einf\\\"uhrung in die Forschung zu Fairness im maschinellen Lernen. Wir\nerkl\\\"aren die wesentlichen Fairness-Definitionen und Strategien zur Erreichung\nvon Fairness anhand konkreter Beispiele und ordnen die Fairness-Forschung in\nden europ\\\"aischen Kontext ein. Unser Beitrag richtet sich dabei an ein\ninterdisziplin\\\"ares Publikum und verzichtet daher auf die mathematische\nFormulierung sondern betont Visualisierungen und Beispiele.\n","authors":["Janine Strotherm","Alissa Müller","Barbara Hammer","Benjamin Paaßen"],"pdf_url":"https://arxiv.org/pdf/2307.08486v1.pdf","comment":"in German language"},{"id":"http://arxiv.org/abs/2307.08485v1","updated":"2023-07-17T13:47:41Z","published":"2023-07-17T13:47:41Z","title":"Cross Feature Selection to Eliminate Spurious Interactions and Single\n Feature Dominance Explainable Boosting Machines","summary":" Interpretability is a crucial aspect of machine learning models that enables\nhumans to understand and trust the decision-making process of these models. In\nmany real-world applications, the interpretability of models is essential for\nlegal, ethical, and practical reasons. For instance, in the banking domain,\ninterpretability is critical for lenders and borrowers to understand the\nreasoning behind the acceptance or rejection of loan applications as per fair\nlending laws. 
However, achieving interpretability in machine learning models is\nchallenging, especially for complex high-performance models. Hence Explainable\nBoosting Machines (EBMs) have been gaining popularity due to their\ninterpretable and high-performance nature in various prediction tasks. However,\nthese models can suffer from issues such as spurious interactions with\nredundant features and single-feature dominance across all interactions, which\ncan affect the interpretability and reliability of the model's predictions. In\nthis paper, we explore novel approaches to address these issues by utilizing\nalternate Cross-feature selection, ensemble features and model configuration\nalteration techniques. Our approach involves a multi-step feature selection\nprocedure that selects a set of candidate features, ensemble features and then\nbenchmark the same using the EBM model. We evaluate our method on three\nbenchmark datasets and show that the alternate techniques outperform vanilla\nEBM methods, while providing better interpretability and feature selection\nstability, and improving the model's predictive performance. Moreover, we show\nthat our approach can identify meaningful interactions and reduce the dominance\nof single features in the model's predictions, leading to more reliable and\ninterpretable models.\n Index Terms- Interpretability, EBM's, ensemble, feature selection.\n","authors":["Shree Charran R","Sandipan Das Mahapatra"],"pdf_url":"https://arxiv.org/pdf/2307.08485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14442v2","updated":"2023-07-17T13:38:18Z","published":"2023-05-23T18:07:44Z","title":"Optimal Preconditioning and Fisher Adaptive Langevin Sampling","summary":" We define an optimal preconditioning for the Langevin diffusion by\nanalytically optimizing the expected squared jumped distance. This yields as\nthe optimal preconditioning an inverse Fisher information covariance matrix,\nwhere the covariance matrix is computed as the outer product of log target\ngradients averaged under the target. We apply this result to the Metropolis\nadjusted Langevin algorithm (MALA) and derive a computationally efficient\nadaptive MCMC scheme that learns the preconditioning from the history of\ngradients produced as the algorithm runs. We show in several experiments that\nthe proposed algorithm is very robust in high dimensions and significantly\noutperforms other methods, including a closely related adaptive MALA scheme\nthat learns the preconditioning with standard adaptive MCMC as well as the\nposition-dependent Riemannian manifold MALA sampler.\n","authors":["Michalis K. Titsias"],"pdf_url":"https://arxiv.org/pdf/2305.14442v2.pdf","comment":"21 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.08474v1","updated":"2023-07-17T13:32:02Z","published":"2023-07-17T13:32:02Z","title":"A Fast Task Offloading Optimization Framework for IRS-Assisted\n Multi-Access Edge Computing System","summary":" Terahertz communication networks and intelligent reflecting surfaces exhibit\nsignificant potential in advancing wireless networks, particularly within the\ndomain of aerial-based multi-access edge computing systems. These technologies\nenable efficient offloading of computational tasks from user electronic devices\nto Unmanned Aerial Vehicles or local execution. 
For the generation of\nhigh-quality task-offloading allocations, conventional numerical optimization\nmethods often struggle to solve challenging combinatorial optimization problems\nwithin the limited channel coherence time, thereby failing to respond quickly\nto dynamic changes in system conditions. To address this challenge, we propose\na deep learning-based optimization framework called Iterative Order-Preserving\npolicy Optimization (IOPO), which enables the generation of energy-efficient\ntask-offloading decisions within milliseconds. Unlike exhaustive search\nmethods, IOPO provides continuous updates to the offloading decisions without\nresorting to exhaustive search, resulting in accelerated convergence and\nreduced computational complexity, particularly when dealing with complex\nproblems characterized by extensive solution spaces. Experimental results\ndemonstrate that the proposed framework can generate energy-efficient\ntask-offloading decisions within a very short time period, outperforming other\nbenchmark methods.\n","authors":["Jianqiu Wu","Zhongyi Yu","Jianxiong Guo","Zhiqing Tang","Tian Wang","Weijia Jia"],"pdf_url":"https://arxiv.org/pdf/2307.08474v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2307.08466v1","updated":"2023-07-17T13:21:02Z","published":"2023-07-17T13:21:02Z","title":"Classification of UHF Partial Discharge Signals in Gas-Insulated HVDC\n Systems Using Neural Networks","summary":" Undetected partial discharges (PDs) are a safety critical issue in high\nvoltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC\nvoltage is well-established, the analysis of PDs under DC voltage remains an\nactive research field. A key focus of these investigations is the\nclassification of different PD sources to enable subsequent sophisticated\nanalysis.\n In this paper, we propose and analyze a neural network-based approach for\nclassifying PD signals caused by metallic protrusions and conductive particles\non the insulator of HVDC GIS, without relying on pulse sequence analysis\nfeatures. In contrast to previous approaches, our proposed model can\ndiscriminate the studied PD signals obtained at negative and positive\npotentials, while also generalizing to unseen operating voltage multiples.\nAdditionally, we compare the performance of time- and frequency-domain input\nsignals and explore the impact of different normalization schemes to mitigate\nthe influence of free-space path loss between the sensor and defect location.\n","authors":["Steffen Seitz","Thomas Götz","Christopher Lindenberg","Ronald Tetzlaff","Stephan Schlegel"],"pdf_url":"https://arxiv.org/pdf/2307.08466v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.08452v1","updated":"2023-07-17T12:47:33Z","published":"2023-07-17T12:47:33Z","title":"SBMLtoODEjax: efficient simulation and optimization of ODE SBML models\n in JAX","summary":" Developing methods to explore, predict and control the dynamic behavior of\nbiological systems, from protein pathways to complex cellular processes, is an\nessential frontier of research for bioengineering and biomedicine. Thus,\nsignificant effort has gone in computational inference and mathematical\nmodeling of biological systems. This effort has resulted in the development of\nlarge collections of publicly-available models, typically stored and exchanged\non online platforms (such as the BioModels Database) using the Systems Biology\nMarkup Language (SBML), a standard format for representing mathematical models\nof biological systems. 
SBMLtoODEjax is a lightweight library that allows users to\nautomatically parse and convert SBML models into Python models written\nend-to-end in JAX, a high-performance numerical computing library with\nautomatic differentiation capabilities. SBMLtoODEjax is targeted at researchers\nwho aim to incorporate SBML-specified ordinary differential equation (ODE)\nmodels into their Python projects and machine learning pipelines, in order to\nperform efficient numerical simulation and optimization with only a few lines\nof code. SBMLtoODEjax is available at\nhttps://github.com/flowersteam/sbmltoodejax.\n","authors":["Mayalen Etcheverry","Michael Levin","Clément Moulin-Frier","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2307.08452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08691v2","updated":"2023-07-17T12:44:50Z","published":"2023-03-15T15:21:41Z","title":"Learning to Reconstruct Signals From Binary Measurements","summary":" Recent advances in unsupervised learning have highlighted the possibility of\nlearning to reconstruct signals from noisy and incomplete linear measurements\nalone. These methods play a key role in medical and scientific imaging and\nsensing, where ground truth data is often scarce or difficult to obtain.\nHowever, in practice, measurements are not only noisy and incomplete but also\nquantized. Here we explore the extreme case of learning from binary\nobservations and provide necessary and sufficient conditions on the number of\nmeasurements required for identifying a set of signals from incomplete binary\ndata. Our results are complementary to existing bounds on signal recovery from\nbinary measurements. Furthermore, we introduce a novel self-supervised learning\napproach, which we name SSBM, that only requires binary data for training. We\ndemonstrate in a series of experiments with real datasets that SSBM performs on\npar with supervised learning and outperforms sparse reconstruction methods with\na fixed wavelet basis by a large margin.\n","authors":["Julián Tachella","Laurent Jacques"],"pdf_url":"https://arxiv.org/pdf/2303.08691v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02641v3","updated":"2023-07-17T12:41:56Z","published":"2022-10-25T04:48:11Z","title":"Graph Neural Networks on SPD Manifolds for Motor Imagery Classification:\n A Perspective from the Time-Frequency Analysis","summary":" Motor imagery (MI) classification has been a prominent research topic in\nbrain-computer interfaces based on electroencephalography (EEG). Over the past\nfew decades, the performance of MI-EEG classifiers has gradually improved. In\nthis study, we enhance the geometric deep learning classifier for MI-EEG\nclassification from the perspective of time-frequency analysis, introducing a\nnew architecture called Graph-CSPNet. We refer to this category of classifiers\nas geometric methods, emphasizing their rich background in differential\ngeometry induced by signal covariance matrices. Graph-CSPNet utilizes novel\nSPD matrix-valued graph convolutional techniques to capture the EEG features in\nthe time-frequency domain, providing greater flexibility in signal segmentation\nand capturing localized fluctuations. To evaluate the effectiveness of\nGraph-CSPNet, we employ five commonly-used publicly available MI-EEG datasets,\nachieving near-optimal classification accuracies in nine out of eleven\nscenarios. 
The Python repository can be found at\nhttps://github.com/GeometricBCI/Tensor-CSPNet-and-Graph-CSPNet\n","authors":["Ce Ju","Cuntai Guan"],"pdf_url":"https://arxiv.org/pdf/2211.02641v3.pdf","comment":"15 pages, 5 figures, 6 Tables; This work has been submitted to the\n IEEE for possible publication. Copyright may be transferred without notice,\n after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2210.13815v2","updated":"2023-07-17T12:33:11Z","published":"2022-10-25T07:41:57Z","title":"FocusedCleaner: Sanitizing Poisoned Graphs for Robust GNN-based Node\n Classification","summary":" Graph Neural Networks (GNNs) are vulnerable to data poisoning attacks, which\nwill generate a poisoned graph as the input to the GNN models. We present\nFocusedCleaner as a poisoned graph sanitizer to effectively identify the poison\ninjected by attackers. Specifically, FocusedCleaner provides a sanitation\nframework consisting of two modules: bi-level structural learning and victim\nnode detection. In particular, the structural learning module will reverse the\nattack process to steadily sanitize the graph while the detection module\nprovides ``the focus\" -- a narrowed and more accurate search region -- to\nstructural learning. These two modules will operate in iterations and reinforce\neach other to sanitize a poisoned graph step by step. As an important\napplication, we show that the adversarial robustness of GNNs trained over the\nsanitized graph for the node classification task is significantly improved.\nExtensive experiments demonstrate that FocusedCleaner outperforms the\nstate-of-the-art baselines both on poisoned graph sanitation and improving\nrobustness.\n","authors":["Yulin Zhu","Liang Tong","Gaolei Li","Xiapu Luo","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2210.13815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08433v1","updated":"2023-07-17T12:25:52Z","published":"2023-07-17T12:25:52Z","title":"From random-walks to graph-sprints: a low-latency node embedding\n framework on continuous-time dynamic graphs","summary":" Many real-world datasets have an underlying dynamic graph structure, where\nentities and their interactions evolve over time. Machine learning models\nshould consider these dynamics in order to harness their full potential in\ndownstream tasks. Previous approaches for graph representation learning have\nfocused on either sampling k-hop neighborhoods, akin to breadth-first search,\nor random walks, akin to depth-first search. However, these methods are\ncomputationally expensive and unsuitable for real-time, low-latency inference\non dynamic graphs. To overcome these limitations, we propose graph-sprints a\ngeneral purpose feature extraction framework for continuous-time-dynamic-graphs\n(CTDGs) that has low latency and is competitive with state-of-the-art, higher\nlatency models. To achieve this, a streaming, low latency approximation to the\nrandom-walk based features is proposed. In our framework, time-aware node\nembeddings summarizing multi-hop information are computed using only single-hop\noperations on the incoming edges. We evaluate our proposed approach on three\nopen-source datasets and two in-house datasets, and compare with three\nstate-of-the-art algorithms (TGN-attn, TGN-ID, Jodie). We demonstrate that our\ngraph-sprints features, combined with a machine learning classifier, achieve\ncompetitive performance (outperforming all baselines for the node\nclassification tasks in five datasets). 
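To make the streaming, single-hop update described above concrete, here is a minimal conceptual sketch in Python; the decay-blend recurrence, the alpha parameter, and the toy edge stream are illustrative assumptions, not the authors' exact graph-sprints recurrence.

import numpy as np

def process_edge(state, u, v, feat_uv, alpha=0.5):
    # Streaming single-hop update: node v's summary absorbs a decayed blend of
    # its own summary, the source node's summary, and the edge features.
    dim = len(feat_uv)
    s_u = state.get(u, np.zeros(dim))
    s_v = state.get(v, np.zeros(dim))
    # Multi-hop information propagates because s_u already summarizes u's past neighbours.
    state[v] = (1 - alpha) * s_v + alpha * (s_u + feat_uv)
    return state

state = {}
edge_stream = [('a', 'b', np.array([1.0, 0.0])), ('b', 'c', np.array([0.0, 1.0]))]
for u, v, feat in edge_stream:
    state = process_edge(state, u, v, feat)
# state['c'] now reflects two-hop information from 'a' via 'b', computed with single-hop operations only.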
Simultaneously, graph-sprints\nsignificantly reduce inference latencies, achieving close to an order of\nmagnitude speed-up in our experimental setting.\n","authors":["Ahmad Naser Eddin","Jacopo Bono","David Aparício","Hugo Ferreira","João Ascensão","Pedro Ribeiro","Pedro Bizarro"],"pdf_url":"https://arxiv.org/pdf/2307.08433v1.pdf","comment":"9 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2212.01793v3","updated":"2023-07-17T12:16:57Z","published":"2022-12-04T10:45:42Z","title":"kHGCN: Tree-likeness Modeling via Continuous and Discrete Curvature\n Learning","summary":" The prevalence of tree-like structures, encompassing hierarchical structures\nand power law distributions, exists extensively in real-world applications,\nincluding recommendation systems, ecosystems, financial networks, social\nnetworks, etc. Recently, the exploitation of hyperbolic space for tree-likeness\nmodeling has garnered considerable attention owing to its exponentially growing\nvolume. Compared to the flat Euclidean space, the curved hyperbolic space\nprovides a more amenable embedding space, especially for datasets\nexhibiting implicit tree-like architectures. However, the intricate nature of\nreal-world tree-like data presents a considerable challenge, as it frequently\ndisplays a heterogeneous composition of tree-like, flat, and circular regions.\nThe direct embedding of such heterogeneous structures into a homogeneous\nembedding space (i.e., hyperbolic space) inevitably leads to heavy distortions.\nTo mitigate the aforementioned shortcoming, this study endeavors to explore the\ncurvature between discrete structure and continuous learning space, aiming at\nencoding the message conveyed by the network topology in the learning process,\nthereby improving tree-likeness modeling. To this end, a curvature-aware\nhyperbolic graph convolutional neural network, $\kappa$HGCN, is proposed, which\nutilizes the curvature to guide message passing and improve long-range\npropagation. Extensive experiments on node classification and link prediction\ntasks verify the superiority of the proposal as it consistently outperforms\nvarious competitive models by a large margin.\n","authors":["Menglin Yang","Min Zhou","Lujia Pan","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2212.01793v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2307.08423v1","updated":"2023-07-17T12:14:14Z","published":"2023-07-17T12:14:14Z","title":"Artificial Intelligence for Science in Quantum, Atomistic, and Continuum\n Systems","summary":" Advances in artificial intelligence (AI) are fueling a new paradigm of\ndiscoveries in natural sciences. Today, AI has started to advance natural\nsciences by improving, accelerating, and enabling our understanding of natural\nphenomena at a wide range of spatial and temporal scales, giving rise to a new\narea of research known as AI for science (AI4Science). Being an emerging\nresearch paradigm, AI4Science is unique in that it is an enormous and highly\ninterdisciplinary area. Thus, a unified and technical treatment of this field\nis needed yet challenging. This paper aims to provide a technically thorough\naccount of a subarea of AI4Science; namely, AI for quantum, atomistic, and\ncontinuum systems. These areas aim at understanding the physical world from the\nsubatomic (wavefunctions and electron density), atomic (molecules, proteins,\nmaterials, and interactions), to macro (fluids, climate, and subsurface) scales\nand form an important subarea of AI4Science. 
A unique advantage of focusing on\nthese areas is that they largely share a common set of challenges, thereby\nallowing a unified and foundational treatment. A key common challenge is how to\ncapture physics first principles, especially symmetries, in natural systems by\ndeep learning methods. We provide an in-depth yet intuitive account of\ntechniques to achieve equivariance to symmetry transformations. We also discuss\nother common technical challenges, including explainability,\nout-of-distribution generalization, knowledge transfer with foundation and\nlarge language models, and uncertainty quantification. To facilitate learning\nand education, we provide categorized lists of resources that we found to be\nuseful. We strive to be thorough and unified and hope this initial effort may\ntrigger more community interests and efforts to further advance AI4Science.\n","authors":["Xuan Zhang","Limei Wang","Jacob Helwig","Youzhi Luo","Cong Fu","Yaochen Xie","Meng Liu","Yuchao Lin","Zhao Xu","Keqiang Yan","Keir Adams","Maurice Weiler","Xiner Li","Tianfan Fu","Yucheng Wang","Haiyang Yu","YuQing Xie","Xiang Fu","Alex Strasser","Shenglong Xu","Yi Liu","Yuanqi Du","Alexandra Saxton","Hongyi Ling","Hannah Lawrence","Hannes Stärk","Shurui Gui","Carl Edwards","Nicholas Gao","Adriana Ladera","Tailin Wu","Elyssa F. Hofgard","Aria Mansouri Tehrani","Rui Wang","Ameya Daigavane","Montgomery Bohde","Jerry Kurtin","Qian Huang","Tuong Phung","Minkai Xu","Chaitanya K. Joshi","Simon V. Mathis","Kamyar Azizzadenesheli","Ada Fang","Alán Aspuru-Guzik","Erik Bekkers","Michael Bronstein","Marinka Zitnik","Anima Anandkumar","Stefano Ermon","Pietro Liò","Rose Yu","Stephan Günnemann","Jure Leskovec","Heng Ji","Jimeng Sun","Regina Barzilay","Tommi Jaakkola","Connor W. Coley","Xiaoning Qian","Xiaofeng Qian","Tess Smidt","Shuiwang Ji"],"pdf_url":"https://arxiv.org/pdf/2307.08423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10893v3","updated":"2023-07-17T12:13:46Z","published":"2023-02-07T18:25:28Z","title":"Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness","summary":" Generative AI models have recently achieved astonishing results in quality\nand are consequently employed in a fast-growing number of applications.\nHowever, since they are highly data-driven, relying on billion-sized datasets\nrandomly scraped from the internet, they also suffer from degenerated and\nbiased human behavior, as we demonstrate. In fact, they may even reinforce such\nbiases. To not only uncover but also combat these undesired effects, we present\na novel strategy, called Fair Diffusion, to attenuate biases after the\ndeployment of generative text-to-image models. Specifically, we demonstrate\nshifting a bias, based on human instructions, in any direction yielding\narbitrarily new proportions for, e.g., identity groups. 
As our empirical\nevaluation demonstrates, this introduced control enables instructing generative\nimage models on fairness, with no data filtering and additional training\nrequired.\n","authors":["Felix Friedrich","Manuel Brack","Lukas Struppek","Dominik Hintersdorf","Patrick Schramowski","Sasha Luccioni","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2302.10893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08411v1","updated":"2023-07-17T11:47:05Z","published":"2023-07-17T11:47:05Z","title":"Neurosymbolic AI for Reasoning on Biomedical Knowledge Graphs","summary":" Biomedical datasets are often modeled as knowledge graphs (KGs) because they\ncapture the multi-relational, heterogeneous, and dynamic natures of biomedical\nsystems. KG completion (KGC), can, therefore, help researchers make predictions\nto inform tasks like drug repositioning. While previous approaches for KGC were\neither rule-based or embedding-based, hybrid approaches based on neurosymbolic\nartificial intelligence are becoming more popular. Many of these methods\npossess unique characteristics which make them even better suited toward\nbiomedical challenges. Here, we survey such approaches with an emphasis on\ntheir utilities and prospective benefits for biomedicine.\n","authors":["Lauren Nicole DeLong","Ramon Fernández Mir","Zonglin Ji","Fiona Niamh Coulter Smith","Jacques D. Fleuriot"],"pdf_url":"https://arxiv.org/pdf/2307.08411v1.pdf","comment":"Proceedings of the $\\mathit{40}^{th}$ International Conference on\n Machine Learning: Workshop on Knowledge and Logical Reasoning in the Era of\n Data-driven Learning (https://klr-icml2023.github.io/schedule.html). PMLR\n 202, 2023. Condensed, workshop-ready version of previous survey,\n arXiv:2302.07200 , which is under review. 13 pages (9 content, 4 references),\n 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.08403v1","updated":"2023-07-17T11:38:35Z","published":"2023-07-17T11:38:35Z","title":"Vocoder drift compensation by x-vector alignment in speaker\n anonymisation","summary":" For the most popular x-vector-based approaches to speaker anonymisation, the\nbulk of the anonymisation can stem from vocoding rather than from the core\nanonymisation function which is used to substitute an original speaker x-vector\nwith that of a fictitious pseudo-speaker. This phenomenon can impede the design\nof better anonymisation systems since there is a lack of fine-grained control\nover the x-vector space. The work reported in this paper explores the origin of\nso-called vocoder drift and shows that it is due to the mismatch between the\nsubstituted x-vector and the original representations of the linguistic\ncontent, intonation and prosody. Also reported is an original approach to\nvocoder drift compensation. While anonymisation performance degrades as\nexpected, compensation reduces vocoder drift substantially, offers improved\ncontrol over the x-vector space and lays a foundation for the design of better\nanonymisation functions in the future.\n","authors":["Michele Panariello","Massimiliano Todisco","Nicholas Evans"],"pdf_url":"https://arxiv.org/pdf/2307.08403v1.pdf","comment":"Accepted at the ISCA SPSC Symposium 2023"},{"id":"http://arxiv.org/abs/2302.13262v2","updated":"2023-07-17T11:27:45Z","published":"2023-02-26T08:26:01Z","title":"Modulated Neural ODEs","summary":" Neural ordinary differential equations (NODEs) have been proven useful for\nlearning non-linear dynamics of arbitrary trajectories. 
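As background for the limitation discussed next, a minimal sketch of a plain NODE forward pass in JAX may help; the MLP vector field, its random weights, and the time grid are illustrative assumptions rather than the MoNODE architecture itself.

import jax.numpy as jnp
from jax import random
from jax.experimental.ode import odeint

def vector_field(y, t, params):
    # A small MLP parameterizing dy/dt; the weights below are illustrative only.
    w1, b1, w2, b2 = params
    h = jnp.tanh(y @ w1 + b1)
    return h @ w2 + b2

key1, key2 = random.split(random.PRNGKey(0))
dim, hidden = 2, 16
params = (0.1 * random.normal(key1, (dim, hidden)), jnp.zeros(hidden),
          0.1 * random.normal(key2, (hidden, dim)), jnp.zeros(dim))

y0 = jnp.array([1.0, 0.0])                          # initial state of one trajectory
ts = jnp.linspace(0.0, 5.0, 50)                     # time points at which to evaluate
trajectory = odeint(vector_field, y0, ts, params)   # integrated states, shape (50, 2)

In this plain formulation the only trajectory-specific quantity is the initial state y0, which is precisely the restriction discussed next.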
However, current NODE\nmethods capture variations across trajectories only via the initial state value\nor by auto-regressive encoder updates. In this work, we introduce Modulated\nNeural ODEs (MoNODEs), a novel framework that separates dynamic states from\nunderlying static factors of variation and improves existing NODE methods.\nIn particular, we introduce $\textit{time-invariant modulator variables}$ that\nare learned from the data. We incorporate our proposed framework into four\nexisting NODE variants. We test MoNODE on oscillating systems, videos, and human\nwalking trajectories, where each trajectory has trajectory-specific modulation.\nOur framework consistently improves the existing models' ability to generalize to\nnew dynamic parameterizations and to perform far-horizon forecasting. In\naddition, we verify that the proposed modulator variables are informative of\nthe true unknown factors of variation as measured by $R^2$ scores.\n","authors":["Ilze Amanda Auzina","Çağatay Yıldız","Sara Magliacane","Matthias Bethge","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2302.13262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08393v1","updated":"2023-07-17T11:12:56Z","published":"2023-07-17T11:12:56Z","title":"On the application of Large Language Models for language teaching and\n assessment technology","summary":" The recent release of very large language models such as PaLM and GPT-4 has\nmade an unprecedented impact in the popular media and public consciousness,\ngiving rise to a mixture of excitement and fear as to their capabilities and\npotential uses, and shining a light on natural language processing research\nwhich had not previously received so much attention. The developments offer\ngreat promise for education technology, and in this paper we look specifically\nat the potential for incorporating large language models in AI-driven language\nteaching and assessment systems. We consider several research areas and also\ndiscuss the risks and ethical considerations surrounding generative AI in\neducation technology for language learners. Overall we find that larger\nlanguage models offer improvements over previous models in text generation,\nopening up routes toward content generation which had not previously been\nplausible. For text generation they must be prompted carefully and their\noutputs may need to be reshaped before they are ready for use. For automated\ngrading and grammatical error correction, tasks whose progress is checked on\nwell-known benchmarks, early investigations indicate that large language models\non their own do not improve on state-of-the-art results according to standard\nevaluation metrics. For grading it appears that linguistic features established\nin the literature should still be used for best performance, and for error\ncorrection it may be that the models can offer alternative feedback styles\nwhich are not measured sensitively with existing methods. 
In all cases, there\nis work to be done to experiment with the inclusion of large language models in\neducation technology for language learners, in order to properly understand and\nreport on their capacities and limitations, and to ensure that foreseeable\nrisks such as misinformation and harmful bias are mitigated.\n","authors":["Andrew Caines","Luca Benedetto","Shiva Taslimipoor","Christopher Davis","Yuan Gao","Oeistein Andersen","Zheng Yuan","Mark Elliott","Russell Moore","Christopher Bryant","Marek Rei","Helen Yannakoudakis","Andrew Mullooly","Diane Nicholls","Paula Buttery"],"pdf_url":"https://arxiv.org/pdf/2307.08393v1.pdf","comment":"Accepted at the AIED2023 workshop: Empowering Education with LLMs -\n the Next-Gen Interface and Content Generation"},{"id":"http://arxiv.org/abs/2307.08390v1","updated":"2023-07-17T11:04:27Z","published":"2023-07-17T11:04:27Z","title":"Correlation-aware Spatial-Temporal Graph Learning for Multivariate\n Time-series Anomaly Detection","summary":" Multivariate time-series anomaly detection is critically important in many\napplications, including retail, transportation, power grid, and water treatment\nplants. Existing approaches for this problem mostly employ either statistical\nmodels which cannot capture the non-linear relations well or conventional deep\nlearning models (e.g., CNN and LSTM) that do not explicitly learn the pairwise\ncorrelations among variables. To overcome these limitations, we propose a novel\nmethod, correlation-aware spatial-temporal graph learning (termed CST-GL), for\ntime series anomaly detection. CST-GL explicitly captures the pairwise\ncorrelations via a multivariate time series correlation learning module based\non which a spatial-temporal graph neural network (STGNN) can be developed.\nThen, by employing a graph convolution network that exploits one- and multi-hop\nneighbor information, our STGNN component can encode rich spatial information\nfrom complex pairwise dependencies between variables. With a temporal module\nthat consists of dilated convolutional functions, the STGNN can further capture\nlong-range dependence over time. A novel anomaly scoring component is further\nintegrated into CST-GL to estimate the degree of an anomaly in a purely\nunsupervised manner. Experimental results demonstrate that CST-GL can detect\nanomalies effectively in general settings as well as enable early detection\nacross different time delays.\n","authors":["Yu Zheng","Huan Yee Koh","Ming Jin","Lianhua Chi","Khoa T. Phan","Shirui Pan","Yi-Ping Phoebe Chen","Wei Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.08390v1.pdf","comment":"17 pages, double columns, 10 tables, 3 figures"},{"id":"http://arxiv.org/abs/2209.05379v4","updated":"2023-07-17T10:53:03Z","published":"2022-09-12T16:31:34Z","title":"Action-based Early Autism Diagnosis Using Contrastive Feature Learning","summary":" Autism, also known as Autism Spectrum Disorder (or ASD), is a neurological\ndisorder. Its main symptoms include difficulty in (verbal and/or non-verbal)\ncommunication, and rigid/repetitive behavior. These symptoms are often\nindistinguishable from a normal (control) individual, due to which this\ndisorder remains undiagnosed in early childhood leading to delayed treatment.\nSince the learning curve is steep during the initial age, an early diagnosis of\nautism could allow to take adequate interventions at the right time, which\nmight positively affect the growth of an autistic child. 
Further, the\ntraditional methods of autism diagnosis require multiple visits to a\nspecialized psychiatrist, however this process can be time-consuming. In this\npaper, we present a learning based approach to automate autism diagnosis using\nsimple and small action video clips of subjects. This task is particularly\nchallenging because the amount of annotated data available is small, and the\nvariations among samples from the two categories (ASD and control) are\ngenerally indistinguishable. This is also evident from poor performance of a\nbinary classifier learned using the cross-entropy loss on top of a baseline\nencoder. To address this, we adopt contrastive feature learning in both self\nsupervised and supervised learning frameworks, and show that these can lead to\na significant increase in the prediction accuracy of a binary classifier on\nthis task. We further validate this by conducting thorough experimental\nanalyses under different set-ups on two publicly available datasets.\n","authors":["Asha Rani","Pankaj Yadav","Yashaswi Verma"],"pdf_url":"https://arxiv.org/pdf/2209.05379v4.pdf","comment":"This preprint has not undergone peer review (when applicable) or any\n postsubmission improvements or corrections. The Version of Record of this\n article is published in Multimedia Systems (2023), and is available online at\n https://doi.org/10.1007/s00530-023-01132-8"},{"id":"http://arxiv.org/abs/2307.08386v1","updated":"2023-07-17T10:50:09Z","published":"2023-07-17T10:50:09Z","title":"Tabular Machine Learning Methods for Predicting Gas Turbine Emissions","summary":" Predicting emissions for gas turbines is critical for monitoring harmful\npollutants being released into the atmosphere. In this study, we evaluate the\nperformance of machine learning models for predicting emissions for gas\nturbines. We compare an existing predictive emissions model, a first\nprinciples-based Chemical Kinetics model, against two machine learning models\nwe developed based on SAINT and XGBoost, to demonstrate improved predictive\nperformance of nitrogen oxides (NOx) and carbon monoxide (CO) using machine\nlearning techniques. Our analysis utilises a Siemens Energy gas turbine test\nbed tabular dataset to train and validate the machine learning models.\nAdditionally, we explore the trade-off between incorporating more features to\nenhance the model complexity, and the resulting presence of increased missing\nvalues in the dataset.\n","authors":["Rebecca Potts","Rick Hackney","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2307.08386v1.pdf","comment":"23 pages, 9 figures, 1 appendix"},{"id":"http://arxiv.org/abs/2210.04527v2","updated":"2023-07-17T10:49:02Z","published":"2022-10-10T09:52:02Z","title":"A policy gradient approach for Finite Horizon Constrained Markov\n Decision Processes","summary":" The infinite horizon setting is widely adopted for problems of reinforcement\nlearning (RL). These invariably result in stationary policies that are optimal.\nIn many situations, finite horizon control problems are of interest and for\nsuch problems, the optimal policies are time-varying in general. Another\nsetting that has become popular in recent times is of Constrained Reinforcement\nLearning, where the agent maximizes its rewards while it also aims to satisfy\nsome given constraint criteria. However, this setting has only been studied in\nthe context of infinite horizon MDPs where stationary policies are optimal. 
We\npresent an algorithm for constrained RL in the Finite Horizon Setting where the\nhorizon terminates after a fixed (finite) time. We use function approximation\nin our algorithm which is essential when the state and action spaces are large\nor continuous and use the policy gradient method to find the optimal policy.\nThe optimal policy that we obtain depends on the stage and so is non-stationary\nin general. To the best of our knowledge, our paper presents the first policy\ngradient algorithm for the finite horizon setting with constraints. We show the\nconvergence of our algorithm to a constrained optimal policy. We also compare\nand analyze the performance of our algorithm through experiments and show that\nour algorithm performs better than some other well known algorithms.\n","authors":["Soumyajit Guin","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2210.04527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07322v2","updated":"2023-07-17T10:42:45Z","published":"2023-07-14T12:57:51Z","title":"A Context-Aware Cutting Plane Selection Algorithm for Mixed-Integer\n Programming","summary":" The current cut selection algorithm used in mixed-integer programming solvers\nhas remained largely unchanged since its creation. In this paper, we propose a\nset of new cut scoring measures, cut filtering techniques, and stopping\ncriteria, extending the current state-of-the-art algorithm and obtaining a 5\\%\nperformance improvement for SCIP over the MIPLIB 2017 benchmark set.\n","authors":["Mark Turner","Timo Berthold","Mathieu Besançon"],"pdf_url":"https://arxiv.org/pdf/2307.07322v2.pdf","comment":"Added random seeds 4-5 to Table and Figure results"},{"id":"http://arxiv.org/abs/2307.08382v1","updated":"2023-07-17T10:42:21Z","published":"2023-07-17T10:42:21Z","title":"Predicting Battery Lifetime Under Varying Usage Conditions from Early\n Aging Data","summary":" Accurate battery lifetime prediction is important for preventative\nmaintenance, warranties, and improved cell design and manufacturing. However,\nmanufacturing variability and usage-dependent degradation make life prediction\nchallenging. Here, we investigate new features derived from capacity-voltage\ndata in early life to predict the lifetime of cells cycled under widely varying\ncharge rates, discharge rates, and depths of discharge. Features were extracted\nfrom regularly scheduled reference performance tests (i.e., low rate full\ncycles) during cycling. The early-life features capture a cell's state of\nhealth and the rate of change of component-level degradation modes, some of\nwhich correlate strongly with cell lifetime. Using a newly generated dataset\nfrom 225 nickel-manganese-cobalt/graphite Li-ion cells aged under a wide range\nof conditions, we demonstrate a lifetime prediction of in-distribution cells\nwith 15.1% mean absolute percentage error using no more than the first 15% of\ndata, for most cells. Further testing using a hierarchical Bayesian regression\nmodel shows improved performance on extrapolation, achieving 21.8% mean\nabsolute percentage error for out-of-distribution cells. Our approach\nhighlights the importance of using domain knowledge of lithium-ion battery\ndegradation modes to inform feature engineering. 
Further, we provide the\ncommunity with a new publicly available battery aging dataset with cells cycled\nbeyond 80% of their rated capacity.\n","authors":["Tingkai Li","Zihao Zhou","Adam Thelen","David Howey","Chao Hu"],"pdf_url":"https://arxiv.org/pdf/2307.08382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11533v2","updated":"2023-07-17T10:02:55Z","published":"2022-09-23T11:43:27Z","title":"A Unified Perspective on Natural Gradient Variational Inference with\n Gaussian Mixture Models","summary":" Variational inference with Gaussian mixture models (GMMs) enables learning of\nhighly tractable yet multi-modal approximations of intractable target\ndistributions with up to a few hundred dimensions. The two currently most\neffective methods for GMM-based variational inference, VIPS and iBayes-GMM,\nboth employ independent natural gradient updates for the individual components\nand their weights. We show for the first time, that their derived updates are\nequivalent, although their practical implementations and theoretical guarantees\ndiffer. We identify several design choices that distinguish both approaches,\nnamely with respect to sample selection, natural gradient estimation, stepsize\nadaptation, and whether trust regions are enforced or the number of components\nadapted. We argue that for both approaches, the quality of the learned\napproximations can heavily suffer from the respective design choices: By\nupdating the individual components using samples from the mixture model,\niBayes-GMM often fails to produce meaningful updates to low-weight components,\nand by using a zero-order method for estimating the natural gradient, VIPS\nscales badly to higher-dimensional problems. Furthermore, we show that\ninformation-geometric trust-regions (used by VIPS) are effective even when\nusing first-order natural gradient estimates, and often outperform the improved\nBayesian learning rule (iBLR) update used by iBayes-GMM. We systematically\nevaluate the effects of design choices and show that a hybrid approach\nsignificantly outperforms both prior works. Along with this work, we publish\nour highly modular and efficient implementation for natural gradient\nvariational inference with Gaussian mixture models, which supports 432\ndifferent combinations of design choices, facilitates the reproduction of all\nour experiments, and may prove valuable for the practitioner.\n","authors":["Oleg Arenz","Philipp Dahlinger","Zihan Ye","Michael Volpp","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2209.11533v2.pdf","comment":"This version corresponds to the camera ready version published at\n Transactions of Machine Learning Research (TMLR).\n https://openreview.net/forum?id=tLBjsX4tjs"},{"id":"http://arxiv.org/abs/2307.08364v1","updated":"2023-07-17T10:02:01Z","published":"2023-07-17T10:02:01Z","title":"Q(D)O-ES: Population-based Quality (Diversity) Optimisation for Post Hoc\n Ensemble Selection in AutoML","summary":" Automated machine learning (AutoML) systems commonly ensemble models post hoc\nto improve predictive performance, typically via greedy ensemble selection\n(GES). However, we believe that GES may not always be optimal, as it performs a\nsimple deterministic greedy search. In this work, we introduce two novel\npopulation-based ensemble selection methods, QO-ES and QDO-ES, and compare them\nto GES. 
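For reference, the GES baseline is commonly implemented as Caruana-style greedy selection with replacement; the following generic sketch (the squared-error criterion, data shapes, and model names are assumptions, not the exact AutoML implementation) illustrates the simple deterministic search that the population-based methods replace.

import numpy as np

def greedy_ensemble_selection(val_preds, y_val, n_rounds=10):
    # Repeatedly add (with replacement) the model whose inclusion most improves the
    # averaged ensemble prediction on the validation set.
    ensemble, selected = None, []
    for _ in range(n_rounds):
        best_name, best_loss, best_candidate = None, np.inf, None
        for name, preds in val_preds.items():
            k = len(selected)
            candidate = preds if ensemble is None else (ensemble * k + preds) / (k + 1)
            loss = np.mean((candidate - y_val) ** 2)  # assumed squared-error criterion
            if loss < best_loss:
                best_name, best_loss, best_candidate = name, loss, candidate
        selected.append(best_name)
        ensemble = best_candidate
    return selected

rng = np.random.default_rng(0)
y_val = rng.integers(0, 2, size=100).astype(float)
val_preds = {'model_%d' % i: rng.random(100) for i in range(5)}
members = greedy_ensemble_selection(val_preds, y_val)  # list of selected model names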
While QO-ES optimises solely for predictive performance, QDO-ES also\nconsiders the diversity of ensembles within the population, maintaining a\ndiverse set of well-performing ensembles during optimisation based on ideas of\nquality diversity optimisation. The methods are evaluated using 71\nclassification datasets from the AutoML benchmark, demonstrating that QO-ES and\nQDO-ES often outrank GES, albeit only statistically significant on validation\ndata. Our results further suggest that diversity can be beneficial for post hoc\nensembling but also increases the risk of overfitting.\n","authors":["Lennart Purucker","Lennart Schneider","Marie Anastacio","Joeran Beel","Bernd Bischl","Holger Hoos"],"pdf_url":"https://arxiv.org/pdf/2307.08364v1.pdf","comment":"10 pages main paper, 24 pages references and appendix, 4 figures, 16\n subfigures, 13 tables, to be published in: International Conference on\n Automated Machine Learning 2023. arXiv admin note: text overlap with\n arXiv:2307.00286"},{"id":"http://arxiv.org/abs/2207.10603v2","updated":"2023-07-17T10:00:59Z","published":"2022-07-21T16:59:09Z","title":"Unsupervised pre-training of graph transformers on patient population\n graphs","summary":" Pre-training has shown success in different areas of machine learning, such\nas Computer Vision, Natural Language Processing (NLP), and medical imaging.\nHowever, it has not been fully explored for clinical data analysis. An immense\namount of clinical records are recorded, but still, data and labels can be\nscarce for data collected in small hospitals or dealing with rare diseases. In\nsuch scenarios, pre-training on a larger set of unlabelled clinical data could\nimprove performance. In this paper, we propose novel unsupervised pre-training\ntechniques designed for heterogeneous, multi-modal clinical data for patient\noutcome prediction inspired by masked language modeling (MLM), by leveraging\ngraph deep learning over population graphs. To this end, we further propose a\ngraph-transformer-based network, designed to handle heterogeneous clinical\ndata. By combining masking-based pre-training with a transformer-based network,\nwe translate the success of masking-based pre-training in other domains to\nheterogeneous clinical data. We show the benefit of our pre-training method in\na self-supervised and a transfer learning setting, utilizing three medical\ndatasets TADPOLE, MIMIC-III, and a Sepsis Prediction Dataset. We find that our\nproposed pre-training methods help in modeling the data at a patient and\npopulation level and improve performance in different fine-tuning tasks on all\ndatasets.\n","authors":["Chantal Pellegrini","Nassir Navab","Anees Kazi"],"pdf_url":"https://arxiv.org/pdf/2207.10603v2.pdf","comment":"accepted for publication at the Medical Image Analysis Journal:\n https://www.sciencedirect.com/science/article/abs/pii/S136184152300155X. 20\n pages, 3 figures, 20 tables"},{"id":"http://arxiv.org/abs/2307.08360v1","updated":"2023-07-17T09:55:35Z","published":"2023-07-17T09:55:35Z","title":"Universal Online Learning with Gradual Variations: A Multi-layer Online\n Ensemble Approach","summary":" In this paper, we propose an online convex optimization method with two\ndifferent levels of adaptivity. On a higher level, our method is agnostic to\nthe specific type and curvature of the loss functions, while at a lower level,\nit can exploit the niceness of the environments and attain problem-dependent\nguarantees. 
To be specific, we obtain $\\mathcal{O}(\\ln V_T)$, $\\mathcal{O}(d\n\\ln V_T)$ and $\\hat{\\mathcal{O}}(\\sqrt{V_T})$ regret bounds for strongly\nconvex, exp-concave and convex loss functions, respectively, where $d$ is the\ndimension, $V_T$ denotes problem-dependent gradient variations and\n$\\hat{\\mathcal{O}}(\\cdot)$-notation omits logarithmic factors on $V_T$. Our\nresult finds broad implications and applications. It not only safeguards the\nworst-case guarantees, but also implies the small-loss bounds in analysis\ndirectly. Besides, it draws deep connections with adversarial/stochastic convex\noptimization and game theory, further validating its practical potential. Our\nmethod is based on a multi-layer online ensemble incorporating novel\ningredients, including carefully-designed optimism for unifying diverse\nfunction types and cascaded corrections for algorithmic stability. Remarkably,\ndespite its multi-layer structure, our algorithm necessitates only one gradient\nquery per round, making it favorable when the gradient evaluation is\ntime-consuming. This is facilitated by a novel regret decomposition equipped\nwith customized surrogate losses.\n","authors":["Yu-Hu Yan","Peng Zhao","Zhi-Hua Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.08360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.12976v3","updated":"2023-07-17T09:55:07Z","published":"2021-08-30T03:32:16Z","title":"Approximating Pandora's Box with Correlations","summary":" We revisit the classic Pandora's Box (PB) problem under correlated\ndistributions on the box values. Recent work of arXiv:1911.01632 obtained\nconstant approximate algorithms for a restricted class of policies for the\nproblem that visit boxes in a fixed order. In this work, we study the\ncomplexity of approximating the optimal policy which may adaptively choose\nwhich box to visit next based on the values seen so far.\n Our main result establishes an approximation-preserving equivalence of PB to\nthe well studied Uniform Decision Tree (UDT) problem from stochastic\noptimization and a variant of the Min-Sum Set Cover ($\\text{MSSC}_f$) problem.\nFor distributions of support $m$, UDT admits a $\\log m$ approximation, and\nwhile a constant factor approximation in polynomial time is a long-standing\nopen problem, constant factor approximations are achievable in subexponential\ntime (arXiv:1906.11385). Our main result implies that the same properties hold\nfor PB and $\\text{MSSC}_f$.\n We also study the case where the distribution over values is given more\nsuccinctly as a mixture of $m$ product distributions. This problem is again\nrelated to a noisy variant of the Optimal Decision Tree which is significantly\nmore challenging. We give a constant-factor approximation that runs in time\n$n^{ \\tilde O( m^2/\\varepsilon^2 ) }$ when the mixture components on every box\nare either identical or separated in TV distance by $\\varepsilon$.\n","authors":["Shuchi Chawla","Evangelia Gergatsouli","Jeremy McMahan","Christos Tzamos"],"pdf_url":"https://arxiv.org/pdf/2108.12976v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08352v1","updated":"2023-07-17T09:43:50Z","published":"2023-07-17T09:43:50Z","title":"Zero-th Order Algorithm for Softmax Attention Optimization","summary":" Large language models (LLMs) have brought about significant transformations\nin human society. Among the crucial computations in LLMs, the softmax unit\nholds great importance. 
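As a minimal illustration of the softmax unit, the following NumPy sketch turns a vector of logits over a toy vocabulary into a next-token probability distribution; the vocabulary and logit values are made up for illustration.

import numpy as np

def softmax(logits):
    # Subtract the maximum for numerical stability before exponentiating.
    z = logits - np.max(logits)
    e = np.exp(z)
    return e / e.sum()

vocab = ['cat', 'dog', 'tree']                 # toy vocabulary of candidate next words
logits = np.array([2.0, 1.0, 0.1])             # scores produced by the model
probs = softmax(logits)                        # probability distribution over vocab
next_word = vocab[int(np.argmax(probs))]       # greedy choice of the most probable word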
The softmax unit helps the model generate a probability\ndistribution over potential subsequent words or phrases, given a series of\ninput words. By utilizing this distribution, the model selects the most\nprobable next word or phrase, based on the assigned probabilities. The softmax\nunit assumes a vital function in LLM training as it facilitates learning from\ndata through the adjustment of neural network weights and biases.\n As LLMs grow in size, computing the gradient becomes\nexpensive. However, the Zero-th Order method can approximately compute the gradient\nusing only forward passes. In this paper, we present a Zero-th Order algorithm\nspecifically tailored for Softmax optimization. We demonstrate the convergence\nof our algorithm, highlighting its effectiveness in efficiently computing\ngradients for large-scale LLMs. By leveraging the Zeroth-Order method, our work\ncontributes to the advancement of optimization techniques in the context of\ncomplex language models.\n","authors":["Yichuan Deng","Zhihang Li","Sridhar Mahadevan","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2307.08352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08347v1","updated":"2023-07-17T09:38:41Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\%. Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06608v2","updated":"2023-07-17T09:34:34Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. 
To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v2.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2307.08343v1","updated":"2023-07-17T09:31:26Z","published":"2023-07-17T09:31:26Z","title":"Gaussian processes for Bayesian inverse problems associated with linear\n partial differential equations","summary":" This work is concerned with the use of Gaussian surrogate models for Bayesian\ninverse problems associated with linear partial differential equations. A\nparticular focus is on the regime where only a small amount of training data is\navailable. In this regime the type of Gaussian prior used is of critical\nimportance with respect to how well the surrogate model will perform in terms\nof Bayesian inversion. We extend the framework of Raissi et. al. (2017) to\nconstruct PDE-informed Gaussian priors that we then use to construct different\napproximate posteriors. A number of different numerical experiments illustrate\nthe superiority of the PDE-informed Gaussian priors over more traditional\npriors.\n","authors":["Tianming Bai","Aretha L. Teckentrup","Konstantinos C. Zygalakis"],"pdf_url":"https://arxiv.org/pdf/2307.08343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12086v3","updated":"2023-07-17T09:24:31Z","published":"2022-05-24T14:07:13Z","title":"Information-Directed Selection for Top-Two Algorithms","summary":" We consider the best-k-arm identification problem for multi-armed bandits,\nwhere the objective is to select the exact set of k arms with the highest mean\nrewards by sequentially allocating measurement effort. We characterize the\nnecessary and sufficient conditions for the optimal allocation using dual\nvariables. Remarkably these optimality conditions lead to the extension of\ntop-two algorithm design principle (Russo, 2020), initially proposed for\nbest-arm identification. Furthermore, our optimality conditions induce a simple\nand effective selection rule dubbed information-directed selection (IDS) that\nselects one of the top-two candidates based on a measure of information gain.\nAs a theoretical guarantee, we prove that integrated with IDS, top-two Thompson\nsampling is (asymptotically) optimal for Gaussian best-arm identification,\nsolving a glaring open problem in the pure exploration literature (Russo,\n2020). As a by-product, we show that for k > 1, top-two algorithms cannot\nachieve optimality even when the algorithm has access to the unknown \"optimal\"\ntuning parameter. 
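For intuition, the classic top-two Thompson sampling step that IDS augments can be sketched as follows; this is a generic Gaussian-bandit sketch with a fixed coin-flip parameter beta, and the information-gain score that IDS substitutes for that flip is not reproduced here.

import numpy as np

def top_two_thompson_step(means, counts, sigma=1.0, beta=0.5, rng=np.random):
    # Posterior over each arm's mean is approximated as N(empirical mean, sigma^2 / n).
    scale = sigma / np.sqrt(np.maximum(counts, 1))
    leader = int(np.argmax(rng.normal(means, scale)))
    challenger = leader
    while challenger == leader:
        # Resample until a different arm comes out on top: the second candidate.
        challenger = int(np.argmax(rng.normal(means, scale)))
    # The classic rule flips a beta-weighted coin between the two candidates;
    # information-directed selection would replace this flip with an information-gain score.
    return leader if rng.random() < beta else challenger

means = np.array([0.10, 0.20, 0.15])   # empirical means of three arms
counts = np.array([10, 10, 10])        # number of pulls so far
arm_to_pull = top_two_thompson_step(means, counts)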
Numerical experiments show the superior performance of the\nproposed top-two algorithms with IDS and considerable improvement compared with\nalgorithms without adaptive selection.\n","authors":["Wei You","Chao Qin","Zihao Wang","Shuoguang Yang"],"pdf_url":"https://arxiv.org/pdf/2205.12086v3.pdf","comment":"Accepted for presentation at the Conference on Learning Theory (COLT)\n 2023"},{"id":"http://arxiv.org/abs/2307.08336v1","updated":"2023-07-17T09:12:05Z","published":"2023-07-17T09:12:05Z","title":"RAYEN: Imposition of Hard Convex Constraints on Neural Networks","summary":" This paper presents RAYEN, a framework to impose hard convex constraints on\nthe output or latent variable of a neural network. RAYEN guarantees that, for\nany input or any weights of the network, the constraints are satisfied at all\ntimes. Compared to other approaches, RAYEN does not perform a\ncomputationally-expensive orthogonal projection step onto the feasible set,\ndoes not rely on soft constraints (which do not guarantee the satisfaction of\nthe constraints at test time), does not use conservative approximations of the\nfeasible set, and does not perform a potentially slow inner gradient descent\ncorrection to enforce the constraints. RAYEN supports any combination of\nlinear, convex quadratic, second-order cone (SOC), and linear matrix inequality\n(LMI) constraints, achieving a very small computational overhead compared to\nunconstrained networks. For example, it is able to impose 1K quadratic\nconstraints on a 1K-dimensional variable with an overhead of less than 8 ms,\nand an LMI constraint with 300x300 dense matrices on a 10K-dimensional variable\nin less than 12 ms. When used in neural networks that approximate the solution\nof constrained optimization problems, RAYEN achieves computation times between\n20 and 7468 times faster than state-of-the-art algorithms, while guaranteeing\nthe satisfaction of the constraints at all times and obtaining a cost very\nclose to the optimal one.\n","authors":["Jesus Tordesillas","Jonathan P. How","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2307.08336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01947v2","updated":"2023-07-17T09:06:53Z","published":"2023-01-05T08:02:30Z","title":"StitchNet: Composing Neural Networks from Pre-Trained Fragments","summary":" We propose StitchNet, a novel neural network creation paradigm that stitches\ntogether fragments (one or more consecutive network layers) from multiple\npre-trained neural networks. StitchNet allows the creation of high-performing\nneural networks without the large compute and data requirements needed under\ntraditional model creation processes via backpropagation training. We leverage\nCentered Kernel Alignment (CKA) as a compatibility measure to efficiently guide\nthe selection of these fragments in composing a network for a given task\ntailored to specific accuracy needs and computing resource constraints. We then\nshow that these fragments can be stitched together to create neural networks\nwith comparable accuracy to traditionally trained networks at a fraction of\ncomputing resource and data requirements. Finally, we explore a novel\non-the-fly personalized model creation and inference application enabled by\nthis new paradigm.\n","authors":["Surat Teerapittayanon","Marcus Comiter","Brad McDanel","H. T. 
Kung"],"pdf_url":"https://arxiv.org/pdf/2301.01947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08327v1","updated":"2023-07-17T08:50:36Z","published":"2023-07-17T08:50:36Z","title":"Analyzing the Impact of Adversarial Examples on Explainable Machine\n Learning","summary":" Adversarial attacks are a type of attack on machine learning models where an\nattacker deliberately modifies the inputs to cause the model to make incorrect\npredictions. Adversarial attacks can have serious consequences, particularly in\napplications such as autonomous vehicles, medical diagnosis, and security\nsystems. Work on the vulnerability of deep learning models to adversarial\nattacks has shown that it is very easy to make samples that make a model\npredict things that it doesn't want to. In this work, we analyze the impact of\nmodel interpretability due to adversarial attacks on text classification\nproblems. We develop an ML-based classification model for text data. Then, we\nintroduce the adversarial perturbations on the text data to understand the\nclassification performance after the attack. Subsequently, we analyze and\ninterpret the model's explainability before and after the attack\n","authors":["Prathyusha Devabhakthini","Sasmita Parida","Raj Mani Shukla","Suvendu Chandan Nayak"],"pdf_url":"https://arxiv.org/pdf/2307.08327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08324v1","updated":"2023-07-17T08:42:21Z","published":"2023-07-17T08:42:21Z","title":"A Secure Aggregation for Federated Learning on Long-Tailed Data","summary":" As a distributed learning, Federated Learning (FL) faces two challenges: the\nunbalanced distribution of training data among participants, and the model\nattack by Byzantine nodes. In this paper, we consider the long-tailed\ndistribution with the presence of Byzantine nodes in the FL scenario. A novel\ntwo-layer aggregation method is proposed for the rejection of malicious models\nand the advisable selection of valuable models containing tail class data\ninformation. We introduce the concept of think tank to leverage the wisdom of\nall participants. Preliminary experiments validate that the think tank can make\neffective model selections for global aggregation.\n","authors":["Yanna Jiang","Baihe Ma","Xu Wang","Guangsheng Yu","Caijun Sun","Wei Ni","Ren Ping Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03093v2","updated":"2023-07-17T08:40:34Z","published":"2023-07-06T16:08:47Z","title":"Beyond Intuition, a Framework for Applying GPs to Real-World Data","summary":" Gaussian Processes (GPs) offer an attractive method for regression over\nsmall, structured and correlated datasets. However, their deployment is\nhindered by computational costs and limited guidelines on how to apply GPs\nbeyond simple low-dimensional datasets. We propose a framework to identify the\nsuitability of GPs to a given problem and how to set up a robust and\nwell-specified GP model. The guidelines formalise the decisions of experienced\nGP practitioners, with an emphasis on kernel design and options for\ncomputational scalability. The framework is then applied to a case study of\nglacier elevation change yielding more accurate results at test time.\n","authors":["Kenza Tazi","Jihao Andreas Lin","Ross Viljoen","Alex Gardner","ST John","Hong Ge","Richard E. 
Turner"],"pdf_url":"https://arxiv.org/pdf/2307.03093v2.pdf","comment":"Accepted at the ICML Workshop on Structured Probabilistic Inference\n and Generative Modelling (2023)"},{"id":"http://arxiv.org/abs/2307.01066v2","updated":"2023-07-17T08:30:38Z","published":"2023-07-03T14:46:49Z","title":"PIGNet2: A Versatile Deep Learning-based Protein-Ligand Interaction\n Prediction Model for Binding Affinity Scoring and Virtual Screening","summary":" Prediction of protein-ligand interactions (PLI) plays a crucial role in drug\ndiscovery as it guides the identification and optimization of molecules that\neffectively bind to target proteins. Despite remarkable advances in deep\nlearning-based PLI prediction, the development of a versatile model capable of\naccurately scoring binding affinity and conducting efficient virtual screening\nremains a challenge. The main obstacle in achieving this lies in the scarcity\nof experimental structure-affinity data, which limits the generalization\nability of existing models. Here, we propose a viable solution to address this\nchallenge by introducing a novel data augmentation strategy combined with a\nphysics-informed graph neural network. The model showed significant\nimprovements in both scoring and screening, outperforming task-specific deep\nlearning models in various tests including derivative benchmarks, and notably\nachieving results comparable to the state-of-the-art performance based on\ndistance likelihood learning. This demonstrates the potential of this approach\nto drug discovery.\n","authors":["Seokhyun Moon","Sang-Yeon Hwang","Jaechang Lim","Woo Youn Kim"],"pdf_url":"https://arxiv.org/pdf/2307.01066v2.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.18465v2","updated":"2023-07-17T08:28:34Z","published":"2023-05-29T07:54:22Z","title":"Federated Learning of Gboard Language Models with Differential Privacy","summary":" We train language models (LMs) with federated learning (FL) and differential\nprivacy (DP) in the Google Keyboard (Gboard). We apply the\nDP-Follow-the-Regularized-Leader (DP-FTRL)~\\citep{kairouz21b} algorithm to\nachieve meaningfully formal DP guarantees without requiring uniform sampling of\nclient devices. To provide favorable privacy-utility trade-offs, we introduce a\nnew client participation criterion and discuss the implication of its\nconfiguration in large scale systems. We show how quantile-based clip\nestimation~\\citep{andrew2019differentially} can be combined with DP-FTRL to\nadaptively choose the clip norm during training or reduce the hyperparameter\ntuning in preparation for training. With the help of pretraining on public\ndata, we train and deploy more than twenty Gboard LMs that achieve high utility\nand $\\rho-$zCDP privacy guarantees with $\\rho \\in (0.2, 2)$, with two models\nadditionally trained with secure aggregation~\\citep{bonawitz2017practical}. We\nare happy to announce that all the next word prediction neural network LMs in\nGboard now have DP guarantees, and all future launches of Gboard neural network\nLMs will require DP guarantees. We summarize our experience and provide\nconcrete suggestions on DP training for practitioners.\n","authors":["Zheng Xu","Yanxiang Zhang","Galen Andrew","Christopher A. Choquette-Choo","Peter Kairouz","H. 
Brendan McMahan","Jesse Rosenstock","Yuanbo Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.18465v2.pdf","comment":"ACL industry track; v2 updating SecAgg details"},{"id":"http://arxiv.org/abs/2307.08318v1","updated":"2023-07-17T08:26:36Z","published":"2023-07-17T08:26:36Z","title":"Airway Label Prediction in Video Bronchoscopy: Capturing Temporal\n Dependencies Utilizing Anatomical Knowledge","summary":" Purpose: Navigation guidance is a key requirement for a multitude of lung\ninterventions using video bronchoscopy. State-of-the-art solutions focus on\nlung biopsies using electromagnetic tracking and intraoperative image\nregistration w.r.t. preoperative CT scans for guidance. The requirement of\npatient-specific CT scans hampers the utilisation of navigation guidance for\nother applications such as intensive care units.\n Methods: This paper addresses navigation guidance solely incorporating\nbronchosopy video data. In contrast to state-of-the-art approaches we entirely\nomit the use of electromagnetic tracking and patient-specific CT scans.\nGuidance is enabled by means of topological bronchoscope localization w.r.t. an\ninterpatient airway model. Particularly, we take maximally advantage of\nanatomical constraints of airway trees being sequentially traversed. This is\nrealized by incorporating sequences of CNN-based airway likelihoods into a\nHidden Markov Model.\n Results: Our approach is evaluated based on multiple experiments inside a\nlung phantom model. With the consideration of temporal context and use of\nanatomical knowledge for regularization, we are able to improve the accuracy up\nto to 0.98 compared to 0.81 (weighted F1: 0.98 compared to 0.81) for a\nclassification based on individual frames.\n Conclusion: We combine CNN-based single image classification of airway\nsegments with anatomical constraints and temporal HMM-based inference for the\nfirst time. Our approach renders vision-only guidance for bronchoscopy\ninterventions in the absence of electromagnetic tracking and patient-specific\nCT scans possible.\n","authors":["Ron Keuth","Mattias Heinrich","Martin Eichenlaub","Marian Himstedt"],"pdf_url":"https://arxiv.org/pdf/2307.08318v1.pdf","comment":"Submitted to International Journal of Computer Assisted Radiology and\n Surgery"},{"id":"http://arxiv.org/abs/2307.08302v1","updated":"2023-07-17T07:55:21Z","published":"2023-07-17T07:55:21Z","title":"GBT: Two-stage transformer framework for non-stationary time series\n forecasting","summary":" This paper shows that time series forecasting Transformer (TSFT) suffers from\nsevere over-fitting problem caused by improper initialization method of unknown\ndecoder inputs, esp. when handling non-stationary time series. Based on this\nobservation, we propose GBT, a novel two-stage Transformer framework with Good\nBeginning. It decouples the prediction process of TSFT into two stages,\nincluding Auto-Regression stage and Self-Regression stage to tackle the problem\nof different statistical properties between input and prediction\nsequences.Prediction results of Auto-Regression stage serve as a Good\nBeginning, i.e., a better initialization for inputs of Self-Regression stage.\nWe also propose Error Score Modification module to further enhance the\nforecasting capability of the Self-Regression stage in GBT. Extensive\nexperiments on seven benchmark datasets demonstrate that GBT outperforms SOTA\nTSFTs (FEDformer, Pyraformer, ETSformer, etc.) and many other forecasting\nmodels (SCINet, N-HiTS, etc.) 
with only canonical attention and convolution\nwhile owning less time and space complexity. It is also general enough to\ncouple with these models to strengthen their forecasting capability. The source\ncode is available at: https://github.com/OrigamiSL/GBT\n","authors":["Li Shen","Yuning Wei","Yangzhu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08302v1.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2303.10840v2","updated":"2023-07-17T07:24:30Z","published":"2023-03-20T03:08:22Z","title":"Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for\n Multi-View Reconstruction with Reflection","summary":" Neural implicit surface learning has shown significant progress in multi-view\n3D reconstruction, where an object is represented by multilayer perceptrons\nthat provide continuous implicit surface representation and view-dependent\nradiance. However, current methods often fail to accurately reconstruct\nreflective surfaces, leading to severe ambiguity. To overcome this issue, we\npropose Ref-NeuS, which aims to reduce ambiguity by attenuating the effect of\nreflective surfaces. Specifically, we utilize an anomaly detector to estimate\nan explicit reflection score with the guidance of multi-view context to\nlocalize reflective surfaces. Afterward, we design a reflection-aware\nphotometric loss that adaptively reduces ambiguity by modeling rendered color\nas a Gaussian distribution, with the reflection score representing the\nvariance. We show that together with a reflection direction-dependent radiance,\nour model achieves high-quality surface reconstruction on reflective surfaces\nand outperforms the state-of-the-arts by a large margin. Besides, our model is\nalso comparable on general surfaces.\n","authors":["Wenhang Ge","Tao Hu","Haoyu Zhao","Shu Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2303.10840v2.pdf","comment":"ICCV 2023, Project webpage: https://g3956.github.io/"},{"id":"http://arxiv.org/abs/2307.08288v1","updated":"2023-07-17T07:19:17Z","published":"2023-07-17T07:19:17Z","title":"Systematic Testing of the Data-Poisoning Robustness of KNN","summary":" Data poisoning aims to compromise a machine learning based software component\nby contaminating its training set to change its prediction results for test\ninputs. Existing methods for deciding data-poisoning robustness have either\npoor accuracy or long running time and, more importantly, they can only certify\nsome of the truly-robust cases, but remain inconclusive when certification\nfails. In other words, they cannot falsify the truly-non-robust cases. To\novercome this limitation, we propose a systematic testing based method, which\ncan falsify as well as certify data-poisoning robustness for a widely used\nsupervised-learning technique named k-nearest neighbors (KNN). Our method is\nfaster and more accurate than the baseline enumeration method, due to a novel\nover-approximate analysis in the abstract domain, to quickly narrow down the\nsearch space, and systematic testing in the concrete domain, to find the actual\nviolations. We have evaluated our method on a set of supervised-learning\ndatasets. 
Our results show that the method significantly outperforms\nstate-of-the-art techniques, and can decide data-poisoning robustness of KNN\nprediction results for most of the test inputs.\n","authors":["Yannan Li","Jingbo Wang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08286v1","updated":"2023-07-17T07:16:28Z","published":"2023-07-17T07:16:28Z","title":"Going Beyond Linear Mode Connectivity: The Layerwise Linear Feature\n Connectivity","summary":" Recent work has revealed many intriguing empirical phenomena in neural\nnetwork training, despite the poorly understood and highly complex loss\nlandscapes and training dynamics. One of these phenomena, Linear Mode\nConnectivity (LMC), has gained considerable attention due to the intriguing\nobservation that different solutions can be connected by a linear path in the\nparameter space while maintaining near-constant training and test losses. In\nthis work, we introduce a stronger notion of linear connectivity, Layerwise\nLinear Feature Connectivity (LLFC), which says that the feature maps of every\nlayer in different trained networks are also linearly connected. We provide\ncomprehensive empirical evidence for LLFC across a wide range of settings,\ndemonstrating that whenever two trained networks satisfy LMC (via either\nspawning or permutation methods), they also satisfy LLFC in nearly all the\nlayers. Furthermore, we delve deeper into the underlying factors contributing\nto LLFC, which reveal new insights into the spawning and permutation\napproaches. The study of LLFC transcends and advances our understanding of LMC\nby adopting a feature-learning perspective.\n","authors":["Zhanpeng Zhou","Yongyi Yang","Xiaojiang Yang","Junchi Yan","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2307.08286v1.pdf","comment":"25 pages, 23 figures"},{"id":"http://arxiv.org/abs/2307.08283v1","updated":"2023-07-17T07:12:29Z","published":"2023-07-17T07:12:29Z","title":"Complexity Matters: Rethinking the Latent Space for Generative Modeling","summary":" In generative modeling, numerous successful approaches leverage a\nlow-dimensional latent space, e.g., Stable Diffusion models the latent space\ninduced by an encoder and generates images through a paired decoder. Although\nthe selection of the latent space is empirically pivotal, determining the\noptimal choice and the process of identifying it remain unclear. In this study,\nwe aim to shed light on this under-explored topic by rethinking the latent\nspace from the perspective of model complexity. Our investigation starts with\nthe classic generative adversarial networks (GANs). Inspired by the GAN\ntraining objective, we propose a novel \"distance\" between the latent and data\ndistributions, whose minimization coincides with that of the generator\ncomplexity. The minimizer of this distance is characterized as the optimal\ndata-dependent latent that most effectively capitalizes on the generator's\ncapacity. Then, we consider parameterizing such a latent distribution by an\nencoder network and propose a two-stage training strategy called Decoupled\nAutoencoder (DAE), where the encoder is only updated in the first stage with an\nauxiliary decoder and then frozen in the second stage while the actual decoder\nis being trained. DAE can improve the latent distribution and as a result,\nimprove the generative performance. 
Our theoretical analyses are corroborated\nby comprehensive experiments on various models such as VQGAN and Diffusion\nTransformer, where our modifications yield significant improvements in sample\nquality with decreased model complexity.\n","authors":["Tianyang Hu","Fei Chen","Haonan Wang","Jiawei Li","Wenjia Wang","Jiacheng Sun","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2307.08283v1.pdf","comment":"TL;DR: This work characterizes the optimal latent distribution for\n generative models from the perspective of minimizing model complexity and\n proposes a two-stage training scheme that achieves practical improvements on\n GAN, VQGAN and DiT"},{"id":"http://arxiv.org/abs/2205.01970v6","updated":"2023-07-17T07:02:03Z","published":"2022-05-04T09:37:16Z","title":"Non-Stationary Bandit Learning via Predictive Sampling","summary":" Thompson sampling has proven effective across a wide range of stationary\nbandit environments. However, as we demonstrate in this paper, it can perform\npoorly when applied to non-stationary environments. We attribute such failures\nto the fact that, when exploring, the algorithm does not differentiate actions\nbased on how quickly the information acquired loses its usefulness due to\nnon-stationarity. Building upon this insight, we propose predictive sampling,\nan algorithm that deprioritizes acquiring information that quickly loses\nusefulness. A theoretical guarantee on the performance of predictive sampling\nis established through a Bayesian regret bound. We provide versions of\npredictive sampling for which computations tractably scale to complex bandit\nenvironments of practical interest. Through numerical simulations, we\ndemonstrate that predictive sampling outperforms Thompson sampling in all\nnon-stationary environments examined.\n","authors":["Yueyang Liu","Xu Kuang","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2205.01970v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08278v1","updated":"2023-07-17T06:58:22Z","published":"2023-07-17T06:58:22Z","title":"Adversarial Attacks on Traffic Sign Recognition: A Survey","summary":" Traffic sign recognition is an essential component of perception in\nautonomous vehicles, which is currently performed almost exclusively with deep\nneural networks (DNNs). However, DNNs are known to be vulnerable to adversarial\nattacks. Several previous works have demonstrated the feasibility of\nadversarial attacks on traffic sign recognition models. Traffic signs are\nparticularly promising for adversarial attack research due to the ease of\nperforming real-world attacks using printed signs or stickers. In this work, we\nsurvey existing works performing either digital or real-world attacks on\ntraffic sign detection and classification models. We provide an overview of the\nlatest advancements and highlight the existing research areas that require\nfurther investigation.\n","authors":["Svetlana Pavlitska","Nico Lambing","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2307.08278v1.pdf","comment":"Accepted for publication at ICECCME2023"},{"id":"http://arxiv.org/abs/2305.12695v2","updated":"2023-07-17T06:38:00Z","published":"2023-05-22T04:07:36Z","title":"Systematic Literature Review on Application of Machine Learning in\n Continuous Integration","summary":" This research conducted a systematic review of the literature on machine\nlearning (ML)-based methods in the context of Continuous Integration (CI) over\nthe past 22 years. 
The study aimed to identify and describe the techniques used\nin ML-based solutions for CI and analyzed various aspects such as data\nengineering, feature engineering, hyper-parameter tuning, ML models, evaluation\nmethods, and metrics. In this paper, we have depicted the phases of CI testing,\nthe connection between them, and the employed techniques in training the ML\nmethod phases. We presented nine types of data sources and four taken steps in\nthe selected studies for preparing the data. Also, we identified four feature\ntypes and nine subsets of data features through thematic analysis of the\nselected studies. Besides, five methods for selecting and tuning the\nhyper-parameters are shown. In addition, we summarised the evaluation methods\nused in the literature and identified fifteen different metrics. The most\ncommonly used evaluation methods were found to be precision, recall, and\nF1-score, and we have also identified five methods for evaluating the\nperformance of trained ML models. Finally, we have presented the relationship\nbetween ML model types, performance measurements, and CI phases. The study\nprovides valuable insights for researchers and practitioners interested in\nML-based methods in CI and emphasizes the need for further research in this\narea.\n","authors":["Ali Kazemi Arani","Triet Huynh Minh Le","Mansooreh Zahedi","Muhammad Ali Babar"],"pdf_url":"https://arxiv.org/pdf/2305.12695v2.pdf","comment":"This paper got a rejection and we need to address the comments and\n upload the new version with new results"},{"id":"http://arxiv.org/abs/2207.06767v2","updated":"2023-07-17T06:11:59Z","published":"2022-07-14T09:24:55Z","title":"Semi-supervised cross-lingual speech emotion recognition","summary":" Performance in Speech Emotion Recognition (SER) on a single language has\nincreased greatly in the last few years thanks to the use of deep learning\ntechniques. However, cross-lingual SER remains a challenge in real-world\napplications due to two main factors: the first is the big gap among the source\nand the target domain distributions; the second factor is the major\navailability of unlabeled utterances in contrast to the labeled ones for the\nnew language. Taking into account previous aspects, we propose a\nSemi-Supervised Learning (SSL) method for cross-lingual emotion recognition\nwhen only few labeled examples in the target domain (i.e. the new language) are\navailable. Our method is based on a Transformer and it adapts to the new domain\nby exploiting a pseudo-labeling strategy on the unlabeled utterances. In\nparticular, the use of a hard and soft pseudo-labels approach is investigated.\nWe thoroughly evaluate the performance of the proposed method in a\nspeaker-independent setup on both the source and the new language and show its\nrobustness across five languages belonging to different linguistic strains. 
The\nexperimental findings indicate that the unweighted accuracy is increased by an\naverage of 40% compared to state-of-the-art methods.\n","authors":["Mirko Agarla","Simone Bianco","Luigi Celona","Paolo Napoletano","Alexey Petrovsky","Flavio Piccoli","Raimondo Schettini","Ivan Shanin"],"pdf_url":"https://arxiv.org/pdf/2207.06767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03323v3","updated":"2023-07-17T06:03:16Z","published":"2023-03-06T17:48:32Z","title":"CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive\n Learning","summary":" Multimodal contrastive pretraining has been used to train multimodal\nrepresentation models, such as CLIP, on large amounts of paired image-text\ndata. However, previous studies have revealed that such models are vulnerable\nto backdoor attacks. Specifically, when trained on backdoored examples, CLIP\nlearns spurious correlations between the embedded backdoor trigger and the\ntarget label, aligning their representations in the joint embedding space.\nInjecting even a small number of poisoned examples, such as 75 examples in 3\nmillion pretraining data, can significantly manipulate the model's behavior,\nmaking it difficult to detect or unlearn such correlations. To address this\nissue, we propose CleanCLIP, a finetuning framework that weakens the learned\nspurious associations introduced by backdoor attacks by independently\nre-aligning the representations for individual modalities. We demonstrate that\nunsupervised finetuning using a combination of multimodal contrastive and\nunimodal self-supervised objectives for individual modalities can significantly\nreduce the impact of the backdoor attack. Additionally, we show that supervised\nfinetuning on task-specific labeled image data removes the backdoor trigger\nfrom the CLIP vision encoder. We show empirically that CleanCLIP maintains\nmodel performance on benign examples while erasing a range of backdoor attacks\non multimodal contrastive learning. The code and checkpoints are available at\nhttps://github.com/nishadsinghi/CleanCLIP.\n","authors":["Hritik Bansal","Nishad Singhi","Yu Yang","Fan Yin","Aditya Grover","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2303.03323v3.pdf","comment":"22 pages. Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.04368v2","updated":"2023-07-17T05:34:47Z","published":"2023-07-10T06:49:18Z","title":"ECS -- an Interactive Tool for Data Quality Assurance","summary":" With the increasing capabilities of machine learning systems and their\npotential use in safety-critical systems, ensuring high-quality data is\nbecoming increasingly important. In this paper we present a novel approach for\nthe assurance of data quality. For this purpose, the mathematical basics are\nfirst discussed and the approach is presented using multiple examples. This\nresults in the detection of data points with potentially harmful properties for\nthe use in safety-critical systems.\n","authors":["Christian Sieberichs","Simon Geerkens","Alexander Braun","Thomas Waschulzik"],"pdf_url":"https://arxiv.org/pdf/2307.04368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09862v2","updated":"2023-07-17T05:18:21Z","published":"2023-06-16T14:18:32Z","title":"DoubleAdapt: A Meta-learning Approach to Incremental Learning for Stock\n Trend Forecasting","summary":" Stock trend forecasting is a fundamental task of quantitative investment\nwhere precise predictions of price trends are indispensable. As an online\nservice, stock data continuously arrive over time. 
It is practical and\nefficient to incrementally update the forecast model with the latest data which\nmay reveal some new patterns recurring in the future stock market. However,\nincremental learning for stock trend forecasting still remains under-explored\ndue to the challenge of distribution shifts (a.k.a. concept drifts). With the\nstock market dynamically evolving, the distribution of future data can slightly\nor significantly differ from incremental data, hindering the effectiveness of\nincremental updates. To address this challenge, we propose DoubleAdapt, an\nend-to-end framework with two adapters, which can effectively adapt the data\nand the model to mitigate the effects of distribution shifts. Our key insight\nis to automatically learn how to adapt stock data into a locally stationary\ndistribution in favor of profitable updates. Complemented by data adaptation,\nwe can confidently adapt the model parameters under mitigated distribution\nshifts. We cast each incremental learning task as a meta-learning task and\nautomatically optimize the adapters for desirable data adaptation and parameter\ninitialization. Experiments on real-world stock datasets demonstrate that\nDoubleAdapt achieves state-of-the-art predictive performance and shows\nconsiderable efficiency.\n","authors":["Lifan Zhao","Shuming Kong","Yanyan Shen"],"pdf_url":"https://arxiv.org/pdf/2306.09862v2.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2005.11018v3","updated":"2023-07-17T05:15:23Z","published":"2020-05-22T06:05:00Z","title":"Semi-Supervised Learning: the Case When Unlabeled Data is Equally Useful","summary":" Semi-supervised learning algorithms attempt to take advantage of relatively\ninexpensive unlabeled data to improve learning performance. In this work, we\nconsider statistical models where the data distributions can be characterized\nby continuous parameters. We show that under certain conditions on the\ndistribution, unlabeled data is equally useful as labeled data in terms of\nlearning rate. Specifically, let $n, m$ be the number of labeled and unlabeled\ndata, respectively. It is shown that the learning rate of semi-supervised\nlearning scales as $O(1/n)$ if $m\\sim n$, and scales as $O(1/n^{1+\\gamma})$ if\n$m\\sim n^{1+\\gamma}$ for some $\\gamma>0$, whereas the learning rate of\nsupervised learning scales as $O(1/n)$.\n","authors":["Jingge Zhu"],"pdf_url":"https://arxiv.org/pdf/2005.11018v3.pdf","comment":"Published in UAI 2020. This version: an error in Lemma 2 is corrected"},{"id":"http://arxiv.org/abs/2307.08245v1","updated":"2023-07-17T05:03:53Z","published":"2023-07-17T05:03:53Z","title":"Convex Bi-Level Optimization Problems with Non-smooth Outer Objective\n Function","summary":" In this paper, we propose the Bi-Sub-Gradient (Bi-SG) method, which is a\ngeneralization of the classical sub-gradient method to the setting of convex\nbi-level optimization problems. This is a first-order method that is very easy\nto implement in the sense that it requires only a computation of the associated\nproximal mapping or a sub-gradient of the outer non-smooth objective function,\nin addition to a proximal gradient step on the inner optimization problem. We\nshow, under very mild assumptions, that Bi-SG tackles bi-level optimization\nproblems and achieves sub-linear rates both in terms of the inner and outer\nobjective functions. Moreover, if the outer objective function is additionally\nstrongly convex (still could be non-smooth), the outer rate can be improved to\na linear rate. 
Last, we prove that the distance of the generated sequence to\nthe set of optimal solutions of the bi-level problem converges to zero.\n","authors":["Roey Merchav","Shoham Sabach"],"pdf_url":"https://arxiv.org/pdf/2307.08245v1.pdf","comment":"Accepted for publication In SIAM journal on Optimization"},{"id":"http://arxiv.org/abs/2307.08237v1","updated":"2023-07-17T04:38:51Z","published":"2023-07-17T04:38:51Z","title":"A Look into Causal Effects under Entangled Treatment in Graphs:\n Investigating the Impact of Contact on MRSA Infection","summary":" Methicillin-resistant Staphylococcus aureus (MRSA) is a type of bacteria\nresistant to certain antibiotics, making it difficult to prevent MRSA\ninfections. Among decades of efforts to conquer infectious diseases caused by\nMRSA, many studies have been proposed to estimate the causal effects of close\ncontact (treatment) on MRSA infection (outcome) from observational data. In\nthis problem, the treatment assignment mechanism plays a key role as it\ndetermines the patterns of missing counterfactuals -- the fundamental challenge\nof causal effect estimation. Most existing observational studies for causal\neffect learning assume that the treatment is assigned individually for each\nunit. However, on many occasions, the treatments are pairwisely assigned for\nunits that are connected in graphs, i.e., the treatments of different units are\nentangled. Neglecting the entangled treatments can impede the causal effect\nestimation. In this paper, we study the problem of causal effect estimation\nwith treatment entangled in a graph. Despite a few explorations for entangled\ntreatments, this problem still remains challenging due to the following\nchallenges: (1) the entanglement brings difficulties in modeling and leveraging\nthe unknown treatment assignment mechanism; (2) there may exist hidden\nconfounders which lead to confounding biases in causal effect estimation; (3)\nthe observational data is often time-varying. To tackle these challenges, we\npropose a novel method NEAT, which explicitly leverages the graph structure to\nmodel the treatment assignment mechanism, and mitigates confounding biases\nbased on the treatment assignment modeling. We also extend our method into a\ndynamic setting to handle time-varying observational data. Experiments on both\nsynthetic datasets and a real-world MRSA dataset validate the effectiveness of\nthe proposed method, and provide insights for future applications.\n","authors":["Jing Ma","Chen Chen","Anil Vullikanti","Ritwick Mishra","Gregory Madden","Daniel Borrajo","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08235v1","updated":"2023-07-17T04:32:45Z","published":"2023-07-17T04:32:45Z","title":"HeroLT: Benchmarking Heterogeneous Long-Tailed Learning","summary":" Long-tailed data distributions are prevalent in a variety of domains,\nincluding finance, e-commerce, biomedical science, and cyber security. In such\nscenarios, the performance of machine learning models is often dominated by the\nhead categories, while the learning of tail categories is significantly\ninadequate. Given abundant studies conducted to alleviate the issue, this work\naims to provide a systematic view of long-tailed learning with regard to three\npivotal angles: (A1) the characterization of data long-tailedness, (A2) the\ndata complexity of various domains, and (A3) the heterogeneity of emerging\ntasks. 
To achieve this, we develop the most comprehensive (to the best of our\nknowledge) long-tailed learning benchmark named HeroLT, which integrates 13\nstate-of-the-art algorithms and 6 evaluation metrics on 14 real-world benchmark\ndatasets across 4 tasks from 3 domains. HeroLT with novel angles and extensive\nexperiments (264 in total) enables researchers and practitioners to effectively\nand fairly evaluate newly proposed methods compared with existing baselines on\nvarying types of datasets. Finally, we conclude by highlighting the significant\napplications of long-tailed learning and identifying several promising future\ndirections. For accessibility and reproducibility, we open-source our benchmark\nHeroLT and corresponding results at https://github.com/SSSKJ/HeroLT.\n","authors":["Haohui Wang","Weijie Guan","Jianpeng Chen","Zi Wang","Dawei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.08235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08232v1","updated":"2023-07-17T04:08:29Z","published":"2023-07-17T04:08:29Z","title":"Learning for Counterfactual Fairness from Observational Data","summary":" Fairness-aware machine learning has attracted a surge of attention in many\ndomains, such as online advertising, personalized recommendation, and social\nmedia analysis in web applications. Fairness-aware machine learning aims to\neliminate biases of learning models against certain subgroups described by\ncertain protected (sensitive) attributes such as race, gender, and age. Among\nmany existing fairness notions, counterfactual fairness is a popular notion\ndefined from a causal perspective. It measures the fairness of a predictor by\ncomparing the prediction of each individual in the original world and that in\nthe counterfactual worlds in which the value of the sensitive attribute is\nmodified. A prerequisite for existing methods to achieve counterfactual\nfairness is the prior human knowledge of the causal model for the data.\nHowever, in real-world scenarios, the underlying causal model is often unknown,\nand acquiring such human knowledge could be very difficult. In these scenarios,\nit is risky to directly trust the causal models obtained from information\nsources with unknown reliability and even causal discovery methods, as\nincorrect causal models can consequently bring biases to the predictor and lead\nto unfair predictions. In this work, we address the problem of counterfactually\nfair prediction from observational data without given causal models by\nproposing a novel framework CLAIRE. Specifically, under certain general\nassumptions, CLAIRE effectively mitigates the biases from the sensitive\nattribute with a representation learning framework based on counterfactual data\naugmentation and an invariant penalty. 
Experiments conducted on both synthetic\nand real-world datasets validate the superiority of CLAIRE in both\ncounterfactual fairness and prediction performance.\n","authors":["Jing Ma","Ruocheng Guo","Aidong Zhang","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08226v1","updated":"2023-07-17T04:01:48Z","published":"2023-07-17T04:01:48Z","title":"Can Euclidean Symmetry be Leveraged in Reinforcement Learning and\n Planning?","summary":" In robotic tasks, changes in reference frames typically do not influence the\nunderlying physical properties of the system, which has been known as\ninvariance of physical laws. These changes, which preserve distance, encompass\nisometric transformations such as translations, rotations, and reflections,\ncollectively known as the Euclidean group. In this work, we delve into the\ndesign of improved learning algorithms for reinforcement learning and planning\ntasks that possess Euclidean group symmetry. We put forth a theory that\nunifies prior work on discrete and continuous symmetry in reinforcement learning,\nplanning, and optimal control. On the algorithm side, we further extend 2D path\nplanning with value-based planning to continuous MDPs and propose a pipeline\nfor constructing equivariant sampling-based planning algorithms. Our work is\nsubstantiated with empirical evidence and illustrated through examples that\nexplain the benefits of equivariance to Euclidean symmetry in tackling natural\ncontrol problems.\n","authors":["Linfeng Zhao","Owen Howell","Jung Yeon Park","Xupeng Zhu","Robin Walters","Lawson L. S. Wong"],"pdf_url":"https://arxiv.org/pdf/2307.08226v1.pdf","comment":"Preprint. Website: http://lfzhao.com/SymCtrl"},{"id":"http://arxiv.org/abs/2210.13601v3","updated":"2023-07-17T03:55:03Z","published":"2022-10-24T20:55:21Z","title":"Active Learning for Single Neuron Models with Lipschitz Non-Linearities","summary":" We consider the problem of active learning for single neuron models, also\nsometimes called ``ridge functions'', in the agnostic setting (under\nadversarial label noise). Such models have been shown to be broadly effective\nin modeling physical phenomena, and for constructing surrogate data-driven\nmodels for partial differential equations.\n Surprisingly, we show that for a single neuron model with any Lipschitz\nnon-linearity (such as the ReLU, sigmoid, absolute value, low-degree\npolynomial, among others), strong provable approximation guarantees can be\nobtained using a well-known active learning strategy for fitting \\emph{linear\nfunctions} in the agnostic setting -- i.e. for the case when there is no\nnon-linearity. Namely, we can collect samples via statistical \\emph{leverage\nscore sampling}, which has been shown to be near-optimal in other active\nlearning scenarios. 
We support our theoretical results with empirical\nsimulations showing that our proposed active learning strategy based on\nleverage score sampling outperforms (ordinary) uniform sampling when fitting\nsingle neuron models.\n","authors":["Aarshvi Gajjar","Chinmay Hegde","Christopher Musco"],"pdf_url":"https://arxiv.org/pdf/2210.13601v3.pdf","comment":"Inadvertently submitting an incorrect writeup that does not align\n with the intended content"},{"id":"http://arxiv.org/abs/2307.08220v1","updated":"2023-07-17T03:45:00Z","published":"2023-07-17T03:45:00Z","title":"A Lightweight Framework for High-Quality Code Generation","summary":" In recent years, the use of automated source code generation utilizing\ntransformer-based generative models has expanded, and these models can generate\nfunctional code according to the requirements of the developers. However,\nrecent research revealed that these automatically generated source codes can\ncontain vulnerabilities and other quality issues. Despite researchers' and\npractitioners' attempts to enhance code generation models, retraining and\nfine-tuning large language models is time-consuming and resource-intensive.\nThus, we describe FRANC, a lightweight framework for recommending more secure\nand high-quality source code derived from transformer-based code generation\nmodels. FRANC includes a static filter to make the generated code compilable\nwith heuristics and a quality-aware ranker to sort the code snippets based on a\nquality score. Moreover, the framework uses prompt engineering to fix\npersistent quality issues. We evaluated the framework with five Python and Java\ncode generation models and six prompt datasets, including a newly created one\nin this work (SOEval). The static filter improves 9% to 46% Java suggestions\nand 10% to 43% Python suggestions regarding compilability. The average\nimprovement over the NDCG@10 score for the ranking system is 0.0763, and the\nrepairing techniques repair the highest 80% of prompts. FRANC takes, on\naverage, 1.98 seconds for Java; for Python, it takes 0.08 seconds.\n","authors":["Mohammed Latif Siddiq","Beatrice Casey","Joanna C. S. Santos"],"pdf_url":"https://arxiv.org/pdf/2307.08220v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2303.07679v2","updated":"2023-07-17T03:41:26Z","published":"2023-03-14T07:42:02Z","title":"Feature representations useful for predicting image memorability","summary":" Prediction of image memorability has attracted interest in various fields.\nConsequently, the prediction accuracy of convolutional neural network (CNN)\nmodels has been approaching the empirical upper bound estimated based on human\nconsistency. However, identifying which feature representations embedded in CNN\nmodels are responsible for the high memorability prediction accuracy remains an\nopen question. To tackle this problem, we sought to identify\nmemorability-related feature representations in CNN models using brain\nsimilarity. Specifically, memorability prediction accuracy and brain similarity\nwere examined across 16,860 layers in 64 CNN models pretrained for object\nrecognition. A clear tendency was observed in this comprehensive analysis that\nlayers with high memorability prediction accuracy had higher brain similarity\nwith the inferior temporal (IT) cortex, which is the highest stage in the\nventral visual pathway. 
Furthermore, fine-tuning of the 64 CNN models for\nmemorability prediction revealed that brain similarity with the IT cortex at\nthe penultimate layer positively correlated with the memorability prediction\naccuracy of the models. This analysis also showed that the best fine-tuned\nmodel provided accuracy comparable to state-of-the-art CNN models developed for\nmemorability prediction. Overall, the results of this study indicated that the\nCNN models' great success in predicting memorability relies on feature\nrepresentation acquisition, similar to the IT cortex. This study advances our\nunderstanding of feature representations and their use in predicting image\nmemorability.\n","authors":["Takumi Harada","Hiroyuki Sakai"],"pdf_url":"https://arxiv.org/pdf/2303.07679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04471v2","updated":"2023-07-17T03:28:17Z","published":"2022-12-08T18:46:06Z","title":"Learning Quantum Processes and Hamiltonians via the Pauli Transfer\n Matrix","summary":" Learning about physical systems from quantum-enhanced experiments, relying on\na quantum memory and quantum processing, can outperform learning from\nexperiments in which only classical memory and processing are available.\nWhereas quantum advantages have been established for a variety of state\nlearning tasks, quantum process learning allows for comparable advantages only\nwith a careful problem formulation and is less understood. We establish an\nexponential quantum advantage for learning an unknown $n$-qubit quantum process\n$\\mathcal{N}$. We show that a quantum memory allows to efficiently solve the\nfollowing tasks: (a) learning the Pauli transfer matrix of an arbitrary\n$\\mathcal{N}$, (b) predicting expectation values of bounded Pauli-sparse\nobservables measured on the output of an arbitrary $\\mathcal{N}$ upon input of\na Pauli-sparse state, and (c) predicting expectation values of arbitrary\nbounded observables measured on the output of an unknown $\\mathcal{N}$ with\nsparse Pauli transfer matrix upon input of an arbitrary state. With quantum\nmemory, these tasks can be solved using linearly-in-$n$ many copies of the Choi\nstate of $\\mathcal{N}$, and even time-efficiently in the case of (b). In\ncontrast, any learner without quantum memory requires exponentially-in-$n$ many\nqueries, even when querying $\\mathcal{N}$ on subsystems of adaptively chosen\nstates and performing adaptively chosen measurements. In proving this\nseparation, we extend existing shadow tomography upper and lower bounds from\nstates to channels via the Choi-Jamiolkowski isomorphism. Moreover, we combine\nPauli transfer matrix learning with polynomial interpolation techniques to\ndevelop a procedure for learning arbitrary Hamiltonians, which may have\nnon-local all-to-all interactions, from short-time dynamics. Our results\nhighlight the power of quantum-enhanced experiments for learning highly complex\nquantum dynamics.\n","authors":["Matthias C. Caro"],"pdf_url":"https://arxiv.org/pdf/2212.04471v2.pdf","comment":"30+31 pages, 2+1 figures; V2 includes small corrections to Remark 4.3\n and Lemma 6.1 as well improvements to the presentation"},{"id":"http://arxiv.org/abs/2307.08214v1","updated":"2023-07-17T03:14:32Z","published":"2023-07-17T03:14:32Z","title":"Forward Laplacian: A New Computational Framework for Neural\n Network-based Variational Monte Carlo","summary":" Neural network-based variational Monte Carlo (NN-VMC) has emerged as a\npromising cutting-edge technique of ab initio quantum chemistry. 
However, the\nhigh computational cost of existing approaches hinders their applications in\nrealistic chemistry problems. Here, we report the development of a new NN-VMC\nmethod that achieves a remarkable speed-up by more than one order of magnitude,\nthereby greatly extending the applicability of NN-VMC to larger systems. Our\nkey design is a novel computational framework named Forward Laplacian, which\ncomputes the Laplacian associated with neural networks, the bottleneck of\nNN-VMC, through an efficient forward propagation process. We then demonstrate\nthat Forward Laplacian is not only versatile but also facilitates more\ndevelopments of acceleration methods across various aspects, including\noptimization for sparse derivative matrix and efficient neural network design.\nEmpirically, our approach enables NN-VMC to investigate a broader range of\natoms, molecules and chemical reactions for the first time, providing valuable\nreferences to other ab initio methods. The results demonstrate a great\npotential in applying deep learning methods to solve general quantum mechanical\nproblems.\n","authors":["Ruichen Li","Haotian Ye","Du Jiang","Xuelan Wen","Chuwei Wang","Zhe Li","Xiang Li","Di He","Ji Chen","Weiluo Ren","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2307.08214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08208v1","updated":"2023-07-17T02:58:25Z","published":"2023-07-17T02:58:25Z","title":"Towards Stealthy Backdoor Attacks against Speech Recognition via\n Elements of Sound","summary":" Deep neural networks (DNNs) have been widely and successfully adopted and\ndeployed in various applications of speech recognition. Recently, a few works\nrevealed that these models are vulnerable to backdoor attacks, where the\nadversaries can implant malicious prediction behaviors into victim models by\npoisoning their training process. In this paper, we revisit poison-only\nbackdoor attacks against speech recognition. We reveal that existing methods\nare not stealthy since their trigger patterns are perceptible to humans or\nmachine detection. This limitation is mostly because their trigger patterns are\nsimple noises or separable and distinctive clips. Motivated by these findings,\nwe propose to exploit elements of sound ($e.g.$, pitch and timbre) to design\nmore stealthy yet effective poison-only backdoor attacks. Specifically, we\ninsert a short-duration high-pitched signal as the trigger and increase the\npitch of remaining audio clips to `mask' it for designing stealthy pitch-based\ntriggers. We manipulate timbre features of victim audios to design the stealthy\ntimbre-based attack and design a voiceprint selection module to facilitate the\nmulti-backdoor attack. Our attacks can generate more `natural' poisoned samples\nand therefore are more stealthy. Extensive experiments are conducted on\nbenchmark datasets, which verify the effectiveness of our attacks under\ndifferent settings ($e.g.$, all-to-one, all-to-all, clean-label, physical, and\nmulti-backdoor settings) and their stealthiness. 
The code for reproducing main\nexperiments are available at \\url{https://github.com/HanboCai/BadSpeech_SoE}.\n","authors":["Hanbo Cai","Pengcheng Zhang","Hai Dong","Yan Xiao","Stefanos Koffas","Yiming Li"],"pdf_url":"https://arxiv.org/pdf/2307.08208v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2306.01007v2","updated":"2023-07-17T02:57:27Z","published":"2023-05-31T19:04:16Z","title":"Towards Fair Disentangled Online Learning for Changing Environments","summary":" In the problem of online learning for changing environments, data are\nsequentially received one after another over time, and their distribution\nassumptions may vary frequently. Although existing methods demonstrate the\neffectiveness of their learning algorithms by providing a tight bound on either\ndynamic regret or adaptive regret, most of them completely ignore learning with\nmodel fairness, defined as the statistical parity across different\nsub-population (e.g., race and gender). Another drawback is that when adapting\nto a new environment, an online learner needs to update model parameters with a\nglobal change, which is costly and inefficient. Inspired by the sparse\nmechanism shift hypothesis, we claim that changing environments in online\nlearning can be attributed to partial changes in learned parameters that are\nspecific to environments and the rest remain invariant to changing\nenvironments. To this end, in this paper, we propose a novel algorithm under\nthe assumption that data collected at each time can be disentangled with two\nrepresentations, an environment-invariant semantic factor and an\nenvironment-specific variation factor. The semantic factor is further used for\nfair prediction under a group fairness constraint. To evaluate the sequence of\nmodel parameters generated by the learner, a novel regret is proposed in which\nit takes a mixed form of dynamic and static regret metrics followed by a\nfairness-aware long-term constraint. The detailed analysis provides theoretical\nguarantees for loss regret and violation of cumulative fairness constraints.\nEmpirical evaluations on real-world datasets demonstrate our proposed method\nsequentially outperforms baseline methods in model accuracy and fairness.\n","authors":["Chen Zhao","Feng Mi","Xintao Wu","Kai Jiang","Latifur Khan","Christan Grant","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2306.01007v2.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.08204v1","updated":"2023-07-17T02:38:04Z","published":"2023-07-17T02:38:04Z","title":"A Quantum Convolutional Neural Network Approach for Object Detection and\n Classification","summary":" This paper presents a comprehensive evaluation of the potential of Quantum\nConvolutional Neural Networks (QCNNs) in comparison to classical Convolutional\nNeural Networks (CNNs) and Artificial / Classical Neural Network (ANN) models.\nWith the increasing amount of data, utilizing computing methods like CNN in\nreal-time has become challenging. QCNNs overcome this challenge by utilizing\nqubits to represent data in a quantum environment and applying CNN structures\nto quantum computers. The time and accuracy of QCNNs are compared with\nclassical CNNs and ANN models under different conditions such as batch size and\ninput size. The maximum complexity level that QCNNs can handle in terms of\nthese parameters is also investigated. 
The analysis shows that QCNNs have the\npotential to outperform both classical CNNs and ANN models in terms of accuracy\nand efficiency for certain applications, demonstrating their promise as a\npowerful tool in the field of machine learning.\n","authors":["Gowri Namratha Meedinti","Kandukuri Sai Srirekha","Radhakrishnan Delhibabu"],"pdf_url":"https://arxiv.org/pdf/2307.08204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08192v1","updated":"2023-07-17T01:46:15Z","published":"2023-07-17T01:46:15Z","title":"HOPE: High-order Polynomial Expansion of Black-box Neural Networks","summary":" Despite their remarkable performance, deep neural networks remain mostly\n``black boxes'', suggesting inexplicability and hindering their wide\napplications in fields requiring making rational decisions. Here we introduce\nHOPE (High-order Polynomial Expansion), a method for expanding a network into a\nhigh-order Taylor polynomial on a reference input. Specifically, we derive the\nhigh-order derivative rule for composite functions and extend the rule to\nneural networks to obtain their high-order derivatives quickly and accurately.\nFrom these derivatives, we can then derive the Taylor polynomial of the neural\nnetwork, which provides an explicit expression of the network's local\ninterpretations. Numerical analysis confirms the high accuracy, low\ncomputational complexity, and good convergence of the proposed method.\nMoreover, we demonstrate HOPE's wide applications built on deep learning,\nincluding function discovery, fast inference, and feature selection. The code\nis available at https://github.com/HarryPotterXTX/HOPE.git.\n","authors":["Tingxiong Xiao","Weihang Zhang","Yuxiao Cheng","Jinli Suo"],"pdf_url":"https://arxiv.org/pdf/2307.08192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08189v1","updated":"2023-07-17T01:35:56Z","published":"2023-07-17T01:35:56Z","title":"Mini-Giants: \"Small\" Language Models and Open Source Win-Win","summary":" ChatGPT is phenomenal. However, it is prohibitively expensive to train and\nrefine such giant models. Fortunately, small language models are flourishing\nand becoming more and more competent. We call them \"mini-giants\". We argue that\nopen source community like Kaggle and mini-giants will win-win in many ways,\ntechnically, ethically and socially. In this article, we present a brief yet\nrich background, discuss how to attain small language models, present a\ncomparative study of small language models and a brief discussion of evaluation\nmethods, discuss the application scenarios where small language models are most\nneeded in the real world, and conclude with discussion and outlook.\n","authors":["Zhengping Zhou","Lezhi Li","Xinxi Chen","Andy Li"],"pdf_url":"https://arxiv.org/pdf/2307.08189v1.pdf","comment":"16 pages, 1 figure"},{"id":"http://arxiv.org/abs/2307.08187v1","updated":"2023-07-17T01:27:10Z","published":"2023-07-17T01:27:10Z","title":"An Empirical Investigation of Pre-trained Model Selection for\n Out-of-Distribution Generalization and Calibration","summary":" In the realm of out-of-distribution generalization tasks, finetuning has\nrisen as a key strategy. While the most focus has been on optimizing learning\nalgorithms, our research highlights the influence of pre-trained model\nselection in finetuning on out-of-distribution performance and inference\nuncertainty. 
Balancing model size constraints of a single GPU, we examined the\nimpact of varying pre-trained datasets and model parameters on performance\nmetrics like accuracy and expected calibration error. Our findings underscore\nthe significant influence of pre-trained model selection, showing marked\nperformance improvements over algorithm choice. Larger models outperformed\nothers, though the balance between memorization and true generalization merits\nfurther investigation. Ultimately, our research emphasizes the importance of\npre-trained model selection for enhancing out-of-distribution generalization.\n","authors":["Hiroki Naganuma","Ryuichiro Hataya"],"pdf_url":"https://arxiv.org/pdf/2307.08187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09921v2","updated":"2023-07-17T00:59:31Z","published":"2023-02-20T11:34:16Z","title":"Free-Form Variational Inference for Gaussian Process State-Space Models","summary":" Gaussian process state-space models (GPSSMs) provide a principled and\nflexible approach to modeling the dynamics of a latent state, which is observed\nat discrete-time points via a likelihood model. However, inference in GPSSMs is\ncomputationally and statistically challenging due to the large number of latent\nvariables in the model and the strong temporal dependencies between them. In\nthis paper, we propose a new method for inference in Bayesian GPSSMs, which\novercomes the drawbacks of previous approaches, namely over-simplified\nassumptions, and high computational requirements. Our method is based on\nfree-form variational inference via stochastic gradient Hamiltonian Monte Carlo\nwithin the inducing-variable formalism. Furthermore, by exploiting our proposed\nvariational distribution, we provide a collapsed extension of our method where\nthe inducing variables are marginalized analytically. We also showcase results\nwhen combining our framework with particle MCMC methods. We show that, on six\nreal-world datasets, our approach can learn transition dynamics and latent\nstates more accurately than competing methods.\n","authors":["Xuhui Fan","Edwin V. Bonilla","Terence J. O'Kane","Scott A. Sisson"],"pdf_url":"https://arxiv.org/pdf/2302.09921v2.pdf","comment":"Updating to final version to appear in the proceedings"},{"id":"http://arxiv.org/abs/2307.04954v2","updated":"2023-07-17T00:15:30Z","published":"2023-07-11T00:56:44Z","title":"Hybrid hidden Markov LSTM for short-term traffic flow prediction","summary":" Deep learning (DL) methods have outperformed parametric models such as\nhistorical average, ARIMA and variants in predicting traffic variables into\nshort and near-short future, that are critical for traffic management.\nSpecifically, recurrent neural network (RNN) and its variants (e.g. long\nshort-term memory) are designed to retain long-term temporal correlations and\ntherefore are suitable for modeling sequences. However, multi-regime models\nassume the traffic system to evolve through multiple states (say, free-flow,\ncongestion in traffic) with distinct characteristics, and hence, separate\nmodels are trained to characterize the traffic dynamics within each regime. For\ninstance, Markov-switching models with a hidden Markov model (HMM) for regime\nidentification is capable of capturing complex dynamic patterns and\nnon-stationarity. Interestingly, both HMM and LSTM can be used for modeling an\nobservation sequence from a set of latent or, hidden state variables. 
In LSTM,\nthe latent variable is computed in a deterministic manner from the current\nobservation and the previous latent variable, while, in HMM, the set of latent\nvariables is a Markov chain. Inspired by research in natural language\nprocessing, a hybrid hidden Markov-LSTM model that is capable of learning\ncomplementary features in traffic data is proposed for traffic flow prediction.\nResults indicate significant performance gains in using hybrid architecture\ncompared to conventional methods such as Markov switching ARIMA and LSTM.\n","authors":["Agnimitra Sengupta","Adway Das","S. Ilgin Guler"],"pdf_url":"https://arxiv.org/pdf/2307.04954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05949v2","updated":"2023-07-17T00:09:14Z","published":"2023-07-12T06:31:43Z","title":"Newell's theory based feature transformations for spatio-temporal\n traffic prediction","summary":" Deep learning (DL) models for spatio-temporal traffic flow forecasting employ\nconvolutional or graph-convolutional filters along with recurrent neural\nnetworks to capture spatial and temporal dependencies in traffic data. These\nmodels, such as CNN-LSTM, utilize traffic flows from neighboring detector\nstations to predict flows at a specific location of interest. However, these\nmodels are limited in their ability to capture the broader dynamics of the\ntraffic system, as they primarily learn features specific to the detector\nconfiguration and traffic characteristics at the target location. Hence, the\ntransferability of these models to different locations becomes challenging,\nparticularly when data is unavailable at the new location for model training.\nTo address this limitation, we propose a traffic flow physics-based feature\ntransformation for spatio-temporal DL models. This transformation incorporates\nNewell's uncongested and congested-state estimators of traffic flows at the\ntarget locations, enabling the models to learn broader dynamics of the system.\nOur methodology is empirically validated using traffic data from two different\nlocations. The results demonstrate that the proposed feature transformation\nimproves the models' performance in predicting traffic flows over different\nprediction horizons, as indicated by better goodness-of-fit statistics. An\nimportant advantage of our framework is its ability to be transferred to new\nlocations where data is unavailable. This is achieved by appropriately\naccounting for spatial dependencies based on station distances and various\ntraffic parameters. In contrast, regular DL models are not easily transferable\nas their inputs remain fixed. It should be noted that due to data limitations,\nwe were unable to perform spatial sensitivity analysis, which calls for further\nresearch using simulated data.\n","authors":["Agnimitra Sengupta","S. Ilgin Guler"],"pdf_url":"https://arxiv.org/pdf/2307.05949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08175v1","updated":"2023-07-17T00:07:52Z","published":"2023-07-17T00:07:52Z","title":"Multi-Objective Optimization of Performance and Interpretability of\n Tabular Supervised Machine Learning Models","summary":" We present a model-agnostic framework for jointly optimizing the predictive\nperformance and interpretability of supervised machine learning models for\ntabular data. Interpretability is quantified via three measures: feature\nsparsity, interaction sparsity of features, and sparsity of non-monotone\nfeature effects. 
By treating hyperparameter optimization of a machine learning\nalgorithm as a multi-objective optimization problem, our framework allows for\ngenerating diverse models that trade off high performance and ease of\ninterpretability in a single optimization run. Efficient optimization is\nachieved via augmentation of the search space of the learning algorithm by\nincorporating feature selection, interaction and monotonicity constraints into\nthe hyperparameter search space. We demonstrate that the optimization problem\neffectively translates to finding the Pareto optimal set of groups of selected\nfeatures that are allowed to interact in a model, along with finding their\noptimal monotonicity constraints and optimal hyperparameters of the learning\nalgorithm itself. We then introduce a novel evolutionary algorithm that can\noperate efficiently on this augmented search space. In benchmark experiments,\nwe show that our framework is capable of finding diverse models that are highly\ncompetitive or outperform state-of-the-art XGBoost or Explainable Boosting\nMachine models, both with respect to performance and interpretability.\n","authors":["Lennart Schneider","Bernd Bischl","Janek Thomas"],"pdf_url":"https://arxiv.org/pdf/2307.08175v1.pdf","comment":"Extended version of the paper accepted at GECCO 2023. 16 pages, 7\n tables, 7 figures"},{"id":"http://arxiv.org/abs/2307.08897v1","updated":"2023-07-17T23:50:51Z","published":"2023-07-17T23:50:51Z","title":"Basal-Bolus Advisor for Type 1 Diabetes (T1D) Patients Using Multi-Agent\n Reinforcement Learning (RL) Methodology","summary":" This paper presents a novel multi-agent reinforcement learning (RL) approach\nfor personalized glucose control in individuals with type 1 diabetes (T1D). The\nmethod employs a closed-loop system consisting of a blood glucose (BG)\nmetabolic model and a multi-agent soft actor-critic RL model acting as the\nbasal-bolus advisor. Performance evaluation is conducted in three scenarios,\ncomparing the RL agents to conventional therapy. Evaluation metrics include\nglucose levels (minimum, maximum, and mean), time spent in different BG ranges,\nand average daily bolus and basal insulin dosages. Results demonstrate that the\nRL-based basal-bolus advisor significantly improves glucose control, reducing\nglycemic variability and increasing time spent within the target range (70-180\nmg/dL). Hypoglycemia events are effectively prevented, and severe hyperglycemia\nevents are reduced. The RL approach also leads to a statistically significant\nreduction in average daily basal insulin dosage compared to conventional\ntherapy. These findings highlight the effectiveness of the multi-agent RL\napproach in achieving better glucose control and mitigating the risk of severe\nhyperglycemia in individuals with T1D.\n","authors":["Mehrad Jalolia","Marzia Cescon"],"pdf_url":"https://arxiv.org/pdf/2307.08897v1.pdf","comment":"8 pages, 2 figures, 1 Table"},{"id":"http://arxiv.org/abs/2307.06887v2","updated":"2023-07-17T23:45:22Z","published":"2023-07-13T16:39:08Z","title":"Provable Multi-Task Representation Learning by Two-Layer ReLU Neural\n Networks","summary":" Feature learning, i.e. extracting meaningful representations of data, is\nquintessential to the practical success of neural networks trained with\ngradient descent, yet it is notoriously difficult to explain how and why it\noccurs. 
Recent theoretical studies have shown that shallow neural networks\noptimized on a single task with gradient-based methods can learn meaningful\nfeatures, extending our understanding beyond the neural tangent kernel or\nrandom feature regime in which negligible feature learning occurs. But in\npractice, neural networks are increasingly often trained on {\\em many} tasks\nsimultaneously with differing loss functions, and these prior analyses do not\ngeneralize to such settings. In the multi-task learning setting, a variety of\nstudies have shown effective feature learning by simple linear models. However,\nmulti-task learning via {\\em nonlinear} models, arguably the most common\nlearning paradigm in practice, remains largely mysterious. In this work, we\npresent the first results proving feature learning occurs in a multi-task\nsetting with a nonlinear model. We show that when the tasks are binary\nclassification problems with labels depending on only $r$ directions within the\nambient $d\\gg r$-dimensional input space, executing a simple gradient-based\nmultitask learning algorithm on a two-layer ReLU neural network learns the\nground-truth $r$ directions. In particular, any downstream task on the $r$\nground-truth coordinates can be solved by learning a linear classifier with\nsample and neuron complexity independent of the ambient dimension $d$, while a\nrandom feature model requires exponential complexity in $d$ for such a\nguarantee.\n","authors":["Liam Collins","Hamed Hassani","Mahdi Soltanolkotabi","Aryan Mokhtari","Sanjay Shakkottai"],"pdf_url":"https://arxiv.org/pdf/2307.06887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08893v1","updated":"2023-07-17T23:28:59Z","published":"2023-07-17T23:28:59Z","title":"Evaluating unsupervised disentangled representation learning for genomic\n discovery and disease risk prediction","summary":" High-dimensional clinical data have become invaluable resources for genetic\nstudies, due to their accessibility in biobank-scale datasets and the\ndevelopment of high performance modeling techniques especially using deep\nlearning. Recent work has shown that low dimensional embeddings of these\nclinical data learned by variational autoencoders (VAE) can be used for\ngenome-wide association studies and polygenic risk prediction. In this work, we\nconsider multiple unsupervised learning methods for learning disentangled\nrepresentations, namely autoencoders, VAE, beta-VAE, and FactorVAE, in the\ncontext of genetic association studies. Using spirograms from UK Biobank as a\nrunning example, we observed improvements in the number of genome-wide\nsignificant loci, heritability, and performance of polygenic risk scores for\nasthma and chronic obstructive pulmonary disease by using FactorVAE or\nbeta-VAE, compared to standard VAE or non-variational autoencoders. FactorVAEs\nperformed effectively across multiple values of the regularization\nhyperparameter, while beta-VAEs were much more sensitive to the hyperparameter\nvalues.\n","authors":["Taedong Yun"],"pdf_url":"https://arxiv.org/pdf/2307.08893v1.pdf","comment":"Accepted to the 2023 ICML Workshop on Computational Biology.\n Honolulu, Hawaii, USA, 2023"},{"id":"http://arxiv.org/abs/2307.08890v1","updated":"2023-07-17T23:22:57Z","published":"2023-07-17T23:22:57Z","title":"The Predicted-Deletion Dynamic Model: Taking Advantage of ML\n Predictions, for Free","summary":" The main bottleneck in designing efficient dynamic algorithms is the unknown\nnature of the update sequence. 
In particular, there are some problems, like\n3-vertex connectivity, planar digraph all pairs shortest paths, and others,\nwhere the separation in runtime between the best partially dynamic solutions\nand the best fully dynamic solutions is polynomial, sometimes even exponential.\n In this paper, we formulate the predicted-deletion dynamic model, motivated\nby a recent line of empirical work about predicting edge updates in dynamic\ngraphs. In this model, edges are inserted and deleted online, and when an edge\nis inserted, it is accompanied by a \"prediction\" of its deletion time. This\nmodels real world settings where services may have access to historical data or\nother information about an input and can subsequently use such information to make\npredictions about user behavior. The model is also of theoretical interest, as\nit interpolates between the partially dynamic and fully dynamic settings, and\nprovides a natural extension of the algorithms with predictions paradigm to the\ndynamic setting.\n We give a novel framework for this model that \"lifts\" partially dynamic\nalgorithms into the fully dynamic setting with little overhead. We use our\nframework to obtain improved efficiency bounds over the state-of-the-art\ndynamic algorithms for a variety of problems. In particular, we design\nalgorithms that have amortized update time that scales with a partially dynamic\nalgorithm, with high probability, when the predictions are of high quality. On\nthe flip side, our algorithms do no worse than existing fully-dynamic\nalgorithms when the predictions are of low quality. Furthermore, our algorithms\nexhibit a graceful trade-off between the two cases. Thus, we are able to take\nadvantage of ML predictions asymptotically \"for free.''\n","authors":["Quanquan C. Liu","Vaidehi Srinivas"],"pdf_url":"https://arxiv.org/pdf/2307.08890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16562v2","updated":"2023-07-17T22:41:18Z","published":"2023-05-26T01:06:44Z","title":"Unsupervised Embedding Quality Evaluation","summary":" Unsupervised learning has recently significantly gained in popularity,\nespecially with deep learning-based approaches. Despite numerous successes and\napproaching supervised-level performance on a variety of academic benchmarks,\nit is still hard to train and evaluate SSL models in practice due to the\nunsupervised nature of the problem. Even with networks trained in a supervised\nfashion, it is often unclear whether they will perform well when transferred to\nanother domain.\n Past works are generally limited to assessing the amount of information\ncontained in embeddings, which is most relevant for self-supervised learning of\ndeep neural networks. This work chooses to follow a different approach: can we\nquantify how easy it is to linearly separate the data in a stable way? We\nsurvey the literature and uncover three methods that could be potentially used\nfor evaluating the quality of representations. We also introduce one novel method\nbased on recent advances in understanding the high-dimensional geometric\nstructure of self-supervised learning.\n We conduct extensive experiments and study the properties of these metrics\nand ones introduced in the previous work. 
Our results suggest that while there\nis no free lunch, there are metrics that can robustly estimate embedding\nquality in an unsupervised way.\n","authors":["Anton Tsitsulin","Marina Munkhoeva","Bryan Perozzi"],"pdf_url":"https://arxiv.org/pdf/2305.16562v2.pdf","comment":"As appeared at the 2nd Annual Workshop on Topology, Algebra, and\n Geometry in Machine Learning (TAG-ML) at the 40th International Conference on\n Machine Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2307.08881v1","updated":"2023-07-17T22:35:46Z","published":"2023-07-17T22:35:46Z","title":"Examining the Effects of Degree Distribution and Homophily in Graph\n Learning Models","summary":" Despite a surge in interest in GNN development, homogeneity in benchmarking\ndatasets still presents a fundamental issue to GNN research. GraphWorld is a\nrecent solution which uses the Stochastic Block Model (SBM) to generate diverse\npopulations of synthetic graphs for benchmarking any GNN task. Despite its\nsuccess, the SBM imposed fundamental limitations on the kinds of graph\nstructure GraphWorld could create.\n In this work we examine how two additional synthetic graph generators can\nimprove GraphWorld's evaluation; LFR, a well-established model in the graph\nclustering literature and CABAM, a recent adaptation of the Barabasi-Albert\nmodel tailored for GNN benchmarking. By integrating these generators, we\nsignificantly expand the coverage of graph space within the GraphWorld\nframework while preserving key graph properties observed in real-world\nnetworks. To demonstrate their effectiveness, we generate 300,000 graphs to\nbenchmark 11 GNN models on a node classification task. We find GNN performance\nvariations in response to homophily, degree distribution and feature signal.\nBased on these findings, we classify models by their sensitivity to the new\ngenerators under these properties. Additionally, we release the extensions made\nto GraphWorld on the GitHub repository, offering further evaluation of GNN\nperformance on new graphs.\n","authors":["Mustafa Yasir","John Palowitch","Anton Tsitsulin","Long Tran-Thanh","Bryan Perozzi"],"pdf_url":"https://arxiv.org/pdf/2307.08881v1.pdf","comment":"Accepted to Workshop on Graph Learning Benchmarks at KDD 2023"},{"id":"http://arxiv.org/abs/2305.13399v4","updated":"2023-07-17T22:31:33Z","published":"2023-05-22T18:25:03Z","title":"Efficient Large-Scale Visual Representation Learning And Evaluation","summary":" In this article, we present our approach to single-modality visual\nrepresentation learning. Understanding visual representations of items is vital\nfor fashion recommendations in e-commerce. We detail and contrast techniques\nused to finetune large-scale visual representation learning models in an\nefficient manner under low-resource settings, including several pretrained\nbackbone architectures, both in the convolutional neural network as well as the\nvision transformer family. We describe the challenges for e-commerce\napplications at-scale and highlight the efforts to more efficiently train,\nevaluate, and serve visual representations. We present ablation studies\nevaluating the representation offline performance for several downstream tasks,\nincluding visually similar ad recommendations on mobile devices. To this end,\nwe present a novel multilingual text-to-image generative offline evaluation\nmethod for visually similar recommendation systems. 
Finally, we include online\nresults from deployed machine learning systems in production at Etsy.\n","authors":["Eden Dolev","Alaa Awad","Denisa Roberts","Zahra Ebrahimzadeh","Marcin Mejran","Vaibhav Malpani","Mahir Yavuz"],"pdf_url":"https://arxiv.org/pdf/2305.13399v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04778v2","updated":"2023-07-17T22:28:25Z","published":"2023-07-10T05:43:31Z","title":"Formulating A Strategic Plan Based On Statistical Analyses And\n Applications For Financial Companies Through A Real-World Use Case","summary":" Business statistics play a crucial role in implementing a data-driven\nstrategic plan at the enterprise level to employ various analytics where the\noutcomes of such a plan enable an enterprise to enhance the decision-making\nprocess or to mitigate risks to the organization. In this work, a strategic\nplan informed by the statistical analysis is introduced for a financial company\ncalled LendingClub, where the plan is comprised of exploring the possibility of\nonboarding a big data platform along with advanced feature selection\ncapacities. The main objectives of such a plan are to increase the company's\nrevenue while reducing the risks of granting loans to borrowers who cannot\nreturn their loans. In this study, different hypotheses formulated to address\nthe company's concerns are studied, where the results reveal that the amount of\nloans profoundly impacts the number of borrowers charging off their loans.\nAlso, the proposed strategic plan includes onboarding advanced analytics such\nas machine learning technologies that allow the company to build better\ngeneralized data-driven predictive models.\n","authors":["Saman Sarraf"],"pdf_url":"https://arxiv.org/pdf/2307.04778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08880v1","updated":"2023-07-17T22:28:16Z","published":"2023-07-17T22:28:16Z","title":"Modular Neural Network Approaches for Surgical Image Recognition","summary":" Deep learning-based applications have seen a lot of success in recent years.\nText, audio, image, and video have all been explored with great success using\ndeep learning approaches. The use of convolutional neural networks (CNN) in\ncomputer vision, in particular, has yielded reliable results. In order to\nachieve these results, a large amount of data is required. However, the dataset\nis not always accessible. Moreover, annotating data can be difficult and\ntime-consuming. Self-training is a semi-supervised approach that managed to\nalleviate this problem and achieve state-of-the-art performances. Theoretical\nanalysis even proved that it may result in a better generalization than a\nnormal classifier. Another problem neural networks can face is the increasing\ncomplexity of modern problems, requiring a high computational and storage cost.\nOne way to mitigate this issue is to employ a strategy inspired by human\ncognition, known as modular learning. The principle of the\napproach is to decompose a complex problem into simpler sub-tasks. This\napproach has several advantages, including faster learning, better\ngeneralization, and interpretability.\n In the first part of this paper, we introduce and evaluate different\narchitectures of modular learning for Dorsal Capsulo-Scapholunate Septum (DCSS)\ninstability classification. Our experiments have shown that modular learning\nimproves performances compared to non-modular systems. 
Moreover, we found that\nweighted modular, that is to weight the output using the probabilities from the\ngating module, achieved an almost perfect classification. In the second part,\nwe present our approach for data labeling and segmentation with self-training\napplied on shoulder arthroscopy images.\n","authors":["Nosseiba Ben Salem","Younes Bennani","Joseph Karkazan","Abir Barbara","Charles Dacheux","Thomas Gregory"],"pdf_url":"https://arxiv.org/pdf/2307.08880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16596v5","updated":"2023-07-17T22:24:16Z","published":"2022-11-29T21:15:51Z","title":"Towards Dynamic Causal Discovery with Rare Events: A Nonparametric\n Conditional Independence Test","summary":" Causal phenomena associated with rare events occur across a wide range of\nengineering problems, such as risk-sensitive safety analysis, accident analysis\nand prevention, and extreme value theory. However, current methods for causal\ndiscovery are often unable to uncover causal links, between random variables in\na dynamic setting, that manifest only when the variables first experience\nlow-probability realizations. To address this issue, we introduce a novel\nstatistical independence test on data collected from time-invariant dynamical\nsystems in which rare but consequential events occur. In particular, we exploit\nthe time-invariance of the underlying data to construct a superimposed dataset\nof the system state before rare events happen at different timesteps. We then\ndesign a conditional independence test on the reorganized data. We provide\nnon-asymptotic sample complexity bounds for the consistency of our method, and\nvalidate its performance across various simulated and real-world datasets,\nincluding incident data collected from the Caltrans Performance Measurement\nSystem (PeMS). Code containing the datasets and experiments is publicly\navailable.\n","authors":["Chih-Yuan Chiu","Kshitij Kulkarni","Shankar Sastry"],"pdf_url":"https://arxiv.org/pdf/2211.16596v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08877v1","updated":"2023-07-17T22:19:12Z","published":"2023-07-17T22:19:12Z","title":"Disentangling Node Attributes from Graph Topology for Improved\n Generalizability in Link Prediction","summary":" Link prediction is a crucial task in graph machine learning with diverse\napplications. We explore the interplay between node attributes and graph\ntopology and demonstrate that incorporating pre-trained node attributes\nimproves the generalization power of link prediction models. Our proposed\nmethod, UPNA (Unsupervised Pre-training of Node Attributes), solves the\ninductive link prediction problem by learning a function that takes a pair of\nnode attributes and predicts the probability of an edge, as opposed to Graph\nNeural Networks (GNN), which can be prone to topological shortcuts in graphs\nwith power-law degree distribution. In this manner, UPNA learns a significant\npart of the latent graph generation mechanism since the learned function can be\nused to add incoming nodes to a growing graph. By leveraging pre-trained node\nattributes, we overcome observational bias and make meaningful predictions\nabout unobserved nodes, surpassing state-of-the-art performance (3X to 34X\nimprovement on benchmark datasets). 
UPNA can be applied to various pairwise\nlearning tasks and integrated with existing link prediction models to enhance\ntheir generalizability and bolster graph generative models.\n","authors":["Ayan Chatterjee","Robin Walters","Giulia Menichetti","Tina Eliassi-Rad"],"pdf_url":"https://arxiv.org/pdf/2307.08877v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.08875v1","updated":"2023-07-17T22:10:20Z","published":"2023-07-17T22:10:20Z","title":"Natural Actor-Critic for Robust Reinforcement Learning with Function\n Approximation","summary":" We study robust reinforcement learning (RL) with the goal of determining a\nwell-performing policy that is robust against model mismatch between the\ntraining simulator and the testing environment. Previous policy-based robust RL\nalgorithms mainly focus on the tabular setting under uncertainty sets that\nfacilitate robust policy evaluation, but are no longer tractable when the\nnumber of states scales up. To this end, we propose two novel uncertainty set\nformulations, one based on double sampling and the other on an integral\nprobability metric. Both make large-scale robust RL tractable even when one\nonly has access to a simulator. We propose a robust natural actor-critic (RNAC)\napproach that incorporates the new uncertainty sets and employs function\napproximation. We provide finite-time convergence guarantees for the proposed\nRNAC algorithm to the optimal robust policy within the function approximation\nerror. Finally, we demonstrate the robust performance of the policy learned by\nour proposed RNAC approach in multiple MuJoCo environments and a real-world\nTurtleBot navigation task.\n","authors":["Ruida Zhou","Tao Liu","Min Cheng","Dileep Kalathil","P. R. Kumar","Chao Tian"],"pdf_url":"https://arxiv.org/pdf/2307.08875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08874v1","updated":"2023-07-17T22:09:12Z","published":"2023-07-17T22:09:12Z","title":"Latent Space Representations of Neural Algorithmic Reasoners","summary":" Neural Algorithmic Reasoning (NAR) is a research area focused on designing\nneural architectures that can reliably capture classical computation, usually\nby learning to execute algorithms. A typical approach is to rely on Graph\nNeural Network (GNN) architectures, which encode inputs in high-dimensional\nlatent spaces that are repeatedly transformed during the execution of the\nalgorithm. In this work we perform a detailed analysis of the structure of the\nlatent space induced by the GNN when executing algorithms. We identify two\npossible failure modes: (i) loss of resolution, making it hard to distinguish\nsimilar values; (ii) inability to deal with values outside the range observed\nduring training. We propose to solve the first issue by relying on a softmax\naggregator, and propose to decay the latent space in order to deal with\nout-of-range values. We show that these changes lead to improvements on the\nmajority of algorithms in the standard CLRS-30 benchmark when using the\nstate-of-the-art Triplet-GMPNN processor. Our code is available at\n\\href{https://github.com/mirjanic/nar-latent-spaces}{https://github.com/mirjanic/nar-latent-spaces}.\n","authors":["Vladimir V. 
Mirjanić","Razvan Pascanu","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2307.08874v1.pdf","comment":"18 pages, 17 figures, accepted at KLR Workshop at ICML 2023"},{"id":"http://arxiv.org/abs/2307.08873v1","updated":"2023-07-17T22:08:27Z","published":"2023-07-17T22:08:27Z","title":"An Alternative to Variance: Gini Deviation for Risk-averse Policy\n Gradient","summary":" Restricting the variance of a policy's return is a popular choice in\nrisk-averse Reinforcement Learning (RL) due to its clear mathematical\ndefinition and easy interpretability. Traditional methods directly restrict the\ntotal return variance. Recent methods restrict the per-step reward variance as\na proxy. We thoroughly examine the limitations of these variance-based methods,\nsuch as sensitivity to numerical scale and hindering of policy learning, and\npropose to use an alternative risk measure, Gini deviation, as a substitute. We\nstudy various properties of this new risk measure and derive a policy gradient\nalgorithm to minimize it. Empirical evaluation in domains where risk-aversion\ncan be clearly defined, shows that our algorithm can mitigate the limitations\nof variance-based risk measures and achieves high return with low risk in terms\nof variance and Gini deviation when others fail to learn a reasonable policy.\n","authors":["Yudong Luo","Guiliang Liu","Pascal Poupart","Yangchen Pan"],"pdf_url":"https://arxiv.org/pdf/2307.08873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08863v1","updated":"2023-07-17T21:40:57Z","published":"2023-07-17T21:40:57Z","title":"Meta-Value Learning: a General Framework for Learning with Learning\n Awareness","summary":" Gradient-based learning in multi-agent systems is difficult because the\ngradient derives from a first-order model which does not account for the\ninteraction between agents' learning processes. LOLA (arXiv:1709.04326)\naccounts for this by differentiating through one step of optimization. We\nextend the ideas of LOLA and develop a fully-general value-based approach to\noptimization. At the core is a function we call the meta-value, which at each\npoint in joint-policy space gives for each agent a discounted sum of its\nobjective over future optimization steps. We argue that the gradient of the\nmeta-value gives a more reliable improvement direction than the gradient of the\noriginal objective, because the meta-value derives from empirical observations\nof the effects of optimization. We show how the meta-value can be approximated\nby training a neural network to minimize TD error along optimization\ntrajectories in which agents follow the gradient of the meta-value. We analyze\nthe behavior of our method on the Logistic Game and on the Iterated Prisoner's\nDilemma.\n","authors":["Tim Cooijmans","Milad Aghajohari","Aaron Courville"],"pdf_url":"https://arxiv.org/pdf/2307.08863v1.pdf","comment":"Submitted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2307.08859v1","updated":"2023-07-17T21:33:35Z","published":"2023-07-17T21:33:35Z","title":"Curriculum Learning for Graph Neural Networks: A Multiview\n Competence-based Approach","summary":" A curriculum is a planned sequence of learning materials and an effective one\ncan make learning efficient and effective for both humans and machines. Recent\nstudies developed effective data-driven curriculum learning approaches for\ntraining graph neural networks in language applications. However, existing\ncurriculum learning approaches often employ a single criterion of difficulty in\ntheir training paradigms. 
In this paper, we propose a new perspective on\ncurriculum learning by introducing a novel approach that builds on graph\ncomplexity formalisms (as difficulty criteria) and model competence during\ntraining. The model consists of a scheduling scheme which derives effective\ncurricula by accounting for different views of sample difficulty and model\ncompetence during training. The proposed solution advances existing research in\ncurriculum learning for graph neural networks with the ability to incorporate a\nfine-grained spectrum of graph difficulty criteria in their training paradigms.\nExperimental results on real-world link prediction and node classification\ntasks illustrate the effectiveness of the proposed approach.\n","authors":["Nidhi Vakil","Hadi Amiri"],"pdf_url":"https://arxiv.org/pdf/2307.08859v1.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2307.08857v1","updated":"2023-07-17T21:32:51Z","published":"2023-07-17T21:32:51Z","title":"An Admissible Shift-Consistent Method for Recommender Systems","summary":" In this paper, we propose a new constraint, called shift-consistency, for\nsolving matrix/tensor completion problems in the context of recommender\nsystems. Our method provably guarantees several key mathematical properties:\n(1) satisfies a recently established admissibility criterion for recommender\nsystems; (2) satisfies a definition of fairness that eliminates a specific\nclass of potential opportunities for users to maliciously influence system\nrecommendations; and (3) offers robustness by exploiting provable uniqueness of\nmissing-value imputation. We provide a rigorous mathematical description of the\nmethod, including its generalization from matrix to tensor form to permit\nrepresentation and exploitation of complex structural relationships among sets\nof user and product attributes. We argue that our analysis suggests a\nstructured means for defining latent-space projections that can permit provable\nperformance properties to be established for machine learning methods.\n","authors":["Tung Nguyen","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2307.08857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00046v2","updated":"2023-07-17T21:31:34Z","published":"2023-02-28T19:41:37Z","title":"Edit at your own risk: evaluating the robustness of edited models to\n distribution shifts","summary":" The current trend toward ever-larger models makes standard retraining\nprocedures an ever-more expensive burden. For this reason, there is growing\ninterest in model editing, which enables computationally inexpensive,\ninterpretable, post-hoc model modifications. While many model editing\ntechniques are promising, research on the properties of edited models is\nlargely limited to evaluation of validation accuracy. The robustness of edited\nmodels is an important and yet mostly unexplored topic. In this paper, we\nemploy recently developed techniques from the field of deep learning robustness\nto investigate both how model editing affects the general robustness of a\nmodel, as well as the robustness of the specific behavior targeted by the edit.\nWe find that edits tend to reduce general robustness, but that the degree of\ndegradation depends on the editing algorithm and layers chosen. 
Motivated by\nthese observations we introduce a new model editing algorithm, 1-layer\ninterpolation (1-LI), which uses weight-space interpolation to navigate the\ntrade-off between editing task accuracy and general robustness.\n","authors":["Davis Brown","Charles Godfrey","Cody Nizinski","Jonathan Tu","Henry Kvinge"],"pdf_url":"https://arxiv.org/pdf/2303.00046v2.pdf","comment":"DB and CG contributed equally"},{"id":"http://arxiv.org/abs/2307.08849v1","updated":"2023-07-17T21:21:18Z","published":"2023-07-17T21:21:18Z","title":"Autoregressive Diffusion Model for Graph Generation","summary":" Diffusion-based graph generative models have recently obtained promising\nresults for graph generation. However, existing diffusion-based graph\ngenerative models are mostly one-shot generative models that apply Gaussian\ndiffusion in the dequantized adjacency matrix space. Such a strategy can suffer\nfrom difficulty in model training, slow sampling speed, and incapability of\nincorporating constraints. We propose an \\emph{autoregressive diffusion} model\nfor graph generation. Unlike existing methods, we define a node-absorbing\ndiffusion process that operates directly in the discrete graph space. For\nforward diffusion, we design a \\emph{diffusion ordering network}, which learns\na data-dependent node absorbing ordering from graph topology. For reverse\ngeneration, we design a \\emph{denoising network} that uses the reverse node\nordering to efficiently reconstruct the graph by predicting the node type of\nthe new node and its edges with previously denoised nodes at a time. Based on\nthe permutation invariance of graph, we show that the two networks can be\njointly trained by optimizing a simple lower bound of data likelihood. Our\nexperiments on six diverse generic graph datasets and two molecule datasets\nshow that our model achieves better or comparable generation performance with\nprevious state-of-the-art, and meanwhile enjoys fast generation speed.\n","authors":["Lingkai Kong","Jiaming Cui","Haotian Sun","Yuchen Zhuang","B. Aditya Prakash","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08849v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2307.08847v1","updated":"2023-07-17T21:19:08Z","published":"2023-07-17T21:19:08Z","title":"Privacy-preserving patient clustering for personalized federated\n learning","summary":" Federated Learning (FL) is a machine learning framework that enables multiple\norganizations to train a model without sharing their data with a central\nserver. However, it experiences significant performance degradation if the data\nis non-identically independently distributed (non-IID). This is a problem in\nmedical settings, where variations in the patient population contribute\nsignificantly to distribution differences across hospitals. Personalized FL\naddresses this issue by accounting for site-specific distribution differences.\nClustered FL, a Personalized FL variant, was used to address this problem by\nclustering patients into groups across hospitals and training separate models\non each group. However, privacy concerns remained as a challenge as the\nclustering process requires exchange of patient-level information. This was\npreviously solved by forming clusters using aggregated data, which led to\ninaccurate groups and performance degradation. In this study, we propose\nPrivacy-preserving Community-Based Federated machine Learning (PCBFL), a novel\nClustered FL framework that can cluster patients using patient-level data while\nprotecting privacy. 
PCBFL uses Secure Multiparty Computation, a cryptographic\ntechnique, to securely calculate patient-level similarity scores across\nhospitals. We then evaluate PCBFL by training a federated mortality prediction\nmodel using 20 sites from the eICU dataset. We compare the performance gain\nfrom PCBFL against traditional and existing Clustered FL frameworks. Our\nresults show that PCBFL successfully forms clinically meaningful cohorts of\nlow, medium, and high-risk patients. PCBFL outperforms traditional and existing\nClustered FL frameworks with an average AUC improvement of 4.3% and AUPRC\nimprovement of 7.8%.\n","authors":["Ahmed Elhussein","Gamze Gursoy"],"pdf_url":"https://arxiv.org/pdf/2307.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.12765v2","updated":"2023-07-17T21:18:06Z","published":"2022-10-23T16:15:36Z","title":"Multi-Objective GFlowNets","summary":" We study the problem of generating diverse candidates in the context of\nMulti-Objective Optimization. In many applications of machine learning such as\ndrug discovery and material design, the goal is to generate candidates which\nsimultaneously optimize a set of potentially conflicting objectives. Moreover,\nthese objectives are often imperfect evaluations of some underlying property of\ninterest, making it important to generate diverse candidates to have multiple\noptions for expensive downstream evaluations. We propose Multi-Objective\nGFlowNets (MOGFNs), a novel method for generating diverse Pareto optimal\nsolutions, based on GFlowNets. We introduce two variants of MOGFNs: MOGFN-PC,\nwhich models a family of independent sub-problems defined by a scalarization\nfunction, with reward-conditional GFlowNets, and MOGFN-AL, which solves a\nsequence of sub-problems defined by an acquisition function in an active\nlearning loop. Our experiments on wide variety of synthetic and benchmark tasks\ndemonstrate advantages of the proposed methods in terms of the Pareto\nperformance and importantly, improved candidate diversity, which is the main\ncontribution of this work.\n","authors":["Moksh Jain","Sharath Chandra Raparthy","Alex Hernandez-Garcia","Jarrid Rector-Brooks","Yoshua Bengio","Santiago Miret","Emmanuel Bengio"],"pdf_url":"https://arxiv.org/pdf/2210.12765v2.pdf","comment":"23 pages, 8 figures. ICML 2023. Code at:\n https://github.com/GFNOrg/multi-objective-gfn"}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.08339v1","updated":"2023-07-17T09:26:13Z","published":"2023-07-17T09:26:13Z","title":"Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection","summary":" Accurate and robust object detection is critical for autonomous driving.\nImage-based detectors face difficulties caused by low visibility in adverse\nweather conditions. Thus, radar-camera fusion is of particular interest but\npresents challenges in optimally fusing heterogeneous data sources. To approach\nthis issue, we propose two new radar preprocessing techniques to better align\nradar and camera data. In addition, we introduce a Multi-Task Cross-Modality\nAttention-Fusion Network (MCAF-Net) for object detection, which includes two\nnew fusion blocks. These allow for exploiting information from the feature maps\nmore comprehensively. The proposed algorithm jointly detects objects and\nsegments free space, which guides the model to focus on the more relevant part\nof the scene, namely, the occupied space. 
Our approach outperforms current\nstate-of-the-art radar-camera fusion-based object detectors in the nuScenes\ndataset and achieves more robust results in adverse weather conditions and\nnighttime scenarios.\n","authors":["Huawei Sun","Hao Feng","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2307.08339v1.pdf","comment":"Accepted by ITSC 2023"},{"id":"http://arxiv.org/abs/2211.16198v3","updated":"2023-07-17T09:24:49Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v3.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2307.08337v1","updated":"2023-07-17T09:16:29Z","published":"2023-07-17T09:16:29Z","title":"Power-Efficient Video Streaming on Mobile Devices Using Optimal Spatial\n Scaling","summary":" This paper derives optimal spatial scaling and rate control parameters for\npower-efficient wireless video streaming on portable devices. A video streaming\napplication is studied, which receives a high-resolution and high-quality video\nstream from a remote server and displays the content to the end-user.We show\nthat the resolution of the input video can be adjusted such that the\nquality-power trade-off is optimized. Making use of a power model from the\nliterature and subjective quality evaluation using a perceptual metric, we\nderive optimal combinations of the scaling factor and the rate-control\nparameter for encoding. For HD sequences, up to 10% of power can be saved at\nnegligible quality losses and up to 15% of power can be saved at tolerable\ndistortions. To show general validity, the method was tested for Wi-Fi and a\nmobile network as well as for two different smartphones.\n","authors":["Christian Herglotz","André Kaup","Stéphane Coulombe","Ahmad Vakili"],"pdf_url":"https://arxiv.org/pdf/2307.08337v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2212.08071v2","updated":"2023-07-17T05:44:35Z","published":"2022-12-15T18:59:59Z","title":"MAViL: Masked Audio-Video Learners","summary":" We present Masked Audio-Video Learners (MAViL) to train audio-visual\nrepresentations. 
Our approach learns with three complementary forms of\nself-supervision: (1) reconstruction of masked audio and video input data, (2)\nintra- and inter-modal contrastive learning with masking, and (3) self-training\nby reconstructing joint audio-video contextualized features learned from the\nfirst two objectives. Pre-training with MAViL not only enables the model to\nperform well in audio-visual classification and retrieval tasks but also\nimproves representations of each modality in isolation, without using\ninformation from the other modality for fine-tuning or inference. Empirically,\nMAViL sets a new state-of-the-art on AudioSet (53.1 mAP) and VGGSound (67.1%\naccuracy). For the first time, a self-supervised audio-visual model outperforms\nones that use external supervision on these benchmarks.\n","authors":["Po-Yao Huang","Vasu Sharma","Hu Xu","Chaitanya Ryali","Haoqi Fan","Yanghao Li","Shang-Wen Li","Gargi Ghosh","Jitendra Malik","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2212.08071v2.pdf","comment":"Technical report"}]},"2023-07-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.09476v1","updated":"2023-07-18T17:56:50Z","published":"2023-07-18T17:56:50Z","title":"Overthinking the Truth: Understanding how Language Models Process False\n Demonstrations","summary":" Modern language models can imitate complex patterns through few-shot\nlearning, enabling them to complete challenging tasks without fine-tuning.\nHowever, imitation can also lead models to reproduce inaccuracies or harmful\ncontent if present in the context. We study harmful imitation through the lens\nof a model's internal representations, and identify two related phenomena:\noverthinking and false induction heads. The first phenomenon, overthinking,\nappears when we decode predictions from intermediate layers, given correct vs.\nincorrect few-shot demonstrations. At early layers, both demonstrations induce\nsimilar model behavior, but the behavior diverges sharply at some \"critical\nlayer\", after which the accuracy given incorrect demonstrations progressively\ndecreases. The second phenomenon, false induction heads, are a possible\nmechanistic cause of overthinking: these are heads in late layers that attend\nto and copy false information from previous demonstrations, and whose ablation\nreduces overthinking. Beyond scientific understanding, our results suggest that\nstudying intermediate model computations could be a promising avenue for\nunderstanding and guarding against harmful model behaviors.\n","authors":["Danny Halawi","Jean-Stanislas Denain","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2307.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09474v1","updated":"2023-07-18T17:56:06Z","published":"2023-07-18T17:56:06Z","title":"ChatSpot: Bootstrapping Multimodal LLMs via Precise Referring\n Instruction Tuning","summary":" Human-AI interactivity is a critical aspect that reflects the usability of\nmultimodal large language models (MLLMs). However, existing end-to-end MLLMs\nonly allow users to interact with them through language instructions, leading\nto the limitation of the interactive accuracy and efficiency. In this study, we\npresent precise referring instructions that utilize diverse reference\nrepresentations such as points and boxes as referring prompts to refer to the\nspecial region. This enables MLLMs to focus on the region of interest and\nachieve finer-grained interaction. 
Based on precise referring instruction, we\npropose ChatSpot, a unified end-to-end multimodal large language model that\nsupports diverse forms of interactivity including mouse clicks, drag-and-drop,\nand drawing boxes, which provides a more flexible and seamless interactive\nexperience. We also construct a multi-grained vision-language\ninstruction-following dataset based on existing datasets and GPT-4 generating.\nFurthermore, we design a series of evaluation tasks to assess the effectiveness\nof region recognition and interaction. Experimental results showcase ChatSpot's\npromising performance.\n","authors":["Liang Zhao","En Yu","Zheng Ge","Jinrong Yang","Haoran Wei","Hongyu Zhou","Jianjian Sun","Yuang Peng","Runpei Dong","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09474v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2306.15656v3","updated":"2023-07-18T17:52:28Z","published":"2023-06-27T17:50:26Z","title":"SparseOptimizer: Sparsify Language Models through Moreau-Yosida\n Regularization and Accelerate via Compiler Co-design","summary":" This paper introduces SparseOptimizer, a novel deep learning optimizer that\nexploits Moreau-Yosida regularization to naturally induce sparsity in large\nlanguage models such as BERT, ALBERT and GPT. Key to the design of\nSparseOptimizer is an embedded shrinkage operator, which imparts sparsity\ndirectly within the optimization process. This operator, backed by a sound\ntheoretical framework, includes an analytical solution, thereby reinforcing the\noptimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play\nfunctionality eradicates the need for code modifications, making it a\nuniversally adaptable tool for a wide array of large language models. Empirical\nevaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2\nconfirm that SparseBERT and SparseALBERT, when sparsified using\nSparseOptimizer, achieve performance comparable to their dense counterparts,\nBERT and ALBERT, while significantly reducing their parameter count. Further,\nthis work proposes an innovative optimizer-compiler co-design strategy,\ndemonstrating the potential of inference acceleration (\\textbf{3.37x},\n\\textbf{6.30x}, and \\textbf{7.15x} in comparison with Pytorch, TensorFlow, and\nLLVM generic compile, respectively) in SparseBERT when paired with an\nappropriately designed compiler. This study represents a significant step\nforward in the evolution of efficient, scalable, and high-performing large\nlanguage models, setting a precedent for future exploration and optimization in\nthis domain. The SparseOptimizer code and SparseALBERT model will be publicly\navailable upon paper acceptance.\n","authors":["Fu-Ming Guo"],"pdf_url":"https://arxiv.org/pdf/2306.15656v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09456v1","updated":"2023-07-18T17:35:45Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SR-GAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SR\nGAN (Super Resolution Generative Adversarial Network) models, ESRGAN,\nReal-ESRGAN and EDSR, on a benchmark dataset of real-world images which undergo\ndegradation using a pipeline. Our results show that some models seem to\nsignificantly increase the resolution of the input images while preserving\ntheir visual quality, this is assessed using Tesseract OCR engine. 
We observe\nthat the EDSR-BASE model from huggingface outperforms the remaining candidate\nmodels in terms of both quantitative metrics and subjective visual quality\nassessments with the least compute overhead. Specifically, EDSR generates images\nwith higher peak signal-to-noise ratio (PSNR) and structural similarity index\n(SSIM) values and is seen to return high quality OCR results with the Tesseract\nOCR engine. These findings suggest that EDSR is a robust and effective approach\nfor single-image super-resolution and may be particularly well-suited for\napplications where high-quality visual fidelity and optimized\ncompute are critical.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Noris"],"pdf_url":"https://arxiv.org/pdf/2307.09456v1.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2307.09455v1","updated":"2023-07-18T17:29:23Z","published":"2023-07-18T17:29:23Z","title":"Pseudo Outlier Exposure for Out-of-Distribution Detection using\n Pretrained Transformers","summary":" For real-world language applications, detecting an out-of-distribution (OOD)\nsample is helpful to alert users or reject such unreliable samples. However,\nmodern over-parameterized language models often produce overconfident\npredictions for both in-distribution (ID) and OOD samples. In particular,\nlanguage models suffer from OOD samples with a similar semantic representation\nto ID samples since these OOD samples lie near the ID manifold. A rejection\nnetwork can be trained with ID and diverse outlier samples to detect test OOD\nsamples, but explicitly collecting auxiliary OOD datasets brings an additional\nburden for data collection. In this paper, we propose a simple but effective\nmethod called Pseudo Outlier Exposure (POE) that constructs a surrogate OOD\ndataset by sequentially masking tokens related to ID classes. The surrogate OOD\nsample introduced by POE shows a similar representation to ID data, which is\nmost effective in training a rejection network. Our method does not require any\nexternal OOD data and can be easily implemented within off-the-shelf\nTransformers. A comprehensive comparison with state-of-the-art algorithms\ndemonstrates POE's competitiveness on several text classification benchmarks.\n","authors":["Jaeyoung Kim","Kyuheon Jung","Dongbin Na","Sion Jang","Eunbin Park","Sungchul Choi"],"pdf_url":"https://arxiv.org/pdf/2307.09455v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.13816v3","updated":"2023-07-18T16:49:52Z","published":"2023-01-31T18:02:26Z","title":"Execution-based Code Generation using Deep Reinforcement Learning","summary":" The utilization of programming language (PL) models, pre-trained on\nlarge-scale code corpora, as a means of automating software engineering\nprocesses has demonstrated considerable potential in streamlining various code\ngeneration tasks such as code completion, code translation, and program\nsynthesis. However, current approaches mainly rely on supervised fine-tuning\nobjectives borrowed from text generation, neglecting unique sequence-level\ncharacteristics of code, including but not limited to compilability as well as\nsyntactic and functional correctness. To address this limitation, we propose\nPPOCoder, a new framework for code generation that synergistically combines\npre-trained PL models with Proximal Policy Optimization (PPO), which is a widely\nused deep reinforcement learning technique. 
By utilizing non-differentiable\nfeedback from code execution and structure alignment, PPOCoder seamlessly\nintegrates external code-specific knowledge into the model optimization\nprocess. It's important to note that PPOCoder is a task-agnostic and\nmodel-agnostic framework that can be used across different code generation\ntasks and PLs. Extensive experiments on three code generation tasks demonstrate\nthe effectiveness of our proposed approach compared to SOTA methods, achieving\nsignificant improvements in compilation success rates and functional\ncorrectness across different PLs.\n","authors":["Parshin Shojaee","Aneesh Jain","Sindhu Tipirneni","Chandan K. Reddy"],"pdf_url":"https://arxiv.org/pdf/2301.13816v3.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR), 2023"},{"id":"http://arxiv.org/abs/2104.06474v2","updated":"2023-07-18T16:40:41Z","published":"2021-04-13T19:34:17Z","title":"On the Interpretability and Significance of Bias Metrics in Texts: a\n PMI-based Approach","summary":" In recent years, word embeddings have been widely used to measure biases in\ntexts. Even if they have proven to be effective in detecting a wide variety of\nbiases, metrics based on word embeddings lack transparency and\ninterpretability. We analyze an alternative PMI-based metric to quantify biases\nin texts. It can be expressed as a function of conditional probabilities, which\nprovides a simple interpretation in terms of word co-occurrences. We also prove\nthat it can be approximated by an odds ratio, which allows estimating\nconfidence intervals and statistical significance of textual biases. This\napproach produces similar results to metrics based on word embeddings when\ncapturing gender gaps of the real world embedded in large corpora.\n","authors":["Francisco Valentini","Germán Rosati","Damián Blasi","Diego Fernandez Slezak","Edgar Altszyler"],"pdf_url":"https://arxiv.org/pdf/2104.06474v2.pdf","comment":"Camera Ready for ACL 2023 (main conference)"},{"id":"http://arxiv.org/abs/2307.09416v1","updated":"2023-07-18T16:33:30Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. 
Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09384v1","updated":"2023-07-18T16:05:25Z","published":"2023-07-18T16:05:25Z","title":"Zero-shot Query Reformulation for Conversational Search","summary":" As the popularity of voice assistants continues to surge, conversational\nsearch has gained increased attention in Information Retrieval. However, data\nsparsity issues in conversational search significantly hinder the progress of\nsupervised conversational search methods. Consequently, researchers are\nfocusing more on zero-shot conversational search approaches. Nevertheless,\nexisting zero-shot methods face three primary limitations: they are not\nuniversally applicable to all retrievers, their effectiveness lacks sufficient\nexplainability, and they struggle to resolve common conversational ambiguities\ncaused by omission. To address these limitations, we introduce a novel\nZero-shot Query Reformulation (ZeQR) framework that reformulates queries based\non previous dialogue contexts without requiring supervision from conversational\nsearch data. Specifically, our framework utilizes language models designed for\nmachine reading comprehension tasks to explicitly resolve two common\nambiguities: coreference and omission, in raw queries. In comparison to\nexisting zero-shot methods, our approach is universally applicable to any\nretriever without additional adaptation or indexing. It also provides greater\nexplainability and effectively enhances query intent understanding because\nambiguities are explicitly and proactively resolved. Through extensive\nexperiments on four TREC conversational datasets, we demonstrate the\neffectiveness of our method, which consistently outperforms state-of-the-art\nbaselines.\n","authors":["Dayu Yang","Yue Zhang","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09384v1.pdf","comment":"Accepted by ICTIR 2023"},{"id":"http://arxiv.org/abs/2305.12421v2","updated":"2023-07-18T15:41:31Z","published":"2023-05-21T10:40:55Z","title":"Evaluating Open-QA Evaluation","summary":" This study focuses on the evaluation of the Open Question Answering (Open-QA)\ntask, which can directly estimate the factuality of large language models\n(LLMs). Current automatic evaluation methods have shown limitations, indicating\nthat human evaluation still remains the most reliable approach. We introduce a\nnew task, Evaluating QA Evaluation (QA-Eval) and the corresponding dataset\nEVOUNA, designed to assess the accuracy of AI-generated answers in relation to\nstandard answers within Open-QA. Our evaluation of these methods utilizes\nhuman-annotated results to measure their performance. Specifically, the work\ninvestigates methods that show high correlation with human evaluations, deeming\nthem more reliable. We also discuss the pitfalls of current methods and methods\nto improve LLM-based evaluators. 
We believe this new QA-Eval task and\ncorresponding dataset EVOUNA will facilitate the development of more effective\nautomatic evaluation tools and prove valuable for future research in this area.\nAll resources are available at \\url{https://github.com/wangcunxiang/QA-Eval}\nand it is under the Apache-2.0 License.\n","authors":["Cunxiang Wang","Sirui Cheng","Qipeng Guo","Zhikun Xu","Bowen Ding","Yidong Wang","Xiangkun Hu","Zheng Zhang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.12421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09312v1","updated":"2023-07-18T14:57:12Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks. In contrast to traditional text-only methods, our approach to\nlabelling a comment as hate speech centers around the holistic analysis of text\nand images. This is done by leveraging graph transformers to capture the\ncontextual relationships in the entire discussion that surrounds a comment,\nwith interwoven fusion layers to combine text and image embeddings instead of\nprocessing different modalities separately. We compare the performance of our\nmodel to baselines that only process text; we also conduct extensive ablation\nstudies. We conclude with future work for multimodal solutions to deliver\nsocial value in online contexts, arguing that capturing a holistic view of a\nconversation greatly advances the effort to detect anti-social behavior.\n","authors":["Liam Hebert","Gaurav Sahu","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v1.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2307.09288v1","updated":"2023-07-18T14:31:57Z","published":"2023-07-18T14:31:57Z","title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","summary":" In this work, we develop and release Llama 2, a collection of pretrained and\nfine-tuned large language models (LLMs) ranging in scale from 7 billion to 70\nbillion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for\ndialogue use cases. Our models outperform open-source chat models on most\nbenchmarks we tested, and based on our human evaluations for helpfulness and\nsafety, may be a suitable substitute for closed-source models. 
We provide a\ndetailed description of our approach to fine-tuning and safety improvements of\nLlama 2-Chat in order to enable the community to build on our work and\ncontribute to the responsible development of LLMs.\n","authors":["Hugo Touvron","Louis Martin","Kevin Stone","Peter Albert","Amjad Almahairi","Yasmine Babaei","Nikolay Bashlykov","Soumya Batra","Prajjwal Bhargava","Shruti Bhosale","Dan Bikel","Lukas Blecher","Cristian Canton Ferrer","Moya Chen","Guillem Cucurull","David Esiobu","Jude Fernandes","Jeremy Fu","Wenyin Fu","Brian Fuller","Cynthia Gao","Vedanuj Goswami","Naman Goyal","Anthony Hartshorn","Saghar Hosseini","Rui Hou","Hakan Inan","Marcin Kardas","Viktor Kerkez","Madian Khabsa","Isabel Kloumann","Artem Korenev","Punit Singh Koura","Marie-Anne Lachaux","Thibaut Lavril","Jenya Lee","Diana Liskovich","Yinghai Lu","Yuning Mao","Xavier Martinet","Todor Mihaylov","Pushkar Mishra","Igor Molybog","Yixin Nie","Andrew Poulton","Jeremy Reizenstein","Rashi Rungta","Kalyan Saladi","Alan Schelten","Ruan Silva","Eric Michael Smith","Ranjan Subramanian","Xiaoqing Ellen Tan","Binh Tang","Ross Taylor","Adina Williams","Jian Xiang Kuan","Puxin Xu","Zheng Yan","Iliyan Zarov","Yuchen Zhang","Angela Fan","Melanie Kambadur","Sharan Narang","Aurelien Rodriguez","Robert Stojnic","Sergey Edunov","Thomas Scialom"],"pdf_url":"https://arxiv.org/pdf/2307.09288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14348v3","updated":"2023-07-18T14:20:44Z","published":"2022-10-25T21:21:17Z","title":"Synthetic Text Generation with Differential Privacy: A Simple and\n Practical Recipe","summary":" Privacy concerns have attracted increasing attention in data-driven products\ndue to the tendency of machine learning models to memorize sensitive training\ndata. Generating synthetic versions of such data with a formal privacy\nguarantee, such as differential privacy (DP), provides a promising path to\nmitigating these privacy concerns, but previous approaches in this direction\nhave typically failed to produce synthetic data of high quality. In this work,\nwe show that a simple and practical recipe in the text domain is effective:\nsimply fine-tuning a pretrained generative language model with DP enables the\nmodel to generate useful synthetic text with strong privacy protection. Through\nextensive empirical analyses on both benchmark and private customer data, we\ndemonstrate that our method produces synthetic text that is competitive in\nterms of utility with its non-private counterpart, meanwhile providing strong\nprotection against potential privacy leakages.\n","authors":["Xiang Yue","Huseyin A. Inan","Xuechen Li","Girish Kumar","Julia McAnallen","Hoda Shajari","Huan Sun","David Levitan","Robert Sim"],"pdf_url":"https://arxiv.org/pdf/2210.14348v3.pdf","comment":"ACL 2023 Main Conference (Honorable Mention)"},{"id":"http://arxiv.org/abs/2307.09274v1","updated":"2023-07-18T14:11:58Z","published":"2023-07-18T14:11:58Z","title":"Improving Text Semantic Similarity Modeling through a 3D Siamese Network","summary":" Siamese networks have gained popularity as a method for modeling text\nsemantic similarity. Traditional methods rely on pooling operation to compress\nthe semantic representations from Transformer blocks in encoding, resulting in\ntwo-dimensional semantic vectors and the loss of hierarchical semantic\ninformation from Transformer blocks. 
Moreover, this limited structure of\nsemantic vectors is akin to a flattened landscape, which restricts the methods\nthat can be applied in downstream modeling, as they can only navigate this flat\nterrain. To address this issue, we propose a novel 3D Siamese network for text\nsemantic similarity modeling, which maps semantic information to a\nhigher-dimensional space. The three-dimensional semantic tensors not only\nretain more precise spatial and feature domain information but also provide\nthe necessary structural condition for comprehensive downstream modeling\nstrategies to capture them. Leveraging this structural advantage, we introduce\nseveral modules to reinforce this 3D framework, focusing on three aspects:\nfeature extraction, attention, and feature fusion. Our extensive experiments on\nfour text semantic similarity benchmarks demonstrate the effectiveness and\nefficiency of our 3D Siamese Network.\n","authors":["Jianxiang Zang","Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09270v1","updated":"2023-07-18T13:56:43Z","published":"2023-07-18T13:56:43Z","title":"Linearized Relative Positional Encoding","summary":" Relative positional encoding is widely used in vanilla and linear\ntransformers to represent positional information. However, existing encoding\nmethods of a vanilla transformer are not always directly applicable to a linear\ntransformer, because the latter requires a decomposition of the query and key\nrepresentations into separate kernel functions. Nevertheless, principles for\ndesigning encoding methods suitable for linear transformers remain\nunderstudied. In this work, we put together a variety of existing linear\nrelative positional encoding approaches under a canonical form and further\npropose a family of linear relative positional encoding algorithms via unitary\ntransformation. Our formulation leads to a principled framework that can be\nused to develop new relative positional encoding methods that preserve linear\nspace-time complexity. Equipped with different models, the proposed linearized\nrelative positional encoding (LRPE) family derives effective encoding for\nvarious applications. Experiments show that compared with existing methods,\nLRPE achieves state-of-the-art performance in language modeling, text\nclassification, and image classification. Meanwhile, it emphasizes a general\nparadigm for designing broadly more relative positional encoding methods that\nare applicable to linear transformers. The code is available at\nhttps://github.com/OpenNLPLab/Lrpe.\n","authors":["Zhen Qin","Weixuan Sun","Kaiyue Lu","Hui Deng","Dongxu Li","Xiaodong Han","Yuchao Dai","Lingpeng Kong","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2307.09270v1.pdf","comment":"Reviewed by TMLR, decision pending. Yiran Zhong is the corresponding\n author. Code is available at https://github.com/OpenNLPLab/Lrpe"},{"id":"http://arxiv.org/abs/2307.09255v1","updated":"2023-07-18T13:38:39Z","published":"2023-07-18T13:38:39Z","title":"Text vectorization via transformer-based language models and n-gram\n perplexities","summary":" As the probability (and thus perplexity) of a text is calculated based on the\nproduct of the probabilities of individual tokens, it may happen that one\nunlikely token significantly reduces the probability (i.e., increases the\nperplexity) of some otherwise highly probable input, while potentially\nrepresenting a simple typographical error. 
Also, given that perplexity is a\nscalar value that refers to the entire input, information about the probability\ndistribution within it is lost in the calculation (a relatively good text that\nhas one unlikely token and another text in which each token is equally likely\ncan have the same perplexity value), especially for longer texts. As an\nalternative to scalar perplexity, this research proposes a simple algorithm used\nto calculate vector values based on n-gram perplexities within the input. Such\nrepresentations consider the previously mentioned aspects, and instead of a\nunique value, the relative perplexity of each text token is calculated, and\nthese values are combined into a single vector representing the input.\n","authors":["Mihailo Škorić"],"pdf_url":"https://arxiv.org/pdf/2307.09255v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.09254v1","updated":"2023-07-18T13:36:24Z","published":"2023-07-18T13:36:24Z","title":"PAC Neural Prediction Set Learning to Quantify the Uncertainty of\n Generative Language Models","summary":" Uncertainty learning and quantification of models are crucial tasks to\nenhance the trustworthiness of the models. Importantly, the recent surge of\ngenerative language models (GLMs) emphasizes the need for reliable uncertainty\nquantification due to the concerns on generating hallucinated facts. In this\npaper, we propose to learn neural prediction set models that come with the\nprobably approximately correct (PAC) guarantee for quantifying the uncertainty\nof GLMs. Unlike existing prediction set models, which are parameterized by a\nscalar value, we propose to parameterize prediction sets via neural networks,\nwhich achieves more precise uncertainty quantification but still satisfies the\nPAC guarantee. We demonstrate the efficacy of our method on four types of\nlanguage datasets and six types of models by showing that our method improves\nthe quantified uncertainty by $63\\%$ on average, compared to a standard\nbaseline method.\n","authors":["Sangdon Park","Taesoo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15788v2","updated":"2023-07-18T13:31:56Z","published":"2023-06-27T20:37:54Z","title":"Evaluating GPT-3.5 and GPT-4 on Grammatical Error Correction for\n Brazilian Portuguese","summary":" We investigate the effectiveness of GPT-3.5 and GPT-4, two large language\nmodels, as Grammatical Error Correction (GEC) tools for Brazilian Portuguese\nand compare their performance against Microsoft Word and Google Docs. We\nintroduce a GEC dataset for Brazilian Portuguese with four categories: Grammar,\nSpelling, Internet, and Fast typing. Our results show that while GPT-4 has\nhigher recall than other methods, LLMs tend to have lower precision, leading to\novercorrection. This study demonstrates the potential of LLMs as practical GEC\ntools for Brazilian Portuguese and encourages further exploration of LLMs for\nnon-English languages and other educational settings.\n","authors":["Maria Carolina Penteado","Fábio Perez"],"pdf_url":"https://arxiv.org/pdf/2306.15788v2.pdf","comment":"Download the full source to access the dataset. 
Accepted to LatinX in\n AI (LXAI) Research at ICML 2023"},{"id":"http://arxiv.org/abs/2307.09249v1","updated":"2023-07-18T13:28:31Z","published":"2023-07-18T13:28:31Z","title":"UniTabE: Pretraining a Unified Tabular Encoder for Heterogeneous Tabular\n Data","summary":" Recent advancements in Natural Language Processing (NLP) have witnessed the\ngroundbreaking impact of pretrained models, yielding impressive outcomes across\nvarious tasks. This study seeks to extend the power of pretraining\nmethodologies to tabular data, a domain traditionally overlooked, yet\ninherently challenging due to the plethora of table schemas intrinsic to\ndifferent tasks. The primary research questions underpinning this work revolve\naround the adaptation to heterogeneous table structures, the establishment of a\nuniversal pretraining protocol for tabular data, the generalizability and\ntransferability of learned knowledge across tasks, the adaptation to diverse\ndownstream applications, and the incorporation of incremental columns over\ntime. In response to these challenges, we introduce UniTabE, a pioneering\nmethod designed to process tables in a uniform manner, devoid of constraints\nimposed by specific table structures. UniTabE's core concept relies on\nrepresenting each basic table element with a module, termed TabUnit. This is\nsubsequently followed by a Transformer encoder to refine the representation.\nMoreover, our model is designed to facilitate pretraining and finetuning\nthrough the utilization of free-form prompts. In order to implement the\npretraining phase, we curated an expansive tabular dataset comprising\napproximately 13 billion samples, meticulously gathered from the Kaggle\nplatform. Rigorous experimental testing and analyses were performed under a\nmyriad of scenarios to validate the effectiveness of our methodology. The\nexperimental results demonstrate UniTabE's superior performance against several\nbaseline models across a multitude of benchmark datasets. This, therefore,\nunderscores UniTabE's potential to significantly enhance the semantic\nrepresentation of tabular data, thereby marking a significant stride in the\nfield of tabular data analysis.\n","authors":["Yazheng Yang","Yuqi Wang","Guang Liu","Ledell Wu","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09249v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.09209v1","updated":"2023-07-18T12:45:54Z","published":"2023-07-18T12:45:54Z","title":"Automated Ableism: An Exploration of Explicit Disability Biases in\n Sentiment and Toxicity Analysis Models","summary":" We analyze sentiment analysis and toxicity detection models to detect the\npresence of explicit bias against people with disability (PWD). We employ the\nbias identification framework of Perturbation Sensitivity Analysis to examine\nconversations related to PWD on social media platforms, specifically Twitter\nand Reddit, in order to gain insight into how disability bias is disseminated\nin real-world social settings. We then create the \\textit{Bias Identification\nTest in Sentiment} (BITS) corpus to quantify explicit disability bias in any\nsentiment analysis and toxicity detection models. Our study utilizes BITS to\nuncover significant biases in four open AIaaS (AI as a Service) sentiment\nanalysis tools, namely TextBlob, VADER, Google Cloud Natural Language API,\nDistilBERT and two toxicity detection models, namely two versions of\nToxic-BERT. 
Our findings indicate that all of these models exhibit\nstatistically significant explicit bias against PWD.\n","authors":["Pranav Narayanan Venkit","Mukund Srinath","Shomir Wilson"],"pdf_url":"https://arxiv.org/pdf/2307.09209v1.pdf","comment":"TrustNLP at ACL 2023"},{"id":"http://arxiv.org/abs/2301.13066v2","updated":"2023-07-18T11:39:22Z","published":"2023-01-30T17:10:34Z","title":"A Human Word Association based model for topic detection in social\n networks","summary":" With the widespread use of social networks, detecting the topics discussed in\nthese networks has become a significant challenge. The current works are mainly\nbased on frequent pattern mining or semantic relations, and the language\nstructure is not considered. The meaning of language structural methods is to\ndiscover the relationship between words and how humans understand them.\nTherefore, this paper uses the Concept of the Imitation of the Mental Ability\nof Word Association to propose a topic detection framework in social networks.\nThis framework is based on the Human Word Association method. A special\nextraction algorithm has also been designed for this purpose. The performance\nof this method is evaluated on the FA-CUP dataset. It is a benchmark dataset in\nthe field of topic detection. The results show that the proposed method is a\ngood improvement compared to other methods, based on the Topic-recall and the\nkeyword F1 measure. Also, most of the previous works in the field of topic\ndetection are limited to the English language, and the Persian language,\nespecially microblogs written in this language, is considered a low-resource\nlanguage. Therefore, a data set of Telegram posts in the Farsi language has\nbeen collected. Applying the proposed method to this dataset also shows that\nthis method works better than other topic detection methods.\n","authors":["Mehrdad Ranjbar Khadivi","Shahin Akbarpour","Mohammad-Reza Feizi-Derakhshi","Babak Anari"],"pdf_url":"https://arxiv.org/pdf/2301.13066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09162v1","updated":"2023-07-18T11:38:45Z","published":"2023-07-18T11:38:45Z","title":"Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and\n Addressing Sociological Implications","summary":" Gender bias in artificial intelligence (AI) and natural language processing\nhas garnered significant attention due to its potential impact on societal\nperceptions and biases. This research paper aims to analyze gender bias in\nLarge Language Models (LLMs) with a focus on multiple comparisons between GPT-2\nand GPT-3.5, some prominent language models, to better understand its\nimplications. Through a comprehensive literature review, the study examines\nexisting research on gender bias in AI language models and identifies gaps in\nthe current knowledge. The methodology involves collecting and preprocessing\ndata from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis\ntechniques to evaluate gender bias in the generated text. The findings shed\nlight on gendered word associations, language usage, and biased narratives\npresent in the outputs of these Large Language Models. The discussion explores\nthe ethical implications of gender bias and its potential consequences on\nsocial perceptions and marginalized communities. Additionally, the paper\npresents strategies for reducing gender bias in LLMs, including algorithmic\napproaches and data augmentation techniques. 
The research highlights the\nimportance of interdisciplinary collaborations and the role of sociological\nstudies in mitigating gender bias in AI models. By addressing these issues, we\ncan pave the way for more inclusive and unbiased AI systems that have a\npositive impact on society.\n","authors":["Vishesh Thakur"],"pdf_url":"https://arxiv.org/pdf/2307.09162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09775v2","updated":"2023-07-18T10:19:50Z","published":"2023-02-20T05:46:47Z","title":"Persian topic detection based on Human Word association and graph\n embedding","summary":" In this paper, we propose a framework to detect topics in social media based\non Human Word Association. Identifying topics discussed in these media has\nbecome a critical and significant challenge. Most of the work done in this area\nis in English, but not much has been done in the Persian language, especially\nmicroblogs written in Persian. Also, the existing works focused more on\nexploring frequent patterns or semantic relationships and ignored the\nstructural methods of language. In this paper, a topic detection framework\nusing HWA, a method for Human Word Association, is proposed. This method uses\nthe concept of imitation of mental ability for word association. This method\nalso calculates the Associative Gravity Force that shows how words are related.\nUsing this parameter, a graph can be generated. The topics can be extracted by\nembedding this graph and using clustering methods. This approach has been\napplied to a Persian language dataset collected from Telegram. Several\nexperimental studies have been performed to evaluate the proposed framework's\nperformance. Experimental results show that this approach works better than\nother topic detection methods.\n","authors":["Mehrdad Ranjbar-Khadivi","Shahin Akbarpour","Mohammad-Reza Feizi-Derakhshi","Babak Anari"],"pdf_url":"https://arxiv.org/pdf/2302.09775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07924v2","updated":"2023-07-18T09:51:21Z","published":"2023-07-16T02:11:34Z","title":"Communicative Agents for Software Development","summary":" Software engineering is a domain characterized by intricate decision-making\nprocesses, often relying on nuanced intuition and consultation. Recent\nadvancements in deep learning have started to revolutionize software\nengineering practices through elaborate designs implemented at various stages\nof software development. In this paper, we present an innovative paradigm that\nleverages large language models (LLMs) throughout the entire software\ndevelopment process, streamlining and unifying key processes through natural\nlanguage communication, thereby eliminating the need for specialized models at\neach phase. At the core of this paradigm lies ChatDev, a virtual chat-powered\nsoftware development company that mirrors the established waterfall model,\nmeticulously dividing the development process into four distinct chronological\nstages: designing, coding, testing, and documenting. Each stage engages a team\nof agents, such as programmers, code reviewers, and test engineers, fostering\ncollaborative dialogue and facilitating a seamless workflow. The chat chain\nacts as a facilitator, breaking down each stage into atomic subtasks. This\nenables dual roles, allowing for proposing and validating solutions through\ncontext-aware communication, leading to efficient resolution of specific\nsubtasks. 
The instrumental analysis of ChatDev highlights its remarkable\nefficacy in software generation, enabling the completion of the entire software\ndevelopment process in under seven minutes at a cost of less than one dollar.\nIt not only identifies and alleviates potential vulnerabilities but also\nrectifies potential hallucinations while maintaining commendable efficiency and\ncost-effectiveness. The potential of ChatDev unveils fresh possibilities for\nintegrating LLMs into the realm of software development.\n","authors":["Chen Qian","Xin Cong","Cheng Yang","Weize Chen","Yusheng Su","Juyuan Xu","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.07924v2.pdf","comment":"25 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.09084v1","updated":"2023-07-18T09:06:35Z","published":"2023-07-18T09:06:35Z","title":"Attention over pre-trained Sentence Embeddings for Long Document\n Classification","summary":" Despite being the current de-facto models in most NLP tasks, transformers are\noften limited to short sequences due to their quadratic attention complexity on\nthe number of tokens. Several attempts to address this issue were studied,\neither by reducing the cost of the self-attention computation or by modeling\nsmaller sequences and combining them through a recurrence mechanism or using a\nnew transformer model. In this paper, we suggest to take advantage of\npre-trained sentence transformers to start from semantically meaningful\nembeddings of the individual sentences, and then combine them through a small\nattention layer that scales linearly with the document length. We report the\nresults obtained by this simple architecture on three standard document\nclassification datasets. When compared with the current state-of-the-art models\nusing standard fine-tuning, the studied method obtains competitive results\n(even if there is no clear best model in this configuration). We also showcase\nthat the studied architecture obtains better results when freezing the\nunderlying transformers. A configuration that is useful when we need to avoid\ncomplete fine-tuning (e.g. when the same frozen transformer is shared by\ndifferent applications). Finally, two additional experiments are provided to\nfurther evaluate the relevancy of the studied architecture over simpler\nbaselines.\n","authors":["Amine Abdaoui","Sourav Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.09084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04964v2","updated":"2023-07-18T08:44:47Z","published":"2023-07-11T01:55:24Z","title":"Secrets of RLHF in Large Language Models Part I: PPO","summary":" Large language models (LLMs) have formulated a blueprint for the advancement\nof artificial general intelligence. Its primary objective is to function as a\nhuman-centric (helpful, honest, and harmless) assistant. Alignment with humans\nassumes paramount significance, and reinforcement learning with human feedback\n(RLHF) emerges as the pivotal technological paradigm underpinning this pursuit.\nCurrent technical routes usually include \\textbf{reward models} to measure\nhuman preferences, \\textbf{Proximal Policy Optimization} (PPO) to optimize\npolicy model outputs, and \\textbf{process supervision} to improve step-by-step\nreasoning capabilities. 
However, due to the challenges of reward design,\nenvironment interaction, and agent training, coupled with huge trial and error\ncost of large language models, there is a significant barrier for AI\nresearchers to motivate the development of technical alignment and safe landing\nof LLMs. The stable training of RLHF has still been a puzzle. In the first\nreport, we dissect the framework of RLHF, re-evaluate the inner workings of\nPPO, and explore how the parts comprising PPO algorithms impact policy agent\ntraining. We identify policy constraints being the key factor for the effective\nimplementation of the PPO algorithm. Therefore, we explore the PPO-max, an\nadvanced version of PPO algorithm, to efficiently improve the training\nstability of the policy model. Based on our main results, we perform a\ncomprehensive analysis of RLHF abilities compared with SFT models and ChatGPT.\nThe absence of open-source implementations has posed significant challenges to\nthe investigation of LLMs alignment. Therefore, we are eager to release\ntechnical reports, reward models and PPO codes, aiming to make modest\ncontributions to the advancement of LLMs.\n","authors":["Rui Zheng","Shihan Dou","Songyang Gao","Yuan Hua","Wei Shen","Binghai Wang","Yan Liu","Senjie Jin","Qin Liu","Yuhao Zhou","Limao Xiong","Lu Chen","Zhiheng Xi","Nuo Xu","Wenbin Lai","Minghao Zhu","Cheng Chang","Zhangyue Yin","Rongxiang Weng","Wensen Cheng","Haoran Huang","Tianxiang Sun","Hang Yan","Tao Gui","Qi Zhang","Xipeng Qiu","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2307.04964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09059v1","updated":"2023-07-18T08:23:46Z","published":"2023-07-18T08:23:46Z","title":"Unleashing the Imagination of Text: A Novel Framework for Text-to-image\n Person Retrieval via Exploring the Power of Words","summary":" The goal of Text-to-image person retrieval is to retrieve person images from\na large gallery that match the given textual descriptions. The main challenge\nof this task lies in the significant differences in information representation\nbetween the visual and textual modalities. The textual modality conveys\nabstract and precise information through vocabulary and grammatical structures,\nwhile the visual modality conveys concrete and intuitive information through\nimages. To fully leverage the expressive power of textual representations, it\nis essential to accurately map abstract textual descriptions to specific\nimages.\n To address this issue, we propose a novel framework to Unleash the\nImagination of Text (UIT) in text-to-image person retrieval, aiming to fully\nexplore the power of words in sentences. Specifically, the framework employs\nthe pre-trained full CLIP model as a dual encoder for the images and texts ,\ntaking advantage of prior cross-modal alignment knowledge. The Text-guided\nImage Restoration auxiliary task is proposed with the aim of implicitly mapping\nabstract textual entities to specific image regions, facilitating alignment\nbetween textual and visual embeddings. Additionally, we introduce a cross-modal\ntriplet loss tailored for handling hard samples, enhancing the model's ability\nto distinguish minor differences.\n To focus the model on the key components within sentences, we propose a novel\ntext data augmentation technique. 
Our proposed methods achieve state-of-the-art\nresults on three popular benchmark datasets, and the source code will be made\npublicly available shortly.\n","authors":["Delong Liu","Haiwen Li"],"pdf_url":"https://arxiv.org/pdf/2307.09059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03109v5","updated":"2023-07-18T08:11:21Z","published":"2023-07-06T16:28:35Z","title":"A Survey on Evaluation of Large Language Models","summary":" Large language models (LLMs) are gaining increasing popularity in both\nacademia and industry, owing to their unprecedented performance in various\napplications. As LLMs continue to play a vital role in both research and daily\nuse, their evaluation becomes increasingly critical, not only at the task\nlevel, but also at the society level for better understanding of their\npotential risks. Over the past years, significant efforts have been made to\nexamine LLMs from various perspectives. This paper presents a comprehensive\nreview of these evaluation methods for LLMs, focusing on three key dimensions:\nwhat to evaluate, where to evaluate, and how to evaluate. Firstly, we provide\nan overview from the perspective of evaluation tasks, encompassing general\nnatural language processing tasks, reasoning, medical usage, ethics,\neducations, natural and social sciences, agent applications, and other areas.\nSecondly, we answer the `where' and `how' questions by diving into the\nevaluation methods and benchmarks, which serve as crucial components in\nassessing performance of LLMs. Then, we summarize the success and failure cases\nof LLMs in different tasks. Finally, we shed light on several future challenges\nthat lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to\nresearchers in the realm of LLMs evaluation, thereby aiding the development of\nmore proficient LLMs. Our key point is that evaluation should be treated as an\nessential discipline to better assist the development of LLMs. We consistently\nmaintain the related open-source materials at:\nhttps://github.com/MLGroupJLU/LLM-eval-survey.\n","authors":["Yupeng Chang","Xu Wang","Jindong Wang","Yuan Wu","Kaijie Zhu","Hao Chen","Linyi Yang","Xiaoyuan Yi","Cunxiang Wang","Yidong Wang","Wei Ye","Yue Zhang","Yi Chang","Philip S. Yu","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03109v5.pdf","comment":"25 pages; more work is at: https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2208.02531v3","updated":"2023-07-18T08:06:19Z","published":"2022-08-04T08:56:04Z","title":"InitialGAN: A Language GAN with Completely Random Initialization","summary":" Text generative models trained via Maximum Likelihood Estimation (MLE) suffer\nfrom the notorious exposure bias problem, and Generative Adversarial Networks\n(GANs) are shown to have potential to tackle this problem. Existing language\nGANs adopt estimators like REINFORCE or continuous relaxations to model word\nprobabilities. The inherent limitations of such estimators lead current models\nto rely on pre-training techniques (MLE pre-training or pre-trained\nembeddings). Representation modeling methods which are free from those\nlimitations, however, are seldomly explored because of their poor performance\nin previous attempts. Our analyses reveal that invalid sampling methods and\nunhealthy gradients are the main contributors to such unsatisfactory\nperformance. In this work, we present two techniques to tackle these problems:\ndropout sampling and fully normalized LSTM. 
Based on these two techniques, we\npropose InitialGAN whose parameters are randomly initialized in full. Besides,\nwe introduce a new evaluation metric, Least Coverage Rate, to better evaluate\nthe quality of generated samples. The experimental results demonstrate that\nInitialGAN outperforms both MLE and other compared models. To the best of our\nknowledge, it is the first time a language GAN can outperform MLE without using\nany pre-training techniques.\n","authors":["Da Ren","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2208.02531v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09021v1","updated":"2023-07-18T07:20:43Z","published":"2023-07-18T07:20:43Z","title":"Towards a Neural Era in Dialogue Management for Collaboration: A\n Literature Survey","summary":" Dialogue-based human-AI collaboration can revolutionize collaborative\nproblem-solving, creative exploration, and social support. To realize this\ngoal, the development of automated agents proficient in skills such as\nnegotiating, following instructions, establishing common ground, and\nprogressing shared tasks is essential. This survey begins by reviewing the\nevolution of dialogue management paradigms in collaborative dialogue systems,\nfrom traditional handcrafted and information-state based methods to AI\nplanning-inspired approaches. It then shifts focus to contemporary data-driven\ndialogue management techniques, which seek to transfer deep learning successes\nfrom form-filling and open-domain settings to collaborative contexts. The paper\nproceeds to analyze a selected set of recent works that apply neural approaches\nto collaborative dialogue management, spotlighting prevailing trends in the\nfield. This survey hopes to provide foundational background for future\nadvancements in collaborative dialogue management, particularly as the dialogue\nsystems community continues to embrace the potential of large language models.\n","authors":["Amogh Mannekote"],"pdf_url":"https://arxiv.org/pdf/2307.09021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09014v1","updated":"2023-07-18T07:03:29Z","published":"2023-07-18T07:03:29Z","title":"Exploring acceptance of autonomous vehicle policies using KeyBERT and\n SNA: Targeting engineering students","summary":" This study aims to explore user acceptance of Autonomous Vehicle (AV)\npolicies with improved text-mining methods. Recently, South Korean policymakers\nhave viewed Autonomous Driving Car (ADC) and Autonomous Driving Robot (ADR) as\nnext-generation means of transportation that will reduce the cost of\ntransporting passengers and goods. They support the construction of V2I and V2V\ncommunication infrastructures for ADC and recognize that ADR is equivalent to\npedestrians to promote its deployment into sidewalks. To fill the gap where\nend-user acceptance of these policies is not well considered, this study\napplied two text-mining methods to the comments of graduate students in the\nfields of Industrial, Mechanical, and Electronics-Electrical-Computer. One is\nthe Co-occurrence Network Analysis (CNA) based on TF-IWF and Dice coefficient,\nand the other is the Contextual Semantic Network Analysis (C-SNA) based on both\nKeyBERT, which extracts keywords that contextually represent the comments, and\ndouble cosine similarity. The reason for comparing these approaches is to\nbalance interest not only in the implications for the AV policies but also in\nthe need to apply quality text mining to this research domain. 
Significantly,\nthe limitation of frequency-based text mining, which does not reflect textual\ncontext, and the trade-off of adjusting thresholds in Semantic Network Analysis\n(SNA) were considered. As the results of comparing the two approaches, the\nC-SNA provided the information necessary to understand users' voices using\nfewer nodes and features than the CNA. The users who pre-emptively understood\nthe AV policies based on their engineering literacy and the given texts\nrevealed potential risks of the AV accident policies. This study adds\nsuggestions to manage these risks to support the successful deployment of AVs\non public roads.\n","authors":["Jinwoo Ha","Dongsoo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09014v1.pdf","comment":"29 pages with 11 figures"},{"id":"http://arxiv.org/abs/2307.09009v1","updated":"2023-07-18T06:56:08Z","published":"2023-07-18T06:56:08Z","title":"How is ChatGPT's behavior changing over time?","summary":" GPT-3.5 and GPT-4 are the two most widely used large language model (LLM)\nservices. However, when and how these models are updated over time is opaque.\nHere, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4 on\nfour diverse tasks: 1) solving math problems, 2) answering sensitive/dangerous\nquestions, 3) generating code and 4) visual reasoning. We find that the\nperformance and behavior of both GPT-3.5 and GPT-4 can vary greatly over time.\nFor example, GPT-4 (March 2023) was very good at identifying prime numbers\n(accuracy 97.6%) but GPT-4 (June 2023) was very poor on these same questions\n(accuracy 2.4%). Interestingly GPT-3.5 (June 2023) was much better than GPT-3.5\n(March 2023) in this task. GPT-4 was less willing to answer sensitive questions\nin June than in March, and both GPT-4 and GPT-3.5 had more formatting mistakes\nin code generation in June than in March. Overall, our findings shows that the\nbehavior of the same LLM service can change substantially in a relatively short\namount of time, highlighting the need for continuous monitoring of LLM quality.\n","authors":["Lingjiao Chen","Matei Zaharia","James Zou"],"pdf_url":"https://arxiv.org/pdf/2307.09009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09007v1","updated":"2023-07-18T06:48:52Z","published":"2023-07-18T06:48:52Z","title":"On the (In)Effectiveness of Large Language Models for Chinese Text\n Correction","summary":" Recently, the development and progress of Large Language Models (LLMs) have\namazed the entire Artificial Intelligence community. As an outstanding\nrepresentative of LLMs and the foundation model that set off this wave of\nresearch on LLMs, ChatGPT has attracted more and more researchers to study its\ncapabilities and performance on various downstream Natural Language Processing\n(NLP) tasks. While marveling at ChatGPT's incredible performance on kinds of\ntasks, we notice that ChatGPT also has excellent multilingual processing\ncapabilities, such as Chinese. To explore the Chinese processing ability of\nChatGPT, we focus on Chinese Text Correction, a fundamental and challenging\nChinese NLP task. Specifically, we evaluate ChatGPT on the Chinese Grammatical\nError Correction (CGEC) and Chinese Spelling Check (CSC) tasks, which are two\nmain Chinese Text Correction scenarios. From extensive analyses and comparisons\nwith previous state-of-the-art fine-tuned models, we empirically find that the\nChatGPT currently has both amazing performance and unsatisfactory behavior for\nChinese Text Correction. 
We believe our findings will promote the landing and\napplication of LLMs in the Chinese NLP community.\n","authors":["Yinghui Li","Haojing Huang","Shirong Ma","Yong Jiang","Yangning Li","Feng Zhou","Hai-Tao Zheng","Qingyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.09007v1.pdf","comment":"Work in progress!"},{"id":"http://arxiv.org/abs/2307.08945v1","updated":"2023-07-18T03:28:03Z","published":"2023-07-18T03:28:03Z","title":"Mitigating Label Bias via Decoupled Confident Learning","summary":" Growing concerns regarding algorithmic fairness have led to a surge in\nmethodologies to mitigate algorithmic bias. However, such methodologies largely\nassume that observed labels in training data are correct. This is problematic\nbecause bias in labels is pervasive across important domains, including\nhealthcare, hiring, and content moderation. In particular, human-generated\nlabels are prone to encoding societal biases. While the presence of labeling\nbias has been discussed conceptually, there is a lack of methodologies to\naddress this problem. We propose a pruning method -- Decoupled Confident\nLearning (DeCoLe) -- specifically designed to mitigate label bias. After\nillustrating its performance on a synthetic dataset, we apply DeCoLe in the\ncontext of hate speech detection, where label bias has been recognized as an\nimportant challenge, and show that it successfully identifies biased labels and\noutperforms competing approaches.\n","authors":["Yunyi Li","Maria De-Arteaga","Maytal Saar-Tsechansky"],"pdf_url":"https://arxiv.org/pdf/2307.08945v1.pdf","comment":"AI & HCI Workshop at the 40th International Conference on Machine\n Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2307.08941v1","updated":"2023-07-18T03:12:51Z","published":"2023-07-18T03:12:51Z","title":"NTK-approximating MLP Fusion for Efficient Language Model Fine-tuning","summary":" Fine-tuning a pre-trained language model (PLM) emerges as the predominant\nstrategy in many natural language processing applications. However, even\nfine-tuning the PLMs and doing inference are expensive, especially on edge\ndevices with low computing power. Some general approaches (e.g. quantization\nand distillation) have been widely studied to reduce the compute/memory of PLM\nfine-tuning, while very few one-shot compression techniques are explored. In\nthis paper, we investigate the neural tangent kernel (NTK)--which reveals the\ngradient descent dynamics of neural networks--of the multilayer perceptrons\n(MLP) modules in a PLM and propose to coin a lightweight PLM through\nNTK-approximating MLP fusion. To achieve this, we reconsider the MLP as a\nbundle of sub-MLPs, and cluster them into a given number of centroids, which\ncan then be restored as a compressed MLP and surprisingly shown to well\napproximate the NTK of the original PLM. Extensive experiments of PLM\nfine-tuning on both natural language understanding (NLU) and generation (NLG)\ntasks are provided to verify the effectiveness of the proposed method MLP\nfusion. 
Our code is available at https://github.com/weitianxin/MLP_Fusion.\n","authors":["Tianxin Wei","Zeming Guo","Yifan Chen","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2307.08941v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.08931v1","updated":"2023-07-18T02:38:02Z","published":"2023-07-18T02:38:02Z","title":"Teach model to answer questions after comprehending the document","summary":" Multi-choice Machine Reading Comprehension (MRC) is a challenging extension\nof Natural Language Processing (NLP) that requires the ability to comprehend\nthe semantics and logical relationships between entities in a given text. The\nMRC task has traditionally been viewed as a process of answering questions\nbased on the given text. This single-stage approach has often led the network\nto concentrate on generating the correct answer, potentially neglecting the\ncomprehension of the text itself. As a result, many prevalent models have faced\nchallenges in performing well on this task when dealing with longer texts. In\nthis paper, we propose a two-stage knowledge distillation method that teaches\nthe model to better comprehend the document by dividing the MRC task into two\nseparate stages. Our experimental results show that the student model, when\nequipped with our method, achieves significant improvements, demonstrating the\neffectiveness of our method.\n","authors":["Ruiqing Sun","Ping Jian"],"pdf_url":"https://arxiv.org/pdf/2307.08931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08925v1","updated":"2023-07-18T02:09:14Z","published":"2023-07-18T02:09:14Z","title":"Federated Large Language Model: A Position Paper","summary":" Large scale language models (LLM) have received significant attention and\nfound diverse applications across various domains, but their development\nencounters challenges in real-world scenarios. These challenges arise due to\nthe scarcity of public domain data availability and the need to maintain\nprivacy with respect to private domain data. To address these issues, federated\nlearning (FL) has emerged as a promising technology that enables collaborative\ntraining of shared models while preserving decentralized data. We propose the\nconcept of federated LLM, which comprises three key components, i.e., federated\nLLM pre-training, federated LLM fine-tuning, and federated LLM prompt\nengineering. For each component, we discuss its advantage over traditional LLM\ntraining methods and propose specific engineering strategies for\nimplementation. Furthermore, we explore the novel challenges introduced by the\nintegration of FL and LLM. We analyze existing solutions and identify potential\nobstacles faced by these solutions within the context of federated LLM.\n","authors":["Chaochao Chen","Xiaohua Feng","Jun Zhou","Jianwei Yin","Xiaolin Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08925v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.09360v3","updated":"2023-07-18T02:01:14Z","published":"2023-05-16T11:35:24Z","title":"GIFT: Graph-Induced Fine-Tuning for Multi-Party Conversation\n Understanding","summary":" Addressing the issues of who saying what to whom in multi-party conversations\n(MPCs) has recently attracted a lot of research attention. However, existing\nmethods on MPC understanding typically embed interlocutors and utterances into\nsequential information flows, or utilize only the superficial of inherent graph\nstructures in MPCs. 
To this end, we present a plug-and-play and lightweight\nmethod named graph-induced fine-tuning (GIFT) which can adapt various\nTransformer-based pre-trained language models (PLMs) for universal MPC\nunderstanding. In detail, the full and equivalent connections among utterances\nin regular Transformer ignore the sparse but distinctive dependency of an\nutterance on another in MPCs. To distinguish different relationships between\nutterances, four types of edges are designed to integrate graph-induced signals\ninto attention mechanisms to refine PLMs originally designed for processing\nsequential texts. We evaluate GIFT by implementing it into three PLMs, and test\nthe performance on three downstream tasks including addressee recognition,\nspeaker identification and response selection. Experimental results show that\nGIFT can significantly improve the performance of three PLMs on three\ndownstream tasks and two benchmarks with only 4 additional parameters per\nencoding layer, achieving new state-of-the-art performance on MPC\nunderstanding.\n","authors":["Jia-Chen Gu","Zhen-Hua Ling","Quan Liu","Cong Liu","Guoping Hu"],"pdf_url":"https://arxiv.org/pdf/2305.09360v3.pdf","comment":"Accepted by ACL 2023. arXiv admin note: substantial text overlap with\n arXiv:2106.01541"},{"id":"http://arxiv.org/abs/2307.08922v1","updated":"2023-07-18T01:43:00Z","published":"2023-07-18T01:43:00Z","title":"Large Language Models Perform Diagnostic Reasoning","summary":" We explore the extension of chain-of-thought (CoT) prompting to medical\nreasoning for the task of automatic diagnosis. Motivated by doctors' underlying\nreasoning process, we present Diagnostic-Reasoning CoT (DR-CoT). Empirical\nresults demonstrate that by simply prompting large language models trained only\non general text corpus with two DR-CoT exemplars, the diagnostic accuracy\nimproves by 15% comparing to standard prompting. Moreover, the gap reaches a\npronounced 18% in out-domain settings. Our findings suggest expert-knowledge\nreasoning in large language models can be elicited through proper promptings.\n","authors":["Cheng-Kuang Wu","Wei-Lin Chen","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2307.08922v1.pdf","comment":"Accepted as a Tiny Paper at ICLR 2023 (10 pages, 5 figures)"},{"id":"http://arxiv.org/abs/2305.03642v3","updated":"2023-07-18T01:36:42Z","published":"2023-05-05T16:02:06Z","title":"Jointly Extracting Interventions, Outcomes, and Findings from RCT\n Reports with LLMs","summary":" Results from Randomized Controlled Trials (RCTs) establish the comparative\neffectiveness of interventions, and are in turn critical inputs for\nevidence-based care. However, results from RCTs are presented in (often\nunstructured) natural language articles describing the design, execution, and\noutcomes of trials; clinicians must manually extract findings pertaining to\ninterventions and outcomes of interest from such articles. This onerous manual\nprocess has motivated work on (semi-)automating extraction of structured\nevidence from trial reports. In this work we propose and evaluate a\ntext-to-text model built on instruction-tuned Large Language Models (LLMs) to\njointly extract Interventions, Outcomes, and Comparators (ICO elements) from\nclinical abstracts, and infer the associated results reported. 
Manual (expert)\nand automated evaluations indicate that framing evidence extraction as a\nconditional generation task and fine-tuning LLMs for this purpose realizes\nconsiderable ($\\sim$20 point absolute F1 score) gains over the previous SOTA.\nWe perform ablations and error analyses to assess aspects that contribute to\nmodel performance, and to highlight potential directions for further\nimprovements. We apply our model to a collection of published RCTs through\nmid-2022, and release a searchable database of structured findings:\nhttp://ico-relations.ebm-nlp.com\n","authors":["Somin Wadhwa","Jay DeYoung","Benjamin Nye","Silvio Amir","Byron C. Wallace"],"pdf_url":"https://arxiv.org/pdf/2305.03642v3.pdf","comment":"Accepted to MLHC 2023"},{"id":"http://arxiv.org/abs/2307.09532v1","updated":"2023-07-18T18:21:26Z","published":"2023-07-18T18:21:26Z","title":"Can Model Fusing Help Transformers in Long Document Classification? An\n Empirical Study","summary":" Text classification is an area of research which has been studied over the\nyears in Natural Language Processing (NLP). Adapting NLP to multiple domains\nhas introduced many new challenges for text classification and one of them is\nlong document classification. While state-of-the-art transformer models provide\nexcellent results in text classification, most of them have limitations in the\nmaximum sequence length of the input sequence. The majority of the transformer\nmodels are limited to 512 tokens, and therefore, they struggle with long\ndocument classification problems. In this research, we explore on employing\nModel Fusing for long document classification while comparing the results with\nwell-known BERT and Longformer architectures.\n","authors":["Damith Premasiri","Tharindu Ranasinghe","Ruslan Mitkov"],"pdf_url":"https://arxiv.org/pdf/2307.09532v1.pdf","comment":"Accepted in RANLP 2023"},{"id":"http://arxiv.org/abs/2307.10303v1","updated":"2023-07-18T18:51:06Z","published":"2023-07-18T18:51:06Z","title":"Analyzing sports commentary in order to automatically recognize events\n and extract insights","summary":" In this paper, we carefully investigate how we can use multiple different\nNatural Language Processing techniques and methods in order to automatically\nrecognize the main actions in sports events. We aim to extract insights by\nanalyzing live sport commentaries from different sources and by classifying\nthese major actions into different categories. We also study if sentiment\nanalysis could help detect these main actions.\n","authors":["Yanis Miraoui"],"pdf_url":"https://arxiv.org/pdf/2307.10303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06501v2","updated":"2023-07-18T15:05:49Z","published":"2022-08-12T21:02:35Z","title":"ForecastTKGQuestions: A Benchmark for Temporal Question Answering and\n Forecasting over Temporal Knowledge Graphs","summary":" Question answering over temporal knowledge graphs (TKGQA) has recently found\nincreasing interest. TKGQA requires temporal reasoning techniques to extract\nthe relevant information from temporal knowledge bases. The only existing TKGQA\ndataset, i.e., CronQuestions, consists of temporal questions based on the facts\nfrom a fixed time period, where a temporal knowledge graph (TKG) spanning the\nsame period can be fully used for answer inference, allowing the TKGQA models\nto use even the future knowledge to answer the questions based on the past\nfacts. 
In real-world scenarios, however, it is also common that given the\nknowledge until now, we wish the TKGQA systems to answer the questions asking\nabout the future. As humans constantly seek plans for the future, building\nTKGQA systems for answering such forecasting questions is important.\nNevertheless, this has still been unexplored in previous research. In this\npaper, we propose a novel task: forecasting question answering over temporal\nknowledge graphs. We also propose a large-scale TKGQA benchmark dataset, i.e.,\nForecastTKGQuestions, for this task. It includes three types of questions,\ni.e., entity prediction, yes-no, and fact reasoning questions. For every\nforecasting question in our dataset, QA models can only have access to the TKG\ninformation before the timestamp annotated in the given question for answer\ninference. We find that the state-of-the-art TKGQA methods perform poorly on\nforecasting questions, and they are unable to answer yes-no questions and fact\nreasoning questions. To this end, we propose ForecastTKGQA, a TKGQA model that\nemploys a TKG forecasting module for future inference, to answer all three\ntypes of questions. Experimental results show that ForecastTKGQA outperforms\nrecent TKGQA methods on the entity prediction questions, and it also shows\ngreat effectiveness in answering the other two types of questions.\n","authors":["Zifeng Ding","Zongyue Li","Ruoxia Qi","Jingpei Wu","Bailan He","Yunpu Ma","Zhao Meng","Shuo Chen","Ruotong Liao","Zhen Han","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2208.06501v2.pdf","comment":"Accepted to ISWC 2023"},{"id":"http://arxiv.org/abs/2307.10292v1","updated":"2023-07-18T14:32:21Z","published":"2023-07-18T14:32:21Z","title":"The Language Labyrinth: Constructive Critique on the Terminology Used in\n the AI Discourse","summary":" In the interdisciplinary field of artificial intelligence (AI) the problem of\nclear terminology is especially momentous. This paper claims, that AI debates\nare still characterised by a lack of critical distance to metaphors like\n'training', 'learning' or 'deciding'. As consequence, reflections regarding\nresponsibility or potential use-cases are greatly distorted. Yet, if relevant\ndecision-makers are convinced that AI can develop an 'understanding' or\nproperly 'interpret' issues, its regular use for sensitive tasks like deciding\nabout social benefits or judging court cases looms. The chapter argues its\nclaim by analysing central notions of the AI debate and tries to contribute by\nproposing more fitting terminology and hereby enabling more fruitful debates.\nIt is a conceptual work at the intersection of critical computer science and\nphilosophy of language.\n","authors":["Rainer Rehak"],"pdf_url":"https://arxiv.org/pdf/2307.10292v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2307.10291v1","updated":"2023-07-18T14:30:36Z","published":"2023-07-18T14:30:36Z","title":"Mutual Reinforcement Effects in Japanese Sentence Classification and\n Named Entity Recognition Tasks","summary":" Information extraction(IE) is a crucial subfield within natural language\nprocessing. However, for the traditionally segmented approach to sentence\nclassification and Named Entity Recognition, the intricate interactions between\nthese individual subtasks remain largely uninvestigated. 
In this study, we\npropose an integrative analysis, converging sentence classification with Named\nEntity Recognition, with the objective to unveil and comprehend the mutual\nreinforcement effect within these two information extraction subtasks. To\nachieve this, we introduce a Sentence Classification and Named Entity\nRecognition Multi-task (SCNM) approach that combines Sentence Classification\n(SC) and Named Entity Recognition (NER). We develop a Sentence-to-Label\nGeneration (SLG) framework for SCNM and construct a Wikipedia dataset\ncontaining both SC and NER. Using a format converter, we unify input formats\nand employ a generative model to generate SC-labels, NER-labels, and associated\ntext segments. We propose a Constraint Mechanism (CM) to improve generated\nformat accuracy. Our results show SC accuracy increased by 1.13 points and NER\nby 1.06 points in SCNM compared to standalone tasks, with CM raising format\naccuracy from 63.61 to 100. The findings indicate mutual reinforcement effects\nbetween SC and NER, and integration enhances both tasks' performance. We\nadditionally implemented the SLG framework on single SC task. It yielded\nsuperior accuracies compared to the baseline on two distinct Japanese SC\ndatasets. Notably, in the experiment of few-shot learning, SLG framework shows\nmuch better performance than fine-tune method. These empirical findings\ncontribute additional evidence to affirm the efficacy of the SLG framework.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2307.10291v1.pdf","comment":"25 pages, 12 figures, 19 tables. arXiv admin note: substantial text\n overlap with arXiv:2306.15978"},{"id":"http://arxiv.org/abs/2307.10274v1","updated":"2023-07-18T06:45:43Z","published":"2023-07-18T06:45:43Z","title":"Zero-shot Domain-sensitive Speech Recognition with Prompt-conditioning\n Fine-tuning","summary":" In this work, we propose a method to create domain-sensitive speech\nrecognition models that utilize textual domain information by conditioning its\ngeneration on a given text prompt. This is accomplished by fine-tuning a\npre-trained, end-to-end model (Whisper) to learn from demonstrations with\nprompt examples. We show that this ability can be generalized to different\ndomains and even various prompt contexts, with our model gaining a Word Error\nRate (WER) reduction of up to 33% on unseen datasets from various domains, such\nas medical conversation, air traffic control communication, and financial\nmeetings. Considering the limited availability of audio-transcript pair data,\nwe further extend our method to text-only fine-tuning to achieve domain\nsensitivity as well as domain adaptation. We demonstrate that our text-only\nfine-tuned model can also attend to various prompt contexts, with the model\nreaching the most WER reduction of 29% on the medical conversation dataset.\n","authors":["Feng-Ting Liao","Yung-Chieh Chan","Yi-Chang Chen","Chan-Jan Hsu","Da-shan Shiu"],"pdf_url":"https://arxiv.org/pdf/2307.10274v1.pdf","comment":"F-T Liao and Y-C Chan contributed equally"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.09481v1","updated":"2023-07-18T17:59:02Z","published":"2023-07-18T17:59:02Z","title":"AnyDoor: Zero-shot Object-level Image Customization","summary":" This work presents AnyDoor, a diffusion-based image generator with the power\nto teleport target objects to new scenes at user-specified locations in a\nharmonious way. 
Instead of tuning parameters for each object, our model is\ntrained only once and effortlessly generalizes to diverse object-scene\ncombinations at the inference stage. Such a challenging zero-shot setting\nrequires an adequate characterization of a certain object. To this end, we\ncomplement the commonly used identity feature with detail features, which are\ncarefully designed to maintain texture details yet allow versatile local\nvariations (e.g., lighting, orientation, posture, etc.), supporting the object\nin favorably blending with different surroundings. We further propose to borrow\nknowledge from video datasets, where we can observe various forms (i.e., along\nthe time axis) of a single object, leading to stronger model generalizability\nand robustness. Extensive experiments demonstrate the superiority of our\napproach over existing alternatives as well as its great potential in\nreal-world applications, such as virtual try-on and object moving. Project page\nis https://damo-vilab.github.io/AnyDoor-Page/.\n","authors":["Xi Chen","Lianghua Huang","Yu Liu","Yujun Shen","Deli Zhao","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.09481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09480v1","updated":"2023-07-18T17:58:22Z","published":"2023-07-18T17:58:22Z","title":"FACTS: Facial Animation Creation using the Transfer of Styles","summary":" The ability to accurately capture and express emotions is a critical aspect\nof creating believable characters in video games and other forms of\nentertainment. Traditionally, this animation has been achieved with artistic\neffort or performance capture, both requiring costs in time and labor. More\nrecently, audio-driven models have seen success, however, these often lack\nexpressiveness in areas not correlated to the audio signal. In this paper, we\npresent a novel approach to facial animation by taking existing animations and\nallowing for the modification of style characteristics. Specifically, we\nexplore the use of a StarGAN to enable the conversion of 3D facial animations\ninto different emotions and person-specific styles. We are able to maintain the\nlip-sync of the animations with this method thanks to the use of a novel\nviseme-preserving loss.\n","authors":["Jack Saunders","Steven Caulkin","Vinay Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2307.09480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09474v1","updated":"2023-07-18T17:56:06Z","published":"2023-07-18T17:56:06Z","title":"ChatSpot: Bootstrapping Multimodal LLMs via Precise Referring\n Instruction Tuning","summary":" Human-AI interactivity is a critical aspect that reflects the usability of\nmultimodal large language models (MLLMs). However, existing end-to-end MLLMs\nonly allow users to interact with them through language instructions, leading\nto the limitation of the interactive accuracy and efficiency. In this study, we\npresent precise referring instructions that utilize diverse reference\nrepresentations such as points and boxes as referring prompts to refer to the\nspecial region. This enables MLLMs to focus on the region of interest and\nachieve finer-grained interaction. Based on precise referring instruction, we\npropose ChatSpot, a unified end-to-end multimodal large language model that\nsupports diverse forms of interactivity including mouse clicks, drag-and-drop,\nand drawing boxes, which provides a more flexible and seamless interactive\nexperience. 
We also construct a multi-grained vision-language\ninstruction-following dataset based on existing datasets and GPT-4 generating.\nFurthermore, we design a series of evaluation tasks to assess the effectiveness\nof region recognition and interaction. Experimental results showcase ChatSpot's\npromising performance.\n","authors":["Liang Zhao","En Yu","Zheng Ge","Jinrong Yang","Haoran Wei","Hongyu Zhou","Jianjian Sun","Yuang Peng","Runpei Dong","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09474v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.09472v1","updated":"2023-07-18T17:55:29Z","published":"2023-07-18T17:55:29Z","title":"GroupLane: End-to-End 3D Lane Detection with Channel-wise Grouping","summary":" Efficiency is quite important for 3D lane detection due to practical\ndeployment demand. In this work, we propose a simple, fast, and end-to-end\ndetector that still maintains high detection precision. Specifically, we devise\na set of fully convolutional heads based on row-wise classification. In\ncontrast to previous counterparts, ours supports recognizing both vertical and\nhorizontal lanes. Besides, our method is the first one to perform row-wise\nclassification in bird-eye-view. In the heads, we split feature into multiple\ngroups and every group of feature corresponds to a lane instance. During\ntraining, the predictions are associated with lane labels using the proposed\nsingle-win one-to-one matching to compute loss, and no post-processing\noperation is demanded for inference. In this way, our proposed fully\nconvolutional detector, GroupLane, realizes end-to-end detection like DETR.\nEvaluated on 3 real world 3D lane benchmarks, OpenLane, Once-3DLanes, and\nOpenLane-Huawei, GroupLane adopting ConvNext-Base as the backbone outperforms\nthe published state-of-the-art PersFormer by 13.6% F1 score in the OpenLane\nvalidation set. Besides, GroupLane with ResNet18 still surpasses PersFormer by\n4.9% F1 score, while the inference speed is nearly 7x faster and the FLOPs is\nonly 13.3% of it.\n","authors":["Zhuoling Li","Chunrui Han","Zheng Ge","Jinrong Yang","En Yu","Haoqian Wang","Hengshuang Zhao","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09465v1","updated":"2023-07-18T17:47:24Z","published":"2023-07-18T17:47:24Z","title":"Occlusion Aware Student Emotion Recognition based on Facial Action Unit\n Detection","summary":" Given that approximately half of science, technology, engineering, and\nmathematics (STEM) undergraduate students in U.S. colleges and universities\nleave by the end of the first year [15], it is crucial to improve the quality\nof classroom environments. This study focuses on monitoring students' emotions\nin the classroom as an indicator of their engagement and proposes an approach\nto address this issue. The impact of different facial parts on the performance\nof an emotional recognition model is evaluated through experimentation. To test\nthe proposed model under partial occlusion, an artificially occluded dataset is\nintroduced. The novelty of this work lies in the proposal of an occlusion-aware\narchitecture for facial action units (AUs) extraction, which employs attention\nmechanism and adaptive feature learning. The AUs can be used later to classify\nfacial expressions in classroom settings.\n This research paper's findings provide valuable insights into handling\nocclusion in analyzing facial images for emotional engagement analysis. 
The\nproposed experiments demonstrate the significance of considering occlusion and\nenhancing the reliability of facial analysis models in classroom environments.\nThese findings can also be extended to other settings where occlusions are\nprevalent.\n","authors":["Shrouk Wally","Ahmed Elsayed","Islam Alkabbany","Asem Ali","Aly Farag"],"pdf_url":"https://arxiv.org/pdf/2307.09465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09456v1","updated":"2023-07-18T17:35:45Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SR-GAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SR\nGAN (Super Resolution Generative Adversarial Network) models, ESRGAN,\nReal-ESRGAN and EDSR, on a benchmark dataset of real-world images which undergo\ndegradation using a pipeline. Our results show that some models seem to\nsignificantly increase the resolution of the input images while preserving\ntheir visual quality, this is assessed using Tesseract OCR engine. We observe\nthat EDSR-BASE model from huggingface outperforms the remaining candidate\nmodels in terms of both quantitative metrics and subjective visual quality\nassessments with least compute overhead. Specifically, EDSR generates images\nwith higher peak signal-to-noise ratio (PSNR) and structural similarity index\n(SSIM) values and are seen to return high quality OCR results with Tesseract\nOCR engine. These findings suggest that EDSR is a robust and effective approach\nfor single-image super-resolution and may be particularly well-suited for\napplications where high-quality visual fidelity is critical and optimized\ncompute.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Noris"],"pdf_url":"https://arxiv.org/pdf/2307.09456v1.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2207.02159v4","updated":"2023-07-18T17:23:25Z","published":"2022-07-05T16:26:05Z","title":"Robustness Analysis of Video-Language Models Against Visual and Language\n Perturbations","summary":" Joint visual and language modeling on large-scale datasets has recently shown\ngood progress in multi-modal tasks when compared to single modal learning.\nHowever, robustness of these approaches against real-world perturbations has\nnot been studied. In this work, we perform the first extensive robustness study\nof video-language models against various real-world perturbations. We focus on\ntext-to-video retrieval and propose two large-scale benchmark datasets,\nMSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different\ntext perturbations. The study reveals some interesting initial findings from\nthe studied models: 1) models are generally more susceptible when only video is\nperturbed as opposed to when only text is perturbed, 2) models that are\npre-trained are more robust than those trained from scratch, 3) models attend\nmore to scene and objects rather than motion and action. We hope this study\nwill serve as a benchmark and guide future research in robust video-language\nlearning. The benchmark introduced in this study along with the code and\ndatasets is available at https://bit.ly/3CNOly4.\n","authors":["Madeline C. Schiappa","Shruti Vyas","Hamid Palangi","Yogesh S. Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2207.02159v4.pdf","comment":"NeurIPS 2022 Datasets and Benchmarks Track. 
This projects webpage is\n located at https://bit.ly/3CNOly4"},{"id":"http://arxiv.org/abs/2307.01740v2","updated":"2023-07-18T17:22:39Z","published":"2023-07-04T14:16:49Z","title":"Synchronous Image-Label Diffusion Probability Model with Application to\n Stroke Lesion Segmentation on Non-contrast CT","summary":" Stroke lesion volume is a key radiologic measurement for assessing the\nprognosis of Acute Ischemic Stroke (AIS) patients, which is challenging to be\nautomatically measured on Non-Contrast CT (NCCT) scans. Recent diffusion\nprobabilistic models have shown potentials of being used for image\nsegmentation. In this paper, a novel Synchronous image-label Diffusion\nProbability Model (SDPM) is proposed for stroke lesion segmentation on NCCT\nusing Markov diffusion process. The proposed SDPM is fully based on a Latent\nVariable Model (LVM), offering a complete probabilistic elaboration. An\nadditional net-stream, parallel with a noise prediction stream, is introduced\nto obtain initial noisy label estimates for efficiently inferring the final\nlabels. By optimizing the specified variational boundaries, the trained model\ncan infer multiple label estimates for reference given the input images with\nnoises. The proposed model was assessed on three stroke lesion datasets\nincluding one public and two private datasets. Compared to several U-net and\ntransformer-based segmentation methods, our proposed SDPM model is able to\nachieve state-of-the-art performance. The code is publicly available.\n","authors":["Jianhai Zhang","Tonghua Wan","Ethan MacDonald","Bijoy Menon","Aravind Ganesh","Qiu Wu"],"pdf_url":"https://arxiv.org/pdf/2307.01740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09437v1","updated":"2023-07-18T17:11:55Z","published":"2023-07-18T17:11:55Z","title":"Unsupervised Conditional Slot Attention for Object Centric Learning","summary":" Extracting object-level representations for downstream reasoning tasks is an\nemerging area in AI. Learning object-centric representations in an unsupervised\nsetting presents multiple challenges, a key one being binding an arbitrary\nnumber of object instances to a specialized object slot. Recent object-centric\nrepresentation methods like Slot Attention utilize iterative attention to learn\ncomposable representations with dynamic inference level binding but fail to\nachieve specialized slot level binding. To address this, in this paper we\npropose Unsupervised Conditional Slot Attention using a novel Probabilistic\nSlot Dictionary (PSD). We define PSD with (i) abstract object-level property\nvectors as key and (ii) parametric Gaussian distribution as its corresponding\nvalue. 
We demonstrate the benefits of the learnt specific object-level\nconditioning distributions in multiple downstream tasks, namely object\ndiscovery, compositional scene generation, and compositional visual reasoning.\nWe show that our method provides scene composition capabilities and a\nsignificant boost in a few shot adaptability tasks of compositional visual\nreasoning, while performing similarly or better than slot attention in object\ndiscovery tasks\n","authors":["Avinash Kori","Francesco Locatello","Francesca Toni","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2307.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09420v1","updated":"2023-07-18T16:37:37Z","published":"2023-07-18T16:37:37Z","title":"Measuring Student Behavioral Engagement using Histogram of Actions","summary":" In this paper, we propose a novel technique for measuring behavioral\nengagement through students' actions recognition. The proposed approach\nrecognizes student actions then predicts the student behavioral engagement\nlevel. For student action recognition, we use human skeletons to model student\npostures and upper body movements. To learn the dynamics of student upper body,\na 3D-CNN model is used. The trained 3D-CNN model is used to recognize actions\nwithin every 2minute video segment then these actions are used to build a\nhistogram of actions which encodes the student actions and their frequencies.\nThis histogram is utilized as an input to SVM classifier to classify whether\nthe student is engaged or disengaged. To evaluate the proposed framework, we\nbuild a dataset consisting of 1414 2-minute video segments annotated with 13\nactions and 112 video segments annotated with two engagement levels.\nExperimental results indicate that student actions can be recognized with top 1\naccuracy 83.63% and the proposed framework can capture the average engagement\nof the class.\n","authors":["Ahmed Abdelkawy","Islam Alkabbany","Asem Ali","Aly Farag"],"pdf_url":"https://arxiv.org/pdf/2307.09420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07813v2","updated":"2023-07-18T16:35:36Z","published":"2023-07-15T14:34:25Z","title":"Ultra-Fast and Ultra-Low-Power In-Sensor Edge Vision for Gaze Estimation","summary":" Intelligent edge vision tasks encounter the critical challenge of ensuring\npower and latency efficiency due to the typically heavy computational load they\nimpose on edge platforms.This work leverages one of the first \"AI in sensor\"\nvision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power\nend-to-end edge vision applications. We evaluate the IMX500 and compare it to\nother edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by\nexploring gaze estimation as a case study. We propose TinyTracker, a highly\nefficient, fully quantized model for 2D gaze estimation designed to maximize\nthe performance of the edge vision systems considered in this study.\nTinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1]\nwithout significant loss in gaze estimation accuracy (maximum of 0.16 cm when\nfully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor\nresults in end-to-end latency of around 19ms. The camera takes around 17.9ms to\nread, process and transmit the pixels to the accelerator. The inference time of\nthe network is 0.86ms with an additional 0.24 ms for retrieving the results\nfrom the sensor. The overall energy consumption of the end-to-end system is 4.9\nmJ, including 0.06 mJ for inference. 
The end-to-end study shows that IMX500 is\n1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient (4.9mJ\nVS 34.2mJ)\n","authors":["Pietro Bonazzi","Thomas Ruegg","Sizhen Bian","Yawei Li","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2307.07813v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09416v1","updated":"2023-07-18T16:33:30Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06849v2","updated":"2023-07-18T16:20:43Z","published":"2023-06-12T03:47:43Z","title":"Mitigating Transformer Overconfidence via Lipschitz Regularization","summary":" Though Transformers have achieved promising results in many computer vision\ntasks, they tend to be over-confident in predictions, as the standard Dot\nProduct Self-Attention (DPSA) can barely preserve distance for the unbounded\ninput domain. In this work, we fill this gap by proposing a novel Lipschitz\nRegularized Transformer (LRFormer). Specifically, we present a new similarity\nfunction with the distance within Banach Space to ensure the Lipschitzness and\nalso regularize the term by a contractive Lipschitz Bound. The proposed method\nis analyzed with a theoretical guarantee, providing a rigorous basis for its\neffectiveness and reliability. Extensive experiments conducted on standard\nvision benchmarks demonstrate that our method outperforms the state-of-the-art\nsingle forward pass approaches in prediction, calibration, and uncertainty\nestimation.\n","authors":["Wenqian Ye","Yunsheng Ma","Xu Cao","Kun Tang"],"pdf_url":"https://arxiv.org/pdf/2306.06849v2.pdf","comment":"Accepted by UAI 2023. 
(https://proceedings.mlr.press/v216/ye23a.html)"},{"id":"http://arxiv.org/abs/2302.14581v2","updated":"2023-07-18T16:07:55Z","published":"2023-02-28T14:03:40Z","title":"HopFIR: Hop-wise GraphFormer with Intragroup Joint Refinement for 3D\n Human Pose Estimation","summary":" 2D-to-3D human pose lifting is fundamental for 3D human pose estimation\n(HPE). Graph Convolutional Network (GCN) has been proven inherently suitable to\nmodel the human skeletal topology. However, current GCN-based 3D HPE methods\nupdate the node features by aggregating their neighbors' information without\nconsidering the interaction of joints in different motion patterns. Although\nsome studies import limb information to learn the movement patterns, the latent\nsynergies among joints, such as maintaining balance in the motion are seldom\ninvestigated. We propose a hop-wise GraphFormer with intragroup joint\nrefinement (HopFIR) to tackle the 3D HPE problem. The HopFIR mainly consists of\na novel Hop-wise GraphFormer(HGF) module and an Intragroup Joint\nRefinement(IJR) module which leverages the prior limb information for\nperipheral joints refinement. The HGF module groups the joints by $k$-hop\nneighbors and utilizes a hop-wise transformer-like attention mechanism among\nthese groups to discover latent joint synergy. Extensive experimental results\nshow that HopFIR outperforms the SOTA methods with a large margin (on the\nHuman3.6M dataset, the mean per joint position error (MPJPE) is 32.67mm).\nFurthermore, it is also demonstrated that previous SOTA GCN-based methods can\nbenefit from the proposed hop-wise attention mechanism efficiently with\nsignificant performance promotion, such as SemGCN and MGCN are improved by 8.9%\nand 4.5%, respectively.\n","authors":["Kai Zhai","Qiang Nie","Bo Ouyang","Xiang Li","ShanLin Yang"],"pdf_url":"https://arxiv.org/pdf/2302.14581v2.pdf","comment":"we will re-upload the newest version soon"},{"id":"http://arxiv.org/abs/2307.09368v1","updated":"2023-07-18T15:50:04Z","published":"2023-07-18T15:50:04Z","title":"Plug the Leaks: Advancing Audio-driven Talking Face Generation by\n Preventing Unintended Information Flow","summary":" Audio-driven talking face generation is the task of creating a\nlip-synchronized, realistic face video from given audio and reference frames.\nThis involves two major challenges: overall visual quality of generated images\non the one hand, and audio-visual synchronization of the mouth part on the\nother hand. In this paper, we start by identifying several problematic aspects\nof synchronization methods in recent audio-driven talking face generation\napproaches. Specifically, this involves unintended flow of lip and pose\ninformation from the reference to the generated image, as well as instabilities\nduring model training. Subsequently, we propose various techniques for\nobviating these issues: First, a silent-lip reference image generator prevents\nleaking of lips from the reference to the generated image. Second, an adaptive\ntriplet loss handles the pose leaking problem. Finally, we propose a stabilized\nformulation of synchronization loss, circumventing aforementioned training\ninstabilities while additionally further alleviating the lip leaking issue.\nCombining the individual improvements, we present state-of-the art performance\non LRS2 and LRW in both synchronization and visual quality. 
We further validate\nour design in various ablation experiments, confirming the individual\ncontributions as well as their complementary effects.\n","authors":["Dogucan Yaman","Fevziye Irem Eyiokur","Leonard Bärmann","Hazim Kemal Ekenel","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2307.09368v1.pdf","comment":"Submitted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09365v1","updated":"2023-07-18T15:48:53Z","published":"2023-07-18T15:48:53Z","title":"An Evaluation of Zero-Cost Proxies -- from Neural Architecture\n Performance to Model Robustness","summary":" Zero-cost proxies are nowadays frequently studied and used to search for\nneural architectures. They show an impressive ability to predict the\nperformance of architectures by making use of their untrained weights. These\ntechniques allow for immense search speed-ups. So far the joint search for\nwell-performing and robust architectures has received much less attention in\nthe field of NAS. Therefore, the main focus of zero-cost proxies is the clean\naccuracy of architectures, whereas the model robustness should play an evenly\nimportant part. In this paper, we analyze the ability of common zero-cost\nproxies to serve as performance predictors for robustness in the popular\nNAS-Bench-201 search space. We are interested in the single prediction task for\nrobustness and the joint multi-objective of clean and robust accuracy. We\nfurther analyze the feature importance of the proxies and show that predicting\nthe robustness makes the prediction task from existing zero-cost proxies more\nchallenging. As a result, the joint consideration of several proxies becomes\nnecessary to predict a model's robustness while the clean accuracy can be\nregressed from a single such feature.\n","authors":["Jovita Lukasik","Michael Moeller","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.09365v1.pdf","comment":"Accepted at DAGM GCPR 2023"},{"id":"http://arxiv.org/abs/2307.09362v1","updated":"2023-07-18T15:46:21Z","published":"2023-07-18T15:46:21Z","title":"Disentangle then Parse:Night-time Semantic Segmentation with\n Illumination Disentanglement","summary":" Most prior semantic segmentation methods have been developed for day-time\nscenes, while typically underperforming in night-time scenes due to\ninsufficient and complicated lighting conditions. In this work, we tackle this\nchallenge by proposing a novel night-time semantic segmentation paradigm, i.e.,\ndisentangle then parse (DTP). DTP explicitly disentangles night-time images\ninto light-invariant reflectance and light-specific illumination components and\nthen recognizes semantics based on their adaptive fusion. Concretely, the\nproposed DTP comprises two key components: 1) Instead of processing\nlighting-entangled features as in prior works, our Semantic-Oriented\nDisentanglement (SOD) framework enables the extraction of reflectance component\nwithout being impeded by lighting, allowing the network to consistently\nrecognize the semantics under cover of varying and complicated lighting\nconditions. 2) Based on the observation that the illumination component can\nserve as a cue for some semantically confused regions, we further introduce an\nIllumination-Aware Parser (IAParser) to explicitly learn the correlation\nbetween semantics and lighting, and aggregate the illumination features to\nyield more precise predictions. Extensive experiments on the night-time\nsegmentation task with various settings demonstrate that DTP significantly\noutperforms state-of-the-art methods. 
Furthermore, with negligible additional\nparameters, DTP can be directly used to benefit existing day-time methods for\nnight-time segmentation.\n","authors":["Zhixiang Wei","Lin Chen","Tao Tu","Huaian Chen","Pengyang Ling","Yi Jin"],"pdf_url":"https://arxiv.org/pdf/2307.09362v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.09361v1","updated":"2023-07-18T15:46:20Z","published":"2023-07-18T15:46:20Z","title":"MOCA: Self-supervised Representation Learning by Predicting Masked\n Online Codebook Assignments","summary":" Self-supervised learning can be used for mitigating the greedy needs of\nVision Transformer networks for very large fully-annotated datasets. Different\nclasses of self-supervised learning offer representations with either good\ncontextual reasoning properties, e.g., using masked image modeling strategies,\nor invariance to image perturbations, e.g., with contrastive methods. In this\nwork, we propose a single-stage and standalone method, MOCA, which unifies both\ndesired properties using novel mask-and-predict objectives defined with\nhigh-level features (instead of pixel-level details). Moreover, we show how to\neffectively employ both learning paradigms in a synergistic and\ncomputation-efficient way. Doing so, we achieve new state-of-the-art results on\nlow-shot settings and strong experimental results in various evaluation\nprotocols with a training that is at least 3 times faster than prior methods.\n","authors":["Spyros Gidaris","Andrei Bursuc","Oriane Simeoni","Antonin Vobecky","Nikos Komodakis","Matthieu Cord","Patrick Pérez"],"pdf_url":"https://arxiv.org/pdf/2307.09361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09356v1","updated":"2023-07-18T15:43:35Z","published":"2023-07-18T15:43:35Z","title":"OnlineRefer: A Simple Online Baseline for Referring Video Object\n Segmentation","summary":" Referring video object segmentation (RVOS) aims at segmenting an object in a\nvideo following human instruction. Current state-of-the-art methods fall into\nan offline pattern, in which each clip independently interacts with text\nembedding for cross-modal understanding. They usually present that the offline\npattern is necessary for RVOS, yet model limited temporal association within\neach clip. In this work, we break up the previous offline belief and propose a\nsimple yet effective online model using explicit query propagation, named\nOnlineRefer. Specifically, our approach leverages target cues that gather\nsemantic information and position prior to improve the accuracy and ease of\nreferring predictions for the current frame. Furthermore, we generalize our\nonline model into a semi-online framework to be compatible with video-based\nbackbones. To show the effectiveness of our method, we evaluate it on four\nbenchmarks, \\ie, Refer-Youtube-VOS, Refer-DAVIS17, A2D-Sentences, and\nJHMDB-Sentences. Without bells and whistles, our OnlineRefer with a Swin-L\nbackbone achieves 63.5 J&F and 64.8 J&F on Refer-Youtube-VOS and Refer-DAVIS17,\noutperforming all other offline methods.\n","authors":["Dongming Wu","Tiancai Wang","Yuang Zhang","Xiangyu Zhang","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2307.09356v1.pdf","comment":"Accepted by ICCV2023. 
The code is at\n https://github.com/wudongming97/OnlineRefer"},{"id":"http://arxiv.org/abs/2307.09351v1","updated":"2023-07-18T15:37:35Z","published":"2023-07-18T15:37:35Z","title":"SphereNet: Learning a Noise-Robust and General Descriptor for Point\n Cloud Registration","summary":" Point cloud registration is to estimate a transformation to align point\nclouds collected in different perspectives. In learning-based point cloud\nregistration, a robust descriptor is vital for high-accuracy registration.\nHowever, most methods are susceptible to noise and have poor generalization\nability on unseen datasets. Motivated by this, we introduce SphereNet to learn\na noise-robust and unseen-general descriptor for point cloud registration. In\nour method, first, the spheroid generator builds a geometric domain based on\nspherical voxelization to encode initial features. Then, the spherical\ninterpolation of the sphere is introduced to realize robustness against noise.\nFinally, a new spherical convolutional neural network with spherical integrity\npadding completes the extraction of descriptors, which reduces the loss of\nfeatures and fully captures the geometric features. To evaluate our methods, a\nnew benchmark 3DMatch-noise with strong noise is introduced. Extensive\nexperiments are carried out on both indoor and outdoor datasets. Under\nhigh-intensity noise, SphereNet increases the feature matching recall by more\nthan 25 percentage points on 3DMatch-noise. In addition, it sets a new\nstate-of-the-art performance for the 3DMatch and 3DLoMatch benchmarks with\n93.5\\% and 75.6\\% registration recall and also has the best generalization\nability on unseen datasets.\n","authors":["Guiyu Zhao","Zhentao Guo","Xin Wang","Hongbin Ma"],"pdf_url":"https://arxiv.org/pdf/2307.09351v1.pdf","comment":"15 pages, under review for IEEE Transactions on Circuits and Systems\n for Video Technology"},{"id":"http://arxiv.org/abs/2307.09330v1","updated":"2023-07-18T15:13:15Z","published":"2023-07-18T15:13:15Z","title":"Visual Validation versus Visual Estimation: A Study on the Average Value\n in Scatterplots","summary":" We investigate the ability of individuals to visually validate statistical\nmodels in terms of their fit to the data. While visual model estimation has\nbeen studied extensively, visual model validation remains under-investigated.\nIt is unknown how well people are able to visually validate models, and how\ntheir performance compares to visual and computational estimation. As a\nstarting point, we conducted a study across two populations (crowdsourced and\nvolunteers). Participants had to both visually estimate (i.e, draw) and\nvisually validate (i.e., accept or reject) the frequently studied model of\naverages. Across both populations, the level of accuracy of the models that\nwere considered valid was lower than the accuracy of the estimated models. We\nfind that participants' validation and estimation were unbiased. Moreover,\ntheir natural critical point between accepting and rejecting a given mean value\nis close to the boundary of its 95% confidence interval, indicating that the\nvisually perceived confidence interval corresponds to a common statistical\nstandard. 
Our work contributes to the understanding of visual model validation\nand opens new research opportunities.\n","authors":["Daniel Braun","Ashley Suh","Remco Chang","Michael Gleicher","Tatiana von Landesberger"],"pdf_url":"https://arxiv.org/pdf/2307.09330v1.pdf","comment":"Preprint and Author Version of a Short Paper, accepted to the 2023\n IEEE Visualization Conference (VIS)"},{"id":"http://arxiv.org/abs/2307.09329v1","updated":"2023-07-18T15:11:40Z","published":"2023-07-18T15:11:40Z","title":"Towards a performance analysis on pre-trained Visual Question Answering\n models for autonomous driving","summary":" This short paper presents a preliminary analysis of three popular Visual\nQuestion Answering (VQA) models, namely ViLBERT, ViLT, and LXMERT, in the\ncontext of answering questions relating to driving scenarios. The performance\nof these models is evaluated by comparing the similarity of responses to\nreference answers provided by computer vision experts. Model selection is\npredicated on the analysis of transformer utilization in multimodal\narchitectures. The results indicate that models incorporating cross-modal\nattention and late fusion techniques exhibit promising potential for generating\nimproved answers within a driving perspective. This initial analysis serves as\na launchpad for a forthcoming comprehensive comparative study involving nine\nVQA models and sets the scene for further investigations into the effectiveness\nof VQA model queries in self-driving scenarios. Supplementary material is\navailable at\nhttps://github.com/KaavyaRekanar/Towards-a-performance-analysis-on-pre-trained-VQA-models-for-autonomous-driving.\n","authors":["Kaavya Rekanar","Ciarán Eising","Ganesh Sistu","Martin Hayes"],"pdf_url":"https://arxiv.org/pdf/2307.09329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09323v1","updated":"2023-07-18T15:07:39Z","published":"2023-07-18T15:07:39Z","title":"Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking\n Portrait Synthesis","summary":" This paper presents ER-NeRF, a novel conditional Neural Radiance Fields\n(NeRF) based architecture for talking portrait synthesis that can concurrently\nachieve fast convergence, real-time rendering, and state-of-the-art performance\nwith small model size. Our idea is to explicitly exploit the unequal\ncontribution of spatial regions to guide talking portrait modeling.\nSpecifically, to improve the accuracy of dynamic head reconstruction, a compact\nand expressive NeRF-based Tri-Plane Hash Representation is introduced by\npruning empty spatial regions with three planar hash encoders. For speech\naudio, we propose a Region Attention Module to generate region-aware condition\nfeature via an attention mechanism. Different from existing methods that\nutilize an MLP-based encoder to learn the cross-modal relation implicitly, the\nattention mechanism builds an explicit connection between audio features and\nspatial regions to capture the priors of local motions. Moreover, a direct and\nfast Adaptive Pose Encoding is introduced to optimize the head-torso separation\nproblem by mapping the complex transformation of the head pose into spatial\ncoordinates. 
Extensive experiments demonstrate that our method renders better\nhigh-fidelity and audio-lips synchronized talking portrait videos, with\nrealistic details and high efficiency compared to previous methods.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2307.09323v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09316v1","updated":"2023-07-18T14:59:19Z","published":"2023-07-18T14:59:19Z","title":"MarS3D: A Plug-and-Play Motion-Aware Model for Semantic Segmentation on\n Multi-Scan 3D Point Clouds","summary":" 3D semantic segmentation on multi-scan large-scale point clouds plays an\nimportant role in autonomous systems. Unlike the single-scan-based semantic\nsegmentation task, this task requires distinguishing the motion states of\npoints in addition to their semantic categories. However, methods designed for\nsingle-scan-based segmentation tasks perform poorly on the multi-scan task due\nto the lacking of an effective way to integrate temporal information. We\npropose MarS3D, a plug-and-play motion-aware module for semantic segmentation\non multi-scan 3D point clouds. This module can be flexibly combined with\nsingle-scan models to allow them to have multi-scan perception abilities. The\nmodel encompasses two key designs: the Cross-Frame Feature Embedding module for\nenriching representation learning and the Motion-Aware Feature Learning module\nfor enhancing motion awareness. Extensive experiments show that MarS3D can\nimprove the performance of the baseline model by a large margin. The code is\navailable at https://github.com/CVMI-Lab/MarS3D.\n","authors":["Jiahui Liu","Chirui Chang","Jianhui Liu","Xiaoyang Wu","Lan Ma","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2307.09316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01580v3","updated":"2023-07-18T14:57:36Z","published":"2023-04-04T07:17:31Z","title":"Untargeted Near-collision Attacks in Biometric Recognition","summary":" A biometric recognition system can operate in two distinct modes,\nidentification or verification. In the first mode, the system recognizes an\nindividual by searching the enrolled templates of all the users for a match. In\nthe second mode, the system validates a user's identity claim by comparing the\nfresh provided template with the enrolled template. The biometric\ntransformation schemes usually produce binary templates that are better handled\nby cryptographic schemes, and the comparison is based on a distance that leaks\ninformation about the similarities between two biometric templates. Both the\nexperimentally determined false match rate and false non-match rate through\nrecognition threshold adjustment define the recognition accuracy, and hence the\nsecurity of the system. To the best of our knowledge, few works provide a\nformal treatment of the security under minimum leakage of information, i.e.,\nthe binary outcome of a comparison with a threshold. In this paper, we rely on\nprobabilistic modelling to quantify the security strength of binary templates.\nWe investigate the influence of template size, database size and threshold on\nthe probability of having a near-collision. We highlight several untargeted\nattacks on biometric systems considering naive and adaptive adversaries.\nInterestingly, these attacks can be launched both online and offline and, both\nin the identification mode and in the verification mode. 
We discuss the choice\nof parameters through the generic presented attacks.\n","authors":["Axel Durbet","Paul-Marie Grollemund","Kevin Thiry-Atighehchi"],"pdf_url":"https://arxiv.org/pdf/2304.01580v3.pdf","comment":"Addition of results and correction of typos"},{"id":"http://arxiv.org/abs/2307.09306v1","updated":"2023-07-18T14:52:08Z","published":"2023-07-18T14:52:08Z","title":"EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory\n Forecasting","summary":" Capturing high-dimensional social interactions and feasible futures is\nessential for predicting trajectories. To address this complex nature, several\nattempts have been devoted to reducing the dimensionality of the output\nvariables via parametric curve fitting such as the B\\'ezier curve and B-spline\nfunction. However, these functions, which originate in computer graphics\nfields, are not suitable to account for socially acceptable human dynamics. In\nthis paper, we present EigenTrajectory ($\\mathbb{ET}$), a trajectory prediction\napproach that uses a novel trajectory descriptor to form a compact space, known\nhere as $\\mathbb{ET}$ space, in place of Euclidean space, for representing\npedestrian movements. We first reduce the complexity of the trajectory\ndescriptor via a low-rank approximation. We transform the pedestrians' history\npaths into our $\\mathbb{ET}$ space represented by spatio-temporal principle\ncomponents, and feed them into off-the-shelf trajectory forecasting models. The\ninputs and outputs of the models as well as social interactions are all\ngathered and aggregated in the corresponding $\\mathbb{ET}$ space. Lastly, we\npropose a trajectory anchor-based refinement method to cover all possible\nfutures in the proposed $\\mathbb{ET}$ space. Extensive experiments demonstrate\nthat our EigenTrajectory predictor can significantly improve both the\nprediction accuracy and reliability of existing trajectory forecasting models\non public benchmarks, indicating that the proposed descriptor is suited to\nrepresent pedestrian behaviors. Code is publicly available at\nhttps://github.com/inhwanbae/EigenTrajectory .\n","authors":["Inhwan Bae","Jean Oh","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2307.09306v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09302v1","updated":"2023-07-18T14:40:48Z","published":"2023-07-18T14:40:48Z","title":"Conformal prediction under ambiguous ground truth","summary":" In safety-critical classification tasks, conformal prediction allows to\nperform rigorous uncertainty quantification by providing confidence sets\nincluding the true class with a user-specified probability. This generally\nassumes the availability of a held-out calibration set with access to ground\ntruth labels. Unfortunately, in many domains, such labels are difficult to\nobtain and usually approximated by aggregating expert opinions. In fact, this\nholds true for almost all datasets, including well-known ones such as CIFAR and\nImageNet. Applying conformal prediction using such labels underestimates\nuncertainty. Indeed, when expert opinions are not resolvable, there is inherent\nambiguity present in the labels. That is, we do not have ``crisp'', definitive\nground truth labels and this uncertainty should be taken into account during\ncalibration. In this paper, we develop a conformal prediction framework for\nsuch ambiguous ground truth settings which relies on an approximation of the\nunderlying posterior distribution of labels given inputs. 
We demonstrate our\nmethodology on synthetic and real datasets, including a case study of skin\ncondition classification in dermatology.\n","authors":["David Stutz","Abhijit Guha Roy","Tatiana Matejovicova","Patricia Strachan","Ali Taylan Cemgil","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2307.09302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09283v1","updated":"2023-07-18T14:24:33Z","published":"2023-07-18T14:24:33Z","title":"RepViT: Revisiting Mobile CNN From ViT Perspective","summary":" Recently, lightweight Vision Transformers (ViTs) demonstrate superior\nperformance and lower latency compared with lightweight Convolutional Neural\nNetworks (CNNs) on resource-constrained mobile devices. This improvement is\nusually attributed to the multi-head self-attention module, which enables the\nmodel to learn global representations. However, the architectural disparities\nbetween lightweight ViTs and lightweight CNNs have not been adequately\nexamined. In this study, we revisit the efficient design of lightweight CNNs\nand emphasize their potential for mobile devices. We incrementally enhance the\nmobile-friendliness of a standard lightweight CNN, specifically MobileNetV3, by\nintegrating the efficient architectural choices of lightweight ViTs. This ends\nup with a new family of pure lightweight CNNs, namely RepViT. Extensive\nexperiments show that RepViT outperforms existing state-of-the-art lightweight\nViTs and exhibits favorable latency in various vision tasks. On ImageNet,\nRepViT achieves over 80\\% top-1 accuracy with nearly 1ms latency on an iPhone\n12, which is the first time for a lightweight model, to the best of our\nknowledge. Our largest model, RepViT-M3, obtains 81.4\\% accuracy with only\n1.3ms latency. The code and trained models are available at\n\\url{https://github.com/jameslahm/RepViT}.\n","authors":["Ao Wang","Hui Chen","Zijia Lin","Hengjun Pu","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2307.09283v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.09279v1","updated":"2023-07-18T14:19:28Z","published":"2023-07-18T14:19:28Z","title":"Regression-free Blind Image Quality Assessment","summary":" Regression-based blind image quality assessment (IQA) models are susceptible\nto biased training samples, leading to a biased estimation of model parameters.\nTo mitigate this issue, we propose a regression-free framework for image\nquality evaluation, which is founded upon retrieving similar instances by\nincorporating semantic and distortion features. The motivation behind this\napproach is rooted in the observation that the human visual system (HVS) has\nanalogous visual responses to semantically similar image contents degraded by\nthe same distortion. The proposed framework comprises two classification-based\nmodules: semantic-based classification (SC) module and distortion-based\nclassification (DC) module. Given a test image and an IQA database, the SC\nmodule retrieves multiple pristine images based on semantic similarity. The DC\nmodule then retrieves instances based on distortion similarity from the\ndistorted images that correspond to each retrieved pristine image. Finally, the\npredicted quality score is derived by aggregating the subjective quality scores\nof multiple retrieved instances. 
Experimental results on four benchmark\ndatabases validate that the proposed model can remarkably outperform the\nstate-of-the-art regression-based models.\n","authors":["Xiaoqi Wang","Jian Xiong","Hao Gao","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2307.09279v1.pdf","comment":"11 pages, 7 figures, 50 conferences"},{"id":"http://arxiv.org/abs/2307.08535v2","updated":"2023-07-18T14:11:18Z","published":"2023-07-17T14:52:52Z","title":"Multi-class point cloud completion networks for 3D cardiac anatomy\n reconstruction from cine magnetic resonance images","summary":" Cine magnetic resonance imaging (MRI) is the current gold standard for the\nassessment of cardiac anatomy and function. However, it typically only acquires\na set of two-dimensional (2D) slices of the underlying three-dimensional (3D)\nanatomy of the heart, thus limiting the understanding and analysis of both\nhealthy and pathological cardiac morphology and physiology. In this paper, we\npropose a novel fully automatic surface reconstruction pipeline capable of\nreconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI\nacquisitions. Its key component is a multi-class point cloud completion network\n(PCCN) capable of correcting both the sparsity and misalignment issues of the\n3D reconstruction task in a unified model. We first evaluate the PCCN on a\nlarge synthetic dataset of biventricular anatomies and observe Chamfer\ndistances between reconstructed and gold standard anatomies below or similar to\nthe underlying image resolution for multiple levels of slice misalignment.\nFurthermore, we find a reduction in reconstruction error compared to a\nbenchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean\nsurface distance, respectively. We then apply the PCCN as part of our automated\nreconstruction pipeline to 1000 subjects from the UK Biobank study in a\ncross-domain transfer setting and demonstrate its ability to reconstruct\naccurate and topologically plausible biventricular heart meshes with clinical\nmetrics comparable to the previous literature. Finally, we investigate the\nrobustness of our proposed approach and observe its capacity to successfully\nhandle multiple common outlier conditions.\n","authors":["Marcel Beetz","Abhirup Banerjee","Julius Ossenberg-Engels","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.08535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09267v1","updated":"2023-07-18T13:49:49Z","published":"2023-07-18T13:49:49Z","title":"Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly\n Supervised 3D Visual Grounding","summary":" 3D visual grounding involves finding a target object in a 3D scene that\ncorresponds to a given sentence query. Although many approaches have been\nproposed and achieved impressive performance, they all require dense\nobject-sentence pair annotations in 3D point clouds, which are both\ntime-consuming and expensive. To address the problem that fine-grained\nannotated data is difficult to obtain, we propose to leverage weakly supervised\nannotations to learn the 3D visual grounding model, i.e., only coarse\nscene-sentence correspondences are used to learn object-sentence links. To\naccomplish this, we design a novel semantic matching model that analyzes the\nsemantic similarity between object proposals and sentences in a coarse-to-fine\nmanner. Specifically, we first extract object proposals and coarsely select the\ntop-K candidates based on feature and class similarity matrices. 
Next, we\nreconstruct the masked keywords of the sentence using each candidate one by\none, and the reconstructed accuracy finely reflects the semantic similarity of\neach candidate to the query. Additionally, we distill the coarse-to-fine\nsemantic matching knowledge into a typical two-stage 3D visual grounding model,\nwhich reduces inference costs and improves performance by taking full advantage\nof the well-studied structure of the existing architectures. We conduct\nextensive experiments on ScanRefer, Nr3D, and Sr3D, which demonstrate the\neffectiveness of our proposed method.\n","authors":["Zehan Wang","Haifeng Huang","Yang Zhao","Linjun Li","Xize Cheng","Yichen Zhu","Aoxiong Yin","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.09267v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09264v1","updated":"2023-07-18T13:49:00Z","published":"2023-07-18T13:49:00Z","title":"Knowledge Distillation for Object Detection: from generic to remote\n sensing datasets","summary":" Knowledge distillation, a well-known model compression technique, is an\nactive research area in both computer vision and remote sensing communities. In\nthis paper, we evaluate in a remote sensing context various off-the-shelf\nobject detection knowledge distillation methods which have been originally\ndeveloped on generic computer vision datasets such as Pascal VOC. In\nparticular, methods covering both logit mimicking and feature imitation\napproaches are applied for vehicle detection using the well-known benchmarks\nsuch as xView and VEDAI datasets. Extensive experiments are performed to\ncompare the relative performance and interrelationships of the methods.\nExperimental results show high variations and confirm the importance of result\naggregation and cross validation on remote sensing datasets.\n","authors":["Hoàng-Ân Lê","Minh-Tan Pham"],"pdf_url":"https://arxiv.org/pdf/2307.09264v1.pdf","comment":"Accepted for publishing at IGARSS 2023"},{"id":"http://arxiv.org/abs/2307.09262v1","updated":"2023-07-18T13:47:57Z","published":"2023-07-18T13:47:57Z","title":"Neuromorphic spintronics simulated using an unconventional data-driven\n Thiele equation approach","summary":" In this study, we developed a quantitative description of the dynamics of\nspin-torque vortex nano-oscillators (STVOs) through an unconventional model\nbased on the combination of the Thiele equation approach (TEA) and data from\nmicromagnetic simulations (MMS). Solving the STVO dynamics with our analytical\nmodel allows to accelerate the simulations by 9 orders of magnitude compared to\nMMS while reaching the same level of accuracy. Here, we showcase our model by\nsimulating a STVO-based neural network for solving a classification task. We\nassess its performance with respect to the input signal current intensity and\nthe level of noise that might affect such a system. 
Our approach is promising\nfor accelerating the design of STVO-based neuromorphic computing devices while\ndecreasing drastically its computational cost.\n","authors":["Anatole Moureaux","Simon de Wergifosse","Chloé Chopin","Flavio Abreu Araujo"],"pdf_url":"https://arxiv.org/pdf/2307.09262v1.pdf","comment":"Presented in ISCS2023"},{"id":"http://arxiv.org/abs/2307.09259v1","updated":"2023-07-18T13:43:53Z","published":"2023-07-18T13:43:53Z","title":"Adaptive Topological Feature via Persistent Homology: Filtration\n Learning for Point Clouds","summary":" Machine learning for point clouds has been attracting much attention, with\nmany applications in various fields, such as shape recognition and material\nscience. To enhance the accuracy of such machine learning methods, it is known\nto be effective to incorporate global topological features, which are typically\nextracted by persistent homology. In the calculation of persistent homology for\na point cloud, we need to choose a filtration for the point clouds, an\nincreasing sequence of spaces. Because the performance of machine learning\nmethods combined with persistent homology is highly affected by the choice of a\nfiltration, we need to tune it depending on data and tasks. In this paper, we\npropose a framework that learns a filtration adaptively with the use of neural\nnetworks. In order to make the resulting persistent homology\nisometry-invariant, we develop a neural network architecture with such\ninvariance. Additionally, we theoretically show a finite-dimensional\napproximation result that justifies our architecture. Experimental results\ndemonstrated the efficacy of our framework in several classification tasks.\n","authors":["Naoki Nishikawa","Yuichi Ike","Kenji Yamanishi"],"pdf_url":"https://arxiv.org/pdf/2307.09259v1.pdf","comment":"17 pages with 4 figures"},{"id":"http://arxiv.org/abs/2303.05118v2","updated":"2023-07-18T13:29:53Z","published":"2023-03-09T08:57:01Z","title":"SLCA: Slow Learner with Classifier Alignment for Continual Learning on a\n Pre-trained Model","summary":" The goal of continual learning is to improve the performance of recognition\nmodels in learning sequentially arrived data. Although most existing works are\nestablished on the premise of learning from scratch, growing efforts have been\ndevoted to incorporating the benefits of pre-training. However, how to\nadaptively exploit the pre-trained knowledge for each incremental task while\nmaintaining its generalizability remains an open question. In this work, we\npresent an extensive analysis for continual learning on a pre-trained model\n(CLPM), and attribute the key challenge to a progressive overfitting problem.\nObserving that selectively reducing the learning rate can almost resolve this\nissue in the representation layer, we propose a simple but extremely effective\napproach named Slow Learner with Classifier Alignment (SLCA), which further\nimproves the classification layer by modeling the class-wise distributions and\naligning the classification layers in a post-hoc fashion. Across a variety of\nscenarios, our proposal provides substantial improvements for CLPM (e.g., up to\n49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split\nCUB-200 and Split Cars-196, respectively), and thus outperforms\nstate-of-the-art approaches by a large margin. 
Based on such a strong baseline,\ncritical factors and promising directions are analyzed in-depth to facilitate\nsubsequent research.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2303.05118v2.pdf","comment":"11 pages, 8 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09239v1","updated":"2023-07-18T13:19:39Z","published":"2023-07-18T13:19:39Z","title":"Generation of High Spatial Resolution Terrestrial Surface from Low\n Spatial Resolution Elevation Contour Maps via Hierarchical Computation of\n Median Elevation Regions","summary":" We proposed a simple yet effective morphological approach to convert a sparse\nDigital Elevation Model (DEM) to a dense Digital Elevation Model. The\nconversion is similar to that of the generation of high-resolution DEM from its\nlow-resolution DEM. The approach involves the generation of median contours to\nachieve the purpose. It is a sequential step of the I) decomposition of the\nexisting sparse Contour map into the maximum possible Threshold Elevation\nRegion (TERs). II) Computing all possible non-negative and non-weighted Median\nElevation Region (MER) hierarchically between the successive TER decomposed\nfrom a sparse contour map. III) Computing the gradient of all TER, and MER\ncomputed from previous steps would yield the predicted intermediate elevation\ncontour at a higher spatial resolution. We presented this approach initially\nwith some self-made synthetic data to show how the contour prediction works and\nthen experimented with the available contour map of Washington, NH to justify\nits usefulness. This approach considers the geometric information of existing\ncontours and interpolates the elevation contour at a new spatial region of a\ntopographic surface until no elevation contours are necessary to generate. This\nnovel approach is also very low-cost and robust as it uses elevation contours.\n","authors":["Geetika Barman","B. S. Daya Sagar"],"pdf_url":"https://arxiv.org/pdf/2307.09239v1.pdf","comment":"11 pages, 6 figures,1 table, 1 algorithm"},{"id":"http://arxiv.org/abs/2307.09238v1","updated":"2023-07-18T13:18:52Z","published":"2023-07-18T13:18:52Z","title":"Fusing Hand and Body Skeletons for Human Action Recognition in Assembly","summary":" As collaborative robots (cobots) continue to gain popularity in industrial\nmanufacturing, effective human-robot collaboration becomes crucial. Cobots\nshould be able to recognize human actions to assist with assembly tasks and act\nautonomously. To achieve this, skeleton-based approaches are often used due to\ntheir ability to generalize across various people and environments. Although\nbody skeleton approaches are widely used for action recognition, they may not\nbe accurate enough for assembly actions where the worker's fingers and hands\nplay a significant role. To address this limitation, we propose a method in\nwhich less detailed body skeletons are combined with highly detailed hand\nskeletons. We investigate CNNs and transformers, the latter of which are\nparticularly adept at extracting and combining important information from both\nskeleton types using attention. 
This paper demonstrates the effectiveness of\nour proposed approach in enhancing action recognition in assembly scenarios.\n","authors":["Dustin Aganian","Mona Köhler","Benedict Stephan","Markus Eisenbach","Horst-Michael Gross"],"pdf_url":"https://arxiv.org/pdf/2307.09238v1.pdf","comment":"International Conference on Artificial Neural Networks (ICANN) 2023"},{"id":"http://arxiv.org/abs/2305.07848v3","updated":"2023-07-18T13:13:36Z","published":"2023-05-13T06:27:33Z","title":"Meta-Polyp: a baseline for efficient Polyp segmentation","summary":" In recent years, polyp segmentation has gained significant importance, and\nmany methods have been developed using CNN, Vision Transformer, and Transformer\ntechniques to achieve competitive results. However, these methods often face\ndifficulties when dealing with out-of-distribution datasets, missing\nboundaries, and small polyps. In 2022, Meta-Former was introduced as a new\nbaseline for vision, which not only improved the performance of multi-task\ncomputer vision but also addressed the limitations of the Vision Transformer\nand CNN family backbones. To further enhance segmentation, we propose a fusion\nof Meta-Former with UNet, along with the introduction of a Multi-scale\nUpsampling block with a level-up combination in the decoder stage to enhance\nthe texture, also we propose the Convformer block base on the idea of the\nMeta-former to enhance the crucial information of the local feature. These\nblocks enable the combination of global information, such as the overall shape\nof the polyp, with local information and boundary information, which is crucial\nfor the decision of the medical segmentation. Our proposed approach achieved\ncompetitive performance and obtained the top result in the State of the Art on\nthe CVC-300 dataset, Kvasir, and CVC-ColonDB dataset. Apart from Kvasir-SEG,\nothers are out-of-distribution datasets. The implementation can be found at:\nhttps://github.com/huyquoctrinh/MetaPolyp-CBMS2023.\n","authors":["Quoc-Huy Trinh"],"pdf_url":"https://arxiv.org/pdf/2305.07848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09233v1","updated":"2023-07-18T13:10:11Z","published":"2023-07-18T13:10:11Z","title":"Augmenting CLIP with Improved Visio-Linguistic Reasoning","summary":" Image-text contrastive models such as CLIP are useful for a variety of\ndownstream applications including zero-shot classification, image-text\nretrieval and transfer learning. However, these contrastively trained\nvision-language models often fail on compositional visio-linguistic tasks such\nas Winoground with performance equivalent to random chance. In our paper, we\naddress this issue and propose a sample-efficient light-weight method called\nSDS-CLIP to improve the compositional visio-linguistic reasoning capabilities\nof CLIP. The core idea of our method is to use differentiable image\nparameterizations to fine-tune CLIP with a distillation objective from large\ntext-to-image generative models such as Stable-Diffusion which are relatively\ngood at visio-linguistic reasoning tasks. On the challenging Winoground\ncompositional reasoning benchmark, our method improves the absolute\nvisio-linguistic performance of different CLIP models by up to 7%, while on the\nARO dataset, our method improves the visio-linguistic performance by upto 3%.\nAs a byproduct of inducing visio-linguistic reasoning into CLIP, we also find\nthat the zero-shot performance improves marginally on a variety of downstream\ndatasets. 
Our method reinforces that carefully designed distillation objectives\nfrom generative models can be leveraged to extend existing contrastive\nimage-text models with improved visio-linguistic reasoning capabilities.\n","authors":["Samyadeep Basu","Maziar Sanjabi","Daniela Massiceti","Shell Xu Hu","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.09233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09220v1","updated":"2023-07-18T12:52:49Z","published":"2023-07-18T12:52:49Z","title":"A Survey on Open-Vocabulary Detection and Segmentation: Past, Present,\n and Future","summary":" As the most fundamental tasks of computer vision, object detection and\nsegmentation have made tremendous progress in the deep learning era. Due to the\nexpensive manual labeling, the annotated categories in existing datasets are\noften small-scale and pre-defined, i.e., state-of-the-art detectors and\nsegmentors fail to generalize beyond the closed-vocabulary. To resolve this\nlimitation, the last few years have witnessed increasing attention toward\nOpen-Vocabulary Detection (OVD) and Segmentation (OVS). In this survey, we\nprovide a comprehensive review on the past and recent development of OVD and\nOVS. To this end, we develop a taxonomy according to the type of task and\nmethodology. We find that the permission and usage of weak supervision signals\ncan well discriminate different methodologies, including: visual-semantic space\nmapping, novel visual feature synthesis, region-aware training,\npseudo-labeling, knowledge distillation-based, and transfer learning-based. The\nproposed taxonomy is universal across different tasks, covering object\ndetection, semantic/instance/panoptic segmentation, 3D scene and video\nunderstanding. In each category, its main principles, key challenges,\ndevelopment routes, strengths, and weaknesses are thoroughly discussed. In\naddition, we benchmark each task along with the vital components of each\nmethod. Finally, several promising directions are provided to stimulate future\nresearch.\n","authors":["Chaoyang Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09184v1","updated":"2023-07-18T12:18:21Z","published":"2023-07-18T12:18:21Z","title":"You've Got Two Teachers: Co-evolutionary Image and Report Distillation\n for Semi-supervised Anatomical Abnormality Detection in Chest X-ray","summary":" Chest X-ray (CXR) anatomical abnormality detection aims at localizing and\ncharacterising cardiopulmonary radiological findings in the radiographs, which\ncan expedite clinical workflow and reduce observational oversights. Most\nexisting methods attempted this task in either fully supervised settings which\ndemanded costly mass per-abnormality annotations, or weakly supervised settings\nwhich still lagged badly behind fully supervised methods in performance. In\nthis work, we propose a co-evolutionary image and report distillation (CEIRD)\nframework, which approaches semi-supervised abnormality detection in CXR by\ngrounding the visual detection results with text-classified abnormalities from\npaired radiology reports, and vice versa. Concretely, based on the classical\nteacher-student pseudo label distillation (TSD) paradigm, we additionally\nintroduce an auxiliary report classification model, whose prediction is used\nfor report-guided pseudo detection label refinement (RPDLR) in the primary\nvision detection task. 
Inversely, we also use the prediction of the vision\ndetection model for abnormality-guided pseudo classification label refinement\n(APCLR) in the auxiliary report classification task, and propose a co-evolution\nstrategy where the vision and report models mutually promote each other with\nRPDLR and APCLR performed alternatively. To this end, we effectively\nincorporate the weak supervision by reports into the semi-supervised TSD\npipeline. Besides the cross-modal pseudo label refinement, we further propose\nan intra-image-modal self-adaptive non-maximum suppression, where the pseudo\ndetection labels generated by the teacher vision model are dynamically\nrectified by high-confidence predictions by the student. Experimental results\non the public MIMIC-CXR benchmark demonstrate CEIRD's superior performance to\nseveral up-to-date weakly and semi-supervised methods.\n","authors":["Jinghan Sun","Dong Wei","Zhe Xu","Donghuan Lu","Hong Liu","Liansheng Wang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10769v2","updated":"2023-07-18T12:16:28Z","published":"2023-04-21T06:35:54Z","title":"Deep Multiview Clustering by Contrasting Cluster Assignments","summary":" Multiview clustering (MVC) aims to reveal the underlying structure of\nmultiview data by categorizing data samples into clusters. Deep learning-based\nmethods exhibit strong feature learning capabilities on large-scale datasets.\nFor most existing deep MVC methods, exploring the invariant representations of\nmultiple views is still an intractable problem. In this paper, we propose a\ncross-view contrastive learning (CVCL) method that learns view-invariant\nrepresentations and produces clustering results by contrasting the cluster\nassignments among multiple views. Specifically, we first employ deep\nautoencoders to extract view-dependent features in the pretraining stage. Then,\na cluster-level CVCL strategy is presented to explore consistent semantic label\ninformation among the multiple views in the fine-tuning stage. Thus, the\nproposed CVCL method is able to produce more discriminative cluster assignments\nby virtue of this learning strategy. Moreover, we provide a theoretical\nanalysis of soft cluster assignment alignment. Extensive experimental results\nobtained on several datasets demonstrate that the proposed CVCL method\noutperforms several state-of-the-art approaches.\n","authors":["Jie Chen","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2304.10769v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.09183v1","updated":"2023-07-18T12:12:42Z","published":"2023-07-18T12:12:42Z","title":"Pixel-wise Graph Attention Networks for Person Re-identification","summary":" Graph convolutional networks (GCN) is widely used to handle irregular data\nsince it updates node features by using the structure information of graph.\nWith the help of iterated GCN, high-order information can be obtained to\nfurther enhance the representation of nodes. However, how to apply GCN to\nstructured data (such as pictures) has not been deeply studied. In this paper,\nwe explore the application of graph attention networks (GAT) in image feature\nextraction. First of all, we propose a novel graph generation algorithm to\nconvert images into graphs through matrix transformation. It is one magnitude\nfaster than the algorithm based on K Nearest Neighbors (KNN). Then, GAT is used\non the generated graph to update the node features. 
Thus, a more robust\nrepresentation is obtained. These two steps are combined into a module called\npixel-wise graph attention module (PGA). Since the graph obtained by our graph\ngeneration algorithm can still be transformed into a picture after processing,\nPGA can be well combined with CNN. Based on these two modules, we consulted the\nResNet and design a pixel-wise graph attention network (PGANet). The PGANet is\napplied to the task of person re-identification in the datasets Market1501,\nDukeMTMC-reID and Occluded-DukeMTMC (outperforms state-of-the-art by 0.8\\%,\n1.1\\% and 11\\% respectively, in mAP scores). Experiment results show that it\nachieves the state-of-the-art performance.\n\\href{https://github.com/wenyu1009/PGANet}{The code is available here}.\n","authors":["Wenyu Zhang","Qing Ding","Jian Hu","Yi Ma","Mingzhe Lu"],"pdf_url":"https://arxiv.org/pdf/2307.09183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11862v3","updated":"2023-07-18T11:56:25Z","published":"2023-04-24T07:16:54Z","title":"Universal Domain Adaptation via Compressive Attention Matching","summary":" Universal domain adaptation (UniDA) aims to transfer knowledge from the\nsource domain to the target domain without any prior knowledge about the label\nset. The challenge lies in how to determine whether the target samples belong\nto common categories. The mainstream methods make judgments based on the sample\nfeatures, which overemphasizes global information while ignoring the most\ncrucial local objects in the image, resulting in limited accuracy. To address\nthis issue, we propose a Universal Attention Matching (UniAM) framework by\nexploiting the self-attention mechanism in vision transformer to capture the\ncrucial object information. The proposed framework introduces a novel\nCompressive Attention Matching (CAM) approach to explore the core information\nby compressively representing attentions. Furthermore, CAM incorporates a\nresidual-based measurement to determine the sample commonness. By utilizing the\nmeasurement, UniAM achieves domain-wise and category-wise Common Feature\nAlignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first\nmethod utilizing the attention in vision transformer directly to perform\nclassification tasks. Extensive experiments show that UniAM outperforms the\ncurrent state-of-the-art methods on various benchmark datasets.\n","authors":["Didi Zhu","Yincuan Li","Junkun Yuan","Zexi Li","Kun Kuang","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2304.11862v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09172v1","updated":"2023-07-18T11:49:40Z","published":"2023-07-18T11:49:40Z","title":"Jean-Luc Picard at Touché 2023: Comparing Image Generation, Stance\n Detection and Feature Matching for Image Retrieval for Arguments","summary":" Participating in the shared task \"Image Retrieval for arguments\", we used\ndifferent pipelines for image retrieval containing Image Generation, Stance\nDetection, Preselection and Feature Matching. We submitted four different runs\nwith different pipeline layout and compare them to given baseline. Our\npipelines perform similarly to the baseline.\n","authors":["Max Moebius","Maximilian Enderling","Sarah T. 
Bachinger"],"pdf_url":"https://arxiv.org/pdf/2307.09172v1.pdf","comment":"7 pages, 1 figure, 1 table, conference: CLEF"},{"id":"http://arxiv.org/abs/2307.09165v1","updated":"2023-07-18T11:43:01Z","published":"2023-07-18T11:43:01Z","title":"Towards Trustworthy Dataset Distillation","summary":" Efficiency and trustworthiness are two eternal pursuits when applying deep\nlearning in real-world applications. With regard to efficiency, dataset\ndistillation (DD) endeavors to reduce training costs by distilling the large\ndataset into a tiny synthetic dataset. However, existing methods merely\nconcentrate on in-distribution (InD) classification in a closed-world setting,\ndisregarding out-of-distribution (OOD) samples. On the other hand, OOD\ndetection aims to enhance models' trustworthiness, which is always\ninefficiently achieved in full-data settings. For the first time, we\nsimultaneously consider both issues and propose a novel paradigm called\nTrustworthy Dataset Distillation (TrustDD). By distilling both InD samples and\noutliers, the condensed datasets are capable to train models competent in both\nInD classification and OOD detection. To alleviate the requirement of real\noutlier data and make OOD detection more practical, we further propose to\ncorrupt InD samples to generate pseudo-outliers and introduce Pseudo-Outlier\nExposure (POE). Comprehensive experiments on various settings demonstrate the\neffectiveness of TrustDD, and the proposed POE surpasses state-of-the-art\nmethod Outlier Exposure (OE). Compared with the preceding DD, TrustDD is more\ntrustworthy and applicable to real open-world scenarios. Our code will be\npublicly available.\n","authors":["Shijie Ma","Fei Zhu","Zhen Cheng","Xu-Yao Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09165v1.pdf","comment":"20 pages, 20 figures"},{"id":"http://arxiv.org/abs/2307.09161v1","updated":"2023-07-18T11:38:20Z","published":"2023-07-18T11:38:20Z","title":"CG-fusion CAM: Online segmentation of laser-induced damage on\n large-aperture optics","summary":" Online segmentation of laser-induced damage on large-aperture optics in\nhigh-power laser facilities is challenged by complicated damage morphology,\nuneven illumination and stray light interference. Fully supervised semantic\nsegmentation algorithms have achieved state-of-the-art performance, but rely on\nplenty of pixel-level labels, which are time-consuming and labor-consuming to\nproduce. LayerCAM, an advanced weakly supervised semantic segmentation\nalgorithm, can generate pixel-accurate results using only image-level labels,\nbut its scattered and partially under-activated class activation regions\ndegrade segmentation performance. In this paper, we propose a weakly supervised\nsemantic segmentation method with Continuous Gradient CAM and its nonlinear\nmulti-scale fusion (CG-fusion CAM). The method redesigns the way of\nback-propagating gradients and non-linearly activates the multi-scale fused\nheatmaps to generate more fine-grained class activation maps with appropriate\nactivation degree for different sizes of damage sites. 
Experiments on our\ndataset show that the proposed method can achieve segmentation performance\ncomparable to that of fully supervised algorithms.\n","authors":["Yueyue Han","Yingyan Huang","Hangcheng Dong","Fengdong Chen","Fa Zeng","Zhitao Peng","Qihua Zhu","Guodong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09160v1","updated":"2023-07-18T11:37:53Z","published":"2023-07-18T11:37:53Z","title":"Constraining Depth Map Geometry for Multi-View Stereo: A Dual-Depth\n Approach with Saddle-shaped Depth Cells","summary":" Learning-based multi-view stereo (MVS) methods deal with predicting accurate\ndepth maps to achieve an accurate and complete 3D representation. Despite the\nexcellent performance, existing methods ignore the fact that a suitable depth\ngeometry is also critical in MVS. In this paper, we demonstrate that different\ndepth geometries have significant performance gaps, even using the same depth\nprediction error. Therefore, we introduce an ideal depth geometry composed of\nSaddle-Shaped Cells, whose predicted depth map oscillates upward and downward\naround the ground-truth surface, rather than maintaining a continuous and\nsmooth depth plane. To achieve it, we develop a coarse-to-fine framework called\nDual-MVSNet (DMVSNet), which can produce an oscillating depth plane.\nTechnically, we predict two depth values for each pixel (Dual-Depth), and\npropose a novel loss function and a checkerboard-shaped selecting strategy to\nconstrain the predicted depth geometry. Compared to existing methods,DMVSNet\nachieves a high rank on the DTU benchmark and obtains the top performance on\nchallenging scenes of Tanks and Temples, demonstrating its strong performance\nand generalization ability. Our method also points to a new research direction\nfor considering depth geometry in MVS.\n","authors":["Xinyi Ye","Weiyue Zhao","Tianqi Liu","Zihao Huang","Zhiguo Cao","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2307.09160v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09158v1","updated":"2023-07-18T11:35:57Z","published":"2023-07-18T11:35:57Z","title":"Class-relation Knowledge Distillation for Novel Class Discovery","summary":" We tackle the problem of novel class discovery, which aims to learn novel\nclasses without supervision based on labeled data from known classes. A key\nchallenge lies in transferring the knowledge in the known-class data to the\nlearning of novel classes. Previous methods mainly focus on building a shared\nrepresentation space for knowledge transfer and often ignore modeling class\nrelations. To address this, we introduce a class relation representation for\nthe novel classes based on the predicted class distribution of a model trained\non known classes. Empirically, we find that such class relation becomes less\ninformative during typical discovery training. To prevent such information\nloss, we propose a novel knowledge distillation framework, which utilizes our\nclass-relation representation to regularize the learning of novel classes. In\naddition, to enable a flexible knowledge distillation scheme for each data\npoint in novel classes, we develop a learnable weighting function for the\nregularization, which adaptively promotes knowledge transfer based on the\nsemantic similarity between the novel and known classes. 
To validate the\neffectiveness and generalization of our method, we conduct extensive\nexperiments on multiple benchmarks, including CIFAR100, Stanford Cars, CUB, and\nFGVC-Aircraft datasets. Our results demonstrate that the proposed method\noutperforms the previous state-of-the-art methods by a significant margin on\nalmost all benchmarks. Code is available at\n\\href{https://github.com/kleinzcy/Cr-KD-NCD}{here}.\n","authors":["Peiyan Gu","Chuyu Zhang","Ruijie Xu","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2307.09158v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09155v1","updated":"2023-07-18T11:26:02Z","published":"2023-07-18T11:26:02Z","title":"MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection","summary":" In this paper, we propose a novel and effective Multi-Level Fusion network,\nnamed as MLF-DET, for high-performance cross-modal 3D object DETection, which\nintegrates both the feature-level fusion and decision-level fusion to fully\nutilize the information in the image. For the feature-level fusion, we present\nthe Multi-scale Voxel Image fusion (MVI) module, which densely aligns\nmulti-scale voxel features with image features. For the decision-level fusion,\nwe propose the lightweight Feature-cued Confidence Rectification (FCR) module\nwhich further exploits image semantics to rectify the confidence of detection\ncandidates. Besides, we design an effective data augmentation strategy termed\nOcclusion-aware GT Sampling (OGS) to reserve more sampled objects in the\ntraining scenes, so as to reduce overfitting. Extensive experiments on the\nKITTI dataset demonstrate the effectiveness of our method. Notably, on the\nextremely competitive KITTI car 3D object detection benchmark, our method\nreaches 82.89% moderate AP and achieves state-of-the-art performance without\nbells and whistles.\n","authors":["Zewei Lin","Yanqing Shen","Sanping Zhou","Shitao Chen","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09153v1","updated":"2023-07-18T11:24:42Z","published":"2023-07-18T11:24:42Z","title":"OPHAvatars: One-shot Photo-realistic Head Avatars","summary":" We propose a method for synthesizing photo-realistic digital avatars from\nonly one portrait as the reference. Given a portrait, our method synthesizes a\ncoarse talking head video using driving keypoints features. And with the coarse\nvideo, our method synthesizes a coarse talking head avatar with a deforming\nneural radiance field. With rendered images of the coarse avatar, our method\nupdates the low-quality images with a blind face restoration model. With\nupdated images, we retrain the avatar for higher quality. After several\niterations, our method can synthesize a photo-realistic animatable 3D neural\nhead avatar. The motivation of our method is deformable neural radiance field\ncan eliminate the unnatural distortion caused by the image2video method. 
Our\nmethod outperforms state-of-the-art methods in quantitative and qualitative\nstudies on various subjects.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2307.09153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09146v1","updated":"2023-07-18T10:55:54Z","published":"2023-07-18T10:55:54Z","title":"PRO-Face S: Privacy-preserving Reversible Obfuscation of Face Images via\n Secure Flow","summary":" This paper proposes a novel paradigm for facial privacy protection that\nunifies multiple characteristics including anonymity, diversity, reversibility\nand security within a single lightweight framework. We name it PRO-Face S,\nshort for Privacy-preserving Reversible Obfuscation of Face images via Secure\nflow-based model. In the framework, an Invertible Neural Network (INN) is\nutilized to process the input image along with its pre-obfuscated form, and\ngenerate the privacy protected image that visually approximates to the\npre-obfuscated one, thus ensuring privacy. The pre-obfuscation applied can be\nin diversified form with different strengths and styles specified by users.\nAlong protection, a secret key is injected into the network such that the\noriginal image can only be recovered from the protection image via the same\nmodel given the correct key provided. Two modes of image recovery are devised\nto deal with malicious recovery attempts in different scenarios. Finally,\nextensive experiments conducted on three public image datasets demonstrate the\nsuperiority of the proposed framework over multiple state-of-the-art\napproaches.\n","authors":["Lin Yuan","Kai Liang","Xiao Pu","Yan Zhang","Jiaxu Leng","Tao Wu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2307.09146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09143v1","updated":"2023-07-18T10:52:24Z","published":"2023-07-18T10:52:24Z","title":"MVA2023 Small Object Detection Challenge for Spotting Birds: Dataset,\n Methods, and Results","summary":" Small Object Detection (SOD) is an important machine vision topic because (i)\na variety of real-world applications require object detection for distant\nobjects and (ii) SOD is a challenging task due to the noisy, blurred, and\nless-informative image appearances of small objects. This paper proposes a new\nSOD dataset consisting of 39,070 images including 137,121 bird instances, which\nis called the Small Object Detection for Spotting Birds (SOD4SB) dataset. The\ndetail of the challenge with the SOD4SB dataset is introduced in this paper. In\ntotal, 223 participants joined this challenge. This paper briefly introduces\nthe award-winning methods. The dataset, the baseline code, and the website for\nevaluation on the public testset are publicly available.\n","authors":["Yuki Kondo","Norimichi Ukita","Takayuki Yamaguchi","Hao-Yu Hou","Mu-Yi Shen","Chia-Chi Hsu","En-Ming Huang","Yu-Chen Huang","Yu-Cheng Xia","Chien-Yao Wang","Chun-Yi Lee","Da Huo","Marc A. Kastner","Tingwei Liu","Yasutomo Kawanishi","Takatsugu Hirayama","Takahiro Komamizu","Ichiro Ide","Yosuke Shinya","Xinyao Liu","Guang Liang","Syusuke Yasui"],"pdf_url":"https://arxiv.org/pdf/2307.09143v1.pdf","comment":"This paper is included in the proceedings of the 18th International\n Conference on Machine Vision Applications (MVA2023). It will be officially\n published at a later date. 
Project page :\n https://www.mva-org.jp/mva2023/challenge"},{"id":"http://arxiv.org/abs/2307.07754v2","updated":"2023-07-18T10:36:30Z","published":"2023-07-15T09:24:45Z","title":"Bidirectionally Deformable Motion Modulation For Video-based Human Pose\n Transfer","summary":" Video-based human pose transfer is a video-to-video generation task that\nanimates a plain source human image based on a series of target human poses.\nConsidering the difficulties in transferring highly structural patterns on the\ngarments and discontinuous poses, existing methods often generate\nunsatisfactory results such as distorted textures and flickering artifacts. To\naddress these issues, we propose a novel Deformable Motion Modulation (DMM)\nthat utilizes geometric kernel offset with adaptive weight modulation to\nsimultaneously perform feature alignment and style transfer. Different from\nnormal style modulation used in style transfer, the proposed modulation\nmechanism adaptively reconstructs smoothed frames from style codes according to\nthe object shape through an irregular receptive field of view. To enhance the\nspatio-temporal consistency, we leverage bidirectional propagation to extract\nthe hidden motion information from a warped image sequence generated by noisy\nposes. The proposed feature propagation significantly enhances the motion\nprediction ability by forward and backward propagation. Both quantitative and\nqualitative experimental results demonstrate superiority over the\nstate-of-the-arts in terms of image fidelity and visual continuity. The source\ncode is publicly available at github.com/rocketappslab/bdmm.\n","authors":["Wing-Yin Yu","Lai-Man Po","Ray C. C. Cheung","Yuzhi Zhao","Yu Xue","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2307.07754v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09136v1","updated":"2023-07-18T10:34:21Z","published":"2023-07-18T10:34:21Z","title":"DropMix: Reducing Class Dependency in Mixed Sample Data Augmentation","summary":" Mixed sample data augmentation (MSDA) is a widely used technique that has\nbeen found to improve performance in a variety of tasks. However, in this\npaper, we show that the effects of MSDA are class-dependent, with some classes\nseeing an improvement in performance while others experience a decline. To\nreduce class dependency, we propose the DropMix method, which excludes a\nspecific percentage of data from the MSDA computation. By training on a\ncombination of MSDA and non-MSDA data, the proposed method not only improves\nthe performance of classes that were previously degraded by MSDA, but also\nincreases overall average accuracy, as shown in experiments on two datasets\n(CIFAR-100 and ImageNet) using three MSDA methods (Mixup, CutMix and\nPuzzleMix).\n","authors":["Haeil Lee","Hansang Lee","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09136v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2306.15548v3","updated":"2023-07-18T10:26:58Z","published":"2023-06-27T15:18:52Z","title":"Geometric Ultrasound Localization Microscopy","summary":" Contrast-Enhanced Ultra-Sound (CEUS) has become a viable method for\nnon-invasive, dynamic visualization in medical diagnostics, yet Ultrasound\nLocalization Microscopy (ULM) has enabled a revolutionary breakthrough by\noffering ten times higher resolution. To date, Delay-And-Sum (DAS) beamformers\nare used to render ULM frames, ultimately determining the image resolution\ncapability. 
To take full advantage of ULM, this study questions whether\nbeamforming is the most effective processing step for ULM, suggesting an\nalternative approach that relies solely on Time-Difference-of-Arrival (TDoA)\ninformation. To this end, a novel geometric framework for micro bubble\nlocalization via ellipse intersections is proposed to overcome existing\nbeamforming limitations. We present a benchmark comparison based on a public\ndataset for which our geometric ULM outperforms existing baseline methods in\nterms of accuracy and robustness while only utilizing a portion of the\navailable transducer data.\n","authors":["Christopher Hahne","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2306.15548v3.pdf","comment":"Pre-print accepted for MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09120v1","updated":"2023-07-18T10:07:06Z","published":"2023-07-18T10:07:06Z","title":"Light-Weight Vision Transformer with Parallel Local and Global\n Self-Attention","summary":" While transformer architectures have dominated computer vision in recent\nyears, these models cannot easily be deployed on hardware with limited\nresources for autonomous driving tasks that require real-time-performance.\nTheir computational complexity and memory requirements limits their use,\nespecially for applications with high-resolution inputs. In our work, we\nredesign the powerful state-of-the-art Vision Transformer PLG-ViT to a much\nmore compact and efficient architecture that is suitable for such tasks. We\nidentify computationally expensive blocks in the original PLG-ViT architecture\nand propose several redesigns aimed at reducing the number of parameters and\nfloating-point operations. As a result of our redesign, we are able to reduce\nPLG-ViT in size by a factor of 5, with a moderate drop in performance. We\npropose two variants, optimized for the best trade-off between parameter count\nto runtime as well as parameter count to accuracy. With only 5 million\nparameters, we achieve 79.5$\\%$ top-1 accuracy on the ImageNet-1K\nclassification benchmark. Our networks demonstrate great performance on general\nvision benchmarks like COCO instance segmentation. In addition, we conduct a\nseries of experiments, demonstrating the potential of our approach in solving\nvarious tasks specifically tailored to the challenges of autonomous driving and\ntransportation.\n","authors":["Nikolas Ebert","Laurenz Reichardt","Didier Stricker","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2307.09120v1.pdf","comment":"This paper has been accepted at IEEE Intelligent Transportation\n Systems Conference (ITSC), 2023"},{"id":"http://arxiv.org/abs/2307.09112v1","updated":"2023-07-18T10:02:09Z","published":"2023-07-18T10:02:09Z","title":"NU-MCC: Multiview Compressive Coding with Neighborhood Decoder and\n Repulsive UDF","summary":" Remarkable progress has been made in 3D reconstruction from single-view RGB-D\ninputs. MCC is the current state-of-the-art method in this field, which\nachieves unprecedented success by combining vision Transformers with\nlarge-scale training. However, we identified two key limitations of MCC: 1) The\nTransformer decoder is inefficient in handling large number of query points; 2)\nThe 3D representation struggles to recover high-fidelity details. In this\npaper, we propose a new approach called NU-MCC that addresses these\nlimitations. NU-MCC includes two key innovations: a Neighborhood decoder and a\nRepulsive Unsigned Distance Function (Repulsive UDF). 
First, our Neighborhood\ndecoder introduces center points as an efficient proxy of input visual\nfeatures, allowing each query point to only attend to a small neighborhood.\nThis design not only results in much faster inference speed but also enables\nthe exploitation of finer-scale visual features for improved recovery of 3D\ntextures. Second, our Repulsive UDF is a novel alternative to the occupancy\nfield used in MCC, significantly improving the quality of 3D object\nreconstruction. Compared to standard UDFs that suffer from holes in results,\nour proposed Repulsive UDF can achieve more complete surface reconstruction.\nExperimental results demonstrate that NU-MCC is able to learn a strong 3D\nrepresentation, significantly advancing the state of the art in single-view 3D\nreconstruction. Particularly, it outperforms MCC by 9.7% in terms of the\nF1-score on the CO3D-v2 dataset with more than 5x faster running speed.\n","authors":["Stefan Lionar","Xiangyu Xu","Min Lin","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2307.09112v1.pdf","comment":"Project page: https://numcc.github.io/"},{"id":"http://arxiv.org/abs/2307.09109v1","updated":"2023-07-18T09:58:15Z","published":"2023-07-18T09:58:15Z","title":"Mining of Single-Class by Active Learning for Semantic Segmentation","summary":" Several Active Learning (AL) policies require retraining a target model\nseveral times in order to identify the most informative samples and rarely\noffer the option to focus on the acquisition of samples from underrepresented\nclasses. Here the Mining of Single-Class by Active Learning (MiSiCAL) paradigm\nis introduced where an AL policy is constructed through deep reinforcement\nlearning and exploits quantity-accuracy correlations to build datasets on which\nhigh-performance models can be trained with regards to specific classes.\nMiSiCAL is especially helpful in the case of very large batch sizes since it\ndoes not require repeated model training sessions as is common in other AL\nmethods. This is thanks to its ability to exploit fixed representations of the\ncandidate data points. We find that MiSiCAL is able to outperform a random\npolicy on 150 out of 171 COCO10k classes, while the strongest baseline only\noutperforms random on 101 classes.\n","authors":["Hugues Lambert","Emma Slade"],"pdf_url":"https://arxiv.org/pdf/2307.09109v1.pdf","comment":"29 pages, 14 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.09104v1","updated":"2023-07-18T09:52:48Z","published":"2023-07-18T09:52:48Z","title":"Division Gets Better: Learning Brightness-Aware and Detail-Sensitive\n Representations for Low-Light Image Enhancement","summary":" Low-light image enhancement strives to improve the contrast, adjust the\nvisibility, and restore the distortion in color and texture. Existing methods\nusually pay more attention to improving the visibility and contrast via\nincreasing the lightness of low-light images, while disregarding the\nsignificance of color and texture restoration for high-quality images. Against\nabove issue, we propose a novel luminance and chrominance dual branch network,\ntermed LCDBNet, for low-light image enhancement, which divides low-light image\nenhancement into two sub-tasks, e.g., luminance adjustment and chrominance\nrestoration. Specifically, LCDBNet is composed of two branches, namely\nluminance adjustment network (LAN) and chrominance restoration network (CRN).\nLAN takes responsibility for learning brightness-aware features leveraging\nlong-range dependency and local attention correlation. 
While CRN concentrates\non learning detail-sensitive features via multi-level wavelet decomposition.\nFinally, a fusion network is designed to blend their learned features to\nproduce visually impressive images. Extensive experiments conducted on seven\nbenchmark datasets validate the effectiveness of our proposed LCDBNet, and the\nresults manifest that LCDBNet achieves superior performance in terms of\nmultiple reference/non-reference quality evaluators compared to other\nstate-of-the-art competitors. Our code and pretrained model will be available.\n","authors":["Huake Wang","Xiaoyang Yan","Xingsong Hou","Junhui Li","Yujie Dun","Kaibing Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09104v1.pdf","comment":"14 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.09099v1","updated":"2023-07-18T09:42:51Z","published":"2023-07-18T09:42:51Z","title":"A Survey on Multi-Objective Neural Architecture Search","summary":" Recently, the expert-crafted neural architectures is increasing overtaken by\nthe utilization of neural architecture search (NAS) and automatic generation\n(and tuning) of network structures which has a close relation to the\nHyperparameter Optimization and Auto Machine Learning (AutoML). After the\nearlier NAS attempts to optimize only the prediction accuracy, Multi-Objective\nNeural architecture Search (MONAS) has been attracting attentions which\nconsiders more goals such as computational complexity, power consumption, and\nsize of the network for optimization, reaching a trade-off between the accuracy\nand other features like the computational cost. In this paper, we present an\noverview of principal and state-of-the-art works in the field of MONAS.\nStarting from a well-categorized taxonomy and formulation for the NAS, we\naddress and correct some miscategorizations in previous surveys of the NAS\nfield. We also provide a list of all known objectives used and add a number of\nnew ones and elaborate their specifications. We have provides analyses about\nthe most important objectives and shown that the stochastic properties of some\nthe them should be differed from deterministic ones in the multi-objective\noptimization procedure of NAS. We finalize this paper with a number of future\ndirections and topics in the field of MONAS.\n","authors":["Seyed Mahdi Shariatzadeh","Mahmood Fathy","Reza Berangi","Mohammad Shahverdy"],"pdf_url":"https://arxiv.org/pdf/2307.09099v1.pdf","comment":"22 pages, 10 figures, 9 tables"},{"id":"http://arxiv.org/abs/2307.07483v2","updated":"2023-07-18T09:26:57Z","published":"2023-07-14T17:07:32Z","title":"Multimodal Distillation for Egocentric Action Recognition","summary":" The focal point of egocentric video understanding is modelling hand-object\ninteractions. Standard models, e.g. CNNs or Vision Transformers, which receive\nRGB frames as input perform well. However, their performance improves further\nby employing additional input modalities that provide complementary cues, such\nas object detections, optical flow, audio, etc. The added complexity of the\nmodality-specific modules, on the other hand, makes these models impractical\nfor deployment. The goal of this work is to retain the performance of such a\nmultimodal approach, while using only the RGB frames as input at inference\ntime. 
We demonstrate that for egocentric action recognition on the\nEpic-Kitchens and the Something-Something datasets, students which are taught\nby multimodal teachers tend to be more accurate and better calibrated than\narchitecturally equivalent models trained on ground truth labels in a unimodal\nor multimodal fashion. We further adopt a principled multimodal knowledge\ndistillation framework, allowing us to deal with issues which occur when\napplying multimodal knowledge distillation in a naive manner. Lastly, we\ndemonstrate the achieved reduction in computational complexity, and show that\nour approach maintains higher performance with the reduction of the number of\ninput views. We release our code at\nhttps://github.com/gorjanradevski/multimodal-distillation.\n","authors":["Gorjan Radevski","Dusan Grujicic","Marie-Francine Moens","Matthew Blaschko","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2307.07483v2.pdf","comment":"Accepted at ICCV 2023; Codebase released at\n https://github.com/gorjanradevski/multimodal-distillation"},{"id":"http://arxiv.org/abs/2305.09211v2","updated":"2023-07-18T09:21:27Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. 
The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08466v2","updated":"2023-07-18T09:16:10Z","published":"2023-07-17T13:21:02Z","title":"Generalizable Classification of UHF Partial Discharge Signals in\n Gas-Insulated HVDC Systems Using Neural Networks","summary":" Undetected partial discharges (PDs) are a safety critical issue in high\nvoltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC\nvoltage is well-established, the analysis of PDs under DC voltage remains an\nactive research field. A key focus of these investigations is the\nclassification of different PD sources to enable subsequent sophisticated\nanalysis.\n In this paper, we propose and analyze a neural network-based approach for\nclassifying PD signals caused by metallic protrusions and conductive particles\non the insulator of HVDC GIS, without relying on pulse sequence analysis\nfeatures. In contrast to previous approaches, our proposed model can\ndiscriminate the studied PD signals obtained at negative and positive\npotentials, while also generalizing to unseen operating voltage multiples.\nAdditionally, we compare the performance of time- and frequency-domain input\nsignals and explore the impact of different normalization schemes to mitigate\nthe influence of free-space path loss between the sensor and defect location.\n","authors":["Steffen Seitz","Thomas Götz","Christopher Lindenberg","Ronald Tetzlaff","Stephan Schlegel"],"pdf_url":"https://arxiv.org/pdf/2307.08466v2.pdf","comment":"8 pages, submitted to IEEE Transactions on Power Delivery"},{"id":"http://arxiv.org/abs/2307.02347v2","updated":"2023-07-18T09:09:02Z","published":"2023-07-05T15:03:10Z","title":"Detecting Images Generated by Deep Diffusion Models using their Local\n Intrinsic Dimensionality","summary":" Diffusion models recently have been successfully applied for the visual\nsynthesis of strikingly realistic appearing images. This raises strong concerns\nabout their potential for malicious purposes. In this paper, we propose using\nthe lightweight multi Local Intrinsic Dimensionality (multiLID), which has been\noriginally developed in context of the detection of adversarial examples, for\nthe automatic detection of synthetic images and the identification of the\naccording generator networks. In contrast to many existing detection\napproaches, which often only work for GAN-generated images, the proposed method\nprovides close to perfect detection results in many realistic use cases.\nExtensive experiments on known and newly created datasets demonstrate that the\nproposed multiLID approach exhibits superiority in diffusion detection and\nmodel identification. 
Since the empirical evaluations of recent publications on\nthe detection of generated images are often mainly focused on the\n\"LSUN-Bedroom\" dataset, we further establish a comprehensive benchmark for the\ndetection of diffusion-generated images, including samples from several\ndiffusion models with different image sizes.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.02347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09070v1","updated":"2023-07-18T08:41:17Z","published":"2023-07-18T08:41:17Z","title":"PixelHuman: Animatable Neural Radiance Fields from Few Images","summary":" In this paper, we propose PixelHuman, a novel human rendering model that\ngenerates animatable human scenes from a few images of a person with unseen\nidentity, views, and poses. Previous work have demonstrated reasonable\nperformance in novel view and pose synthesis, but they rely on a large number\nof images to train and are trained per scene from videos, which requires\nsignificant amount of time to produce animatable scenes from unseen human\nimages. Our method differs from existing methods in that it can generalize to\nany input image for animatable human synthesis. Given a random pose sequence,\nour method synthesizes each target scene using a neural radiance field that is\nconditioned on a canonical representation and pose-aware pixel-aligned\nfeatures, both of which can be obtained through deformation fields learned in a\ndata-driven manner. Our experiments show that our method achieves\nstate-of-the-art performance in multiview and novel pose synthesis from\nfew-shot images.\n","authors":["Gyumin Shim","Jaeseong Lee","Junha Hyung","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2307.09070v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.09067v1","updated":"2023-07-18T08:37:58Z","published":"2023-07-18T08:37:58Z","title":"Evaluate Fine-tuning Strategies for Fetal Head Ultrasound Image\n Segmentation with U-Net","summary":" Fetal head segmentation is a crucial step in measuring the fetal head\ncircumference (HC) during gestation, an important biometric in obstetrics for\nmonitoring fetal growth. However, manual biometry generation is time-consuming\nand results in inconsistent accuracy. To address this issue, convolutional\nneural network (CNN) models have been utilized to improve the efficiency of\nmedical biometry. But training a CNN network from scratch is a challenging\ntask, we proposed a Transfer Learning (TL) method. Our approach involves\nfine-tuning (FT) a U-Net network with a lightweight MobileNet as the encoder to\nperform segmentation on a set of fetal head ultrasound (US) images with limited\neffort. This method addresses the challenges associated with training a CNN\nnetwork from scratch. It suggests that our proposed FT strategy yields\nsegmentation performance that is comparable when trained with a reduced number\nof parameters by 85.8%. And our proposed FT strategy outperforms other\nstrategies with smaller trainable parameter sizes below 4.4 million. Thus, we\ncontend that it can serve as a dependable FT approach for reducing the size of\nmodels in medical image analysis. Our key findings highlight the importance of\nthe balance between model performance and size in developing Artificial\nIntelligence (AI) applications by TL methods. Code is available at\nhttps://github.com/13204942/FT_Methods_for_Fetal_Head_Segmentation.\n","authors":["Fangyijie Wang","Guénolé Silvestre","Kathleen M. 
Curran"],"pdf_url":"https://arxiv.org/pdf/2307.09067v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.09066v1","updated":"2023-07-18T08:37:37Z","published":"2023-07-18T08:37:37Z","title":"PatchCT: Aligning Patch Set and Label Set with Conditional Transport for\n Multi-Label Image Classification","summary":" Multi-label image classification is a prediction task that aims to identify\nmore than one label from a given image. This paper considers the semantic\nconsistency of the latent space between the visual patch and linguistic label\ndomains and introduces the conditional transport (CT) theory to bridge the\nacknowledged gap. While recent cross-modal attention-based studies have\nattempted to align such two representations and achieved impressive\nperformance, they required carefully-designed alignment modules and extra\ncomplex operations in the attention computation. We find that by formulating\nthe multi-label classification as a CT problem, we can exploit the interactions\nbetween the image and label efficiently by minimizing the bidirectional CT\ncost. Specifically, after feeding the images and textual labels into the\nmodality-specific encoders, we view each image as a mixture of patch embeddings\nand a mixture of label embeddings, which capture the local region features and\nthe class prototypes, respectively. CT is then employed to learn and align\nthose two semantic sets by defining the forward and backward navigators.\nImportantly, the defined navigators in CT distance model the similarities\nbetween patches and labels, which provides an interpretable tool to visualize\nthe learned prototypes. Extensive experiments on three public image benchmarks\nshow that the proposed model consistently outperforms the previous methods. Our\ncode is available at https://github.com/keepgoingjkg/PatchCT.\n","authors":["Miaoge Li","Dongsheng Wang","Xinyang Liu","Zequn Zeng","Ruiying Lu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.09066v1.pdf","comment":"accepted by ICCV23"},{"id":"http://arxiv.org/abs/2307.09065v1","updated":"2023-07-18T08:37:25Z","published":"2023-07-18T08:37:25Z","title":"Learning Adaptive Neighborhoods for Graph Neural Networks","summary":" Graph convolutional networks (GCNs) enable end-to-end learning on graph\nstructured data. However, many works assume a given graph structure. When the\ninput graph is noisy or unavailable, one approach is to construct or learn a\nlatent graph structure. These methods typically fix the choice of node degree\nfor the entire graph, which is suboptimal. Instead, we propose a novel\nend-to-end differentiable graph generator which builds graph topologies where\neach node selects both its neighborhood and its size. Our module can be readily\nintegrated into existing pipelines involving graph convolution operations,\nreplacing the predetermined or existing adjacency matrix with one that is\nlearned, and optimized, as part of the general objective. As such it is\napplicable to any GCN. 
We integrate our module into trajectory prediction,\npoint cloud classification and node classification pipelines resulting in\nimproved accuracy over other structure-learning methods across a wide range of\ndatasets and GCN backbones.\n","authors":["Avishkar Saha","Oscar Mendez","Chris Russell","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2307.09065v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09059v1","updated":"2023-07-18T08:23:46Z","published":"2023-07-18T08:23:46Z","title":"Unleashing the Imagination of Text: A Novel Framework for Text-to-image\n Person Retrieval via Exploring the Power of Words","summary":" The goal of Text-to-image person retrieval is to retrieve person images from\na large gallery that match the given textual descriptions. The main challenge\nof this task lies in the significant differences in information representation\nbetween the visual and textual modalities. The textual modality conveys\nabstract and precise information through vocabulary and grammatical structures,\nwhile the visual modality conveys concrete and intuitive information through\nimages. To fully leverage the expressive power of textual representations, it\nis essential to accurately map abstract textual descriptions to specific\nimages.\n To address this issue, we propose a novel framework to Unleash the\nImagination of Text (UIT) in text-to-image person retrieval, aiming to fully\nexplore the power of words in sentences. Specifically, the framework employs\nthe pre-trained full CLIP model as a dual encoder for the images and texts ,\ntaking advantage of prior cross-modal alignment knowledge. The Text-guided\nImage Restoration auxiliary task is proposed with the aim of implicitly mapping\nabstract textual entities to specific image regions, facilitating alignment\nbetween textual and visual embeddings. Additionally, we introduce a cross-modal\ntriplet loss tailored for handling hard samples, enhancing the model's ability\nto distinguish minor differences.\n To focus the model on the key components within sentences, we propose a novel\ntext data augmentation technique. Our proposed methods achieve state-of-the-art\nresults on three popular benchmark datasets, and the source code will be made\npublicly available shortly.\n","authors":["Delong Liu","Haiwen Li"],"pdf_url":"https://arxiv.org/pdf/2307.09059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09055v1","updated":"2023-07-18T08:11:08Z","published":"2023-07-18T08:11:08Z","title":"Outlier-Robust Tensor Low-Rank Representation for Data Clustering","summary":" Low-rank tensor analysis has received widespread attention with many\npractical applications. However, the tensor data are often contaminated by\noutliers or sample-specific corruptions. How to recover the tensor data that\nare corrupted by outliers and perform data clustering remains a challenging\nproblem. This paper develops an outlier-robust tensor low-rank representation\n(OR-TLRR) method for simultaneous outlier detection and tensor data clustering\nbased on the tensor singular value decomposition (t-SVD) algebraic framework.\nIt is motivated by the recently proposed tensor-tensor product induced by\ninvertible linear transforms that satisfy certain conditions. For tensor\nobservations with arbitrary outlier corruptions, OR-TLRR has provable\nperformance guarantee for exactly recovering the row space of clean data and\ndetecting outliers under mild conditions. 
Moreover, an extension of OR-TLRR is\nalso proposed to handle the case when parts of the data are missing. Finally,\nextensive experimental results on both synthetic and real data demonstrate the\neffectiveness of the proposed algorithms.\n","authors":["Tong Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09055v1.pdf","comment":"12 pages, 1 figure; preprint of a journal paper"},{"id":"http://arxiv.org/abs/2307.09052v1","updated":"2023-07-18T08:06:14Z","published":"2023-07-18T08:06:14Z","title":"Connections between Operator-splitting Methods and Deep Neural Networks\n with Applications in Image Segmentation","summary":" Deep neural network is a powerful tool for many tasks. Understanding why it\nis so successful and providing a mathematical explanation is an important\nproblem and has been one popular research direction in past years. In the\nliterature of mathematical analysis of deep deep neural networks, a lot of\nworks are dedicated to establishing representation theories. How to make\nconnections between deep neural networks and mathematical algorithms is still\nunder development. In this paper, we give an algorithmic explanation for deep\nneural networks, especially in their connection with operator splitting and\nmultigrid methods. We show that with certain splitting strategies,\noperator-splitting methods have the same structure as networks. Utilizing this\nconnection and the Potts model for image segmentation, two networks inspired by\noperator-splitting methods are proposed. The two networks are essentially two\noperator-splitting algorithms solving the Potts model. Numerical experiments\nare presented to demonstrate the effectiveness of the proposed networks.\n","authors":["Hao Liu","Xue-Cheng Tai","Raymond Chan"],"pdf_url":"https://arxiv.org/pdf/2307.09052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09050v1","updated":"2023-07-18T08:03:51Z","published":"2023-07-18T08:03:51Z","title":"R-Cut: Enhancing Explainability in Vision Transformers with Relationship\n Weighted Out and Cut","summary":" Transformer-based models have gained popularity in the field of natural\nlanguage processing (NLP) and are extensively utilized in computer vision tasks\nand multi-modal models such as GPT4. This paper presents a novel method to\nenhance the explainability of Transformer-based image classification models.\nOur method aims to improve trust in classification results and empower users to\ngain a deeper understanding of the model for downstream tasks by providing\nvisualizations of class-specific maps. We introduce two modules: the\n``Relationship Weighted Out\" and the ``Cut\" modules. The ``Relationship\nWeighted Out\" module focuses on extracting class-specific information from\nintermediate layers, enabling us to highlight relevant features. Additionally,\nthe ``Cut\" module performs fine-grained feature decomposition, taking into\naccount factors such as position, texture, and color. By integrating these\nmodules, we generate dense class-specific visual explainability maps. We\nvalidate our method with extensive qualitative and quantitative experiments on\nthe ImageNet dataset. Furthermore, we conduct a large number of experiments on\nthe LRN dataset, specifically designed for automatic driving danger alerts, to\nevaluate the explainability of our method in complex backgrounds. The results\ndemonstrate a significant improvement over previous methods. 
Moreover, we\nconduct ablation experiments to validate the effectiveness of each module.\nThrough these experiments, we are able to confirm the respective contributions\nof each module, thus solidifying the overall effectiveness of our proposed\napproach.\n","authors":["Yingjie Niu","Ming Ding","Maoning Ge","Robin Karlsson","Yuxiao Zhang","Kazuya Takeda"],"pdf_url":"https://arxiv.org/pdf/2307.09050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09039v1","updated":"2023-07-18T07:48:48Z","published":"2023-07-18T07:48:48Z","title":"PottsMGNet: A Mathematical Explanation of Encoder-Decoder Based Neural\n Networks","summary":" For problems in image processing and many other fields, a large class of\neffective neural networks has encoder-decoder-based architectures. Although\nthese networks have made impressive performances, mathematical explanations of\ntheir architectures are still underdeveloped. In this paper, we study the\nencoder-decoder-based network architecture from the algorithmic perspective and\nprovide a mathematical explanation. We use the two-phase Potts model for image\nsegmentation as an example for our explanations. We associate the segmentation\nproblem with a control problem in the continuous setting. Then, multigrid\nmethod and operator splitting scheme, the PottsMGNet, are used to discretize\nthe continuous control model. We show that the resulting discrete PottsMGNet is\nequivalent to an encoder-decoder-based network. With minor modifications, it is\nshown that a number of the popular encoder-decoder-based neural networks are\njust instances of the proposed PottsMGNet. By incorporating the\nSoft-Threshold-Dynamics into the PottsMGNet as a regularizer, the PottsMGNet\nhas shown to be robust with the network parameters such as network width and\ndepth and achieved remarkable performance on datasets with very large noise. In\nnearly all our experiments, the new network always performs better or as good\non accuracy and dice score than existing networks for image segmentation.\n","authors":["Xue-Cheng Tai","Hao Liu","Raymond Chan"],"pdf_url":"https://arxiv.org/pdf/2307.09039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09027v1","updated":"2023-07-18T07:35:28Z","published":"2023-07-18T07:35:28Z","title":"Online Self-Supervised Thermal Water Segmentation for Aerial Vehicles","summary":" We present a new method to adapt an RGB-trained water segmentation network to\ntarget-domain aerial thermal imagery using online self-supervision by\nleveraging texture and motion cues as supervisory signals. This new thermal\ncapability enables current autonomous aerial robots operating in near-shore\nenvironments to perform tasks such as visual navigation, bathymetry, and flow\ntracking at night. Our method overcomes the problem of scarce and\ndifficult-to-obtain near-shore thermal data that prevents the application of\nconventional supervised and unsupervised methods. In this work, we curate the\nfirst aerial thermal near-shore dataset, show that our approach outperforms\nfully-supervised segmentation models trained on limited target-domain thermal\ndata, and demonstrate real-time capabilities onboard an Nvidia Jetson embedded\ncomputing platform. 
Code and datasets used in this work will be available at:\nhttps://github.com/connorlee77/uav-thermal-water-segmentation.\n","authors":["Connor Lee","Jonathan Gustafsson Frennert","Lu Gan","Matthew Anderson","Soon-Jo Chung"],"pdf_url":"https://arxiv.org/pdf/2307.09027v1.pdf","comment":"8 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2307.09026v1","updated":"2023-07-18T07:34:04Z","published":"2023-07-18T07:34:04Z","title":"ActionPrompt: Action-Guided 3D Human Pose Estimation With Text and Pose\n Prompting","summary":" Recent 2D-to-3D human pose estimation (HPE) utilizes temporal consistency\nacross sequences to alleviate the depth ambiguity problem but ignore the action\nrelated prior knowledge hidden in the pose sequence. In this paper, we propose\na plug-and-play module named Action Prompt Module (APM) that effectively mines\ndifferent kinds of action clues for 3D HPE. The highlight is that, the mining\nscheme of APM can be widely adapted to different frameworks and bring\nconsistent benefits. Specifically, we first present a novel Action-related Text\nPrompt module (ATP) that directly embeds action labels and transfers the rich\nlanguage information in the label to the pose sequence. Besides, we further\nintroduce Action-specific Pose Prompt module (APP) to mine the position-aware\npose pattern of each action, and exploit the correlation between the mined\npatterns and input pose sequence for further pose refinement. Experiments show\nthat APM can improve the performance of most video-based 2D-to-3D HPE\nframeworks by a large margin.\n","authors":["Hongwei Zheng","Han Li","Bowen Shi","Wenrui Dai","Botao Wan","Yu Sun","Min Guo","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.09026v1.pdf","comment":"6 pages, 4 figures, 2023ICME"},{"id":"http://arxiv.org/abs/2307.07250v2","updated":"2023-07-18T07:31:34Z","published":"2023-07-14T09:51:26Z","title":"Mitigating Adversarial Vulnerability through Causal Parameter Estimation\n by Adversarial Double Machine Learning","summary":" Adversarial examples derived from deliberately crafted perturbations on\nvisual inputs can easily harm decision process of deep neural networks. To\nprevent potential threats, various adversarial training-based defense methods\nhave grown rapidly and become a de facto standard approach for robustness.\nDespite recent competitive achievements, we observe that adversarial\nvulnerability varies across targets and certain vulnerabilities remain\nprevalent. Intriguingly, such peculiar phenomenon cannot be relieved even with\ndeeper architectures and advanced defense methods. To address this issue, in\nthis paper, we introduce a causal approach called Adversarial Double Machine\nLearning (ADML), which allows us to quantify the degree of adversarial\nvulnerability for network predictions and capture the effect of treatments on\noutcome of interests. ADML can directly estimate causal parameter of\nadversarial perturbations per se and mitigate negative effects that can\npotentially damage robustness, bridging a causal perspective into the\nadversarial vulnerability. 
Through extensive experiments on various CNN and\nTransformer architectures, we corroborate that ADML improves adversarial\nrobustness with large margins and relieve the empirical observation.\n","authors":["Byung-Kwan Lee","Junho Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2307.07250v2.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09023v1","updated":"2023-07-18T07:25:38Z","published":"2023-07-18T07:25:38Z","title":"LA-Net: Landmark-Aware Learning for Reliable Facial Expression\n Recognition under Label Noise","summary":" Facial expression recognition (FER) remains a challenging task due to the\nambiguity of expressions. The derived noisy labels significantly harm the\nperformance in real-world scenarios. To address this issue, we present a new\nFER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks\nto mitigate the impact of label noise from two perspectives. Firstly, LA-Net\nuses landmark information to suppress the uncertainty in expression space and\nconstructs the label distribution of each sample by neighborhood aggregation,\nwhich in turn improves the quality of training supervision. Secondly, the model\nincorporates landmark information into expression representations using the\ndevised expression-landmark contrastive loss. The enhanced expression feature\nextractor can be less susceptible to label noise. Our method can be integrated\nwith any deep neural network for better training supervision without\nintroducing extra inference costs. We conduct extensive experiments on both\nin-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net\nachieves state-of-the-art performance.\n","authors":["Zhiyu Wu","Jinshi Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09023v1.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09020v1","updated":"2023-07-18T07:20:31Z","published":"2023-07-18T07:20:31Z","title":"Face-PAST: Facial Pose Awareness and Style Transfer Networks","summary":" Facial style transfer has been quite popular among researchers due to the\nrise of emerging technologies such as eXtended Reality (XR), Metaverse, and\nNon-Fungible Tokens (NFTs). Furthermore, StyleGAN methods along with\ntransfer-learning strategies have reduced the problem of limited data to some\nextent. However, most of the StyleGAN methods overfit the styles while adding\nartifacts to facial images. In this paper, we propose a facial pose awareness\nand style transfer (Face-PAST) network that preserves facial details and\nstructures while generating high-quality stylized images. Dual StyleGAN\ninspires our work, but in contrast, our work uses a pre-trained style\ngeneration network in an external style pass with a residual modulation block\ninstead of a transform coding block. Furthermore, we use the gated mapping unit\nand facial structure, identity, and segmentation losses to preserve the facial\nstructure and details. This enables us to train the network with a very limited\namount of data while generating high-quality stylized images. Our training\nprocess adapts curriculum learning strategy to perform efficient and flexible\nstyle mixing in the generative space. 
We perform extensive experiments to show\nthe superiority of Face-PAST in comparison to existing state-of-the-art\nmethods.\n","authors":["Sunder Ali Khowaja","Ghulam Mujtaba","Jiseok Yoon","Ik Hyun Lee"],"pdf_url":"https://arxiv.org/pdf/2307.09020v1.pdf","comment":"20 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.09019v1","updated":"2023-07-18T07:15:26Z","published":"2023-07-18T07:15:26Z","title":"U-shaped Transformer: Retain High Frequency Context in Time Series\n Analysis","summary":" Time series prediction plays a crucial role in various industrial fields. In\nrecent years, neural networks with a transformer backbone have achieved\nremarkable success in many domains, including computer vision and NLP. In time\nseries analysis domain, some studies have suggested that even the simplest MLP\nnetworks outperform advanced transformer-based networks on time series forecast\ntasks. However, we believe these findings indicate there to be low-rank\nproperties in time series sequences. In this paper, we consider the low-pass\ncharacteristics of transformers and try to incorporate the advantages of MLP.\nWe adopt skip-layer connections inspired by Unet into traditional transformer\nbackbone, thus preserving high-frequency context from input to output, namely\nU-shaped Transformer. We introduce patch merge and split operation to extract\nfeatures with different scales and use larger datasets to fully make use of the\ntransformer backbone. Our experiments demonstrate that the model performs at an\nadvanced level across multiple datasets with relatively low cost.\n","authors":["Qingkui Chen","Yiqin Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08101v2","updated":"2023-07-18T07:00:03Z","published":"2023-04-17T09:22:05Z","title":"LLA-FLOW: A Lightweight Local Aggregation on Cost Volume for Optical\n Flow Estimation","summary":" Lack of texture often causes ambiguity in matching, and handling this issue\nis an important challenge in optical flow estimation. Some methods insert\nstacked transformer modules that allow the network to use global information of\ncost volume for estimation. But the global information aggregation often incurs\nserious memory and time costs during training and inference, which hinders\nmodel deployment. We draw inspiration from the traditional local region\nconstraint and design the local similarity aggregation (LSA) and the shifted\nlocal similarity aggregation (SLSA). The aggregation for cost volume is\nimplemented with lightweight modules that act on the feature maps. Experiments\non the final pass of Sintel show the lower cost required for our approach while\nmaintaining competitive performance.\n","authors":["Jiawei Xu","Zongqing Lu","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2304.08101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08397v2","updated":"2023-07-18T06:58:39Z","published":"2023-07-17T11:29:48Z","title":"CLIP-Guided StyleGAN Inversion for Text-Driven Real Image Editing","summary":" Researchers have recently begun exploring the use of StyleGAN-based models\nfor real image editing. One particularly interesting application is using\nnatural language descriptions to guide the editing process. Existing approaches\nfor editing images using language either resort to instance-level latent code\noptimization or map predefined text prompts to some editing directions in the\nlatent space. However, these approaches have inherent limitations. 
The former\nis not very efficient, while the latter often struggles to effectively handle\nmulti-attribute changes. To address these weaknesses, we present CLIPInverter,\na new text-driven image editing approach that is able to efficiently and\nreliably perform multi-attribute changes. The core of our method is the use of\nnovel, lightweight text-conditioned adapter layers integrated into pretrained\nGAN-inversion networks. We demonstrate that by conditioning the initial\ninversion step on the CLIP embedding of the target description, we are able to\nobtain more successful edit directions. Additionally, we use a CLIP-guided\nrefinement step to make corrections in the resulting residual latent codes,\nwhich further improves the alignment with the text prompt. Our method\noutperforms competing approaches in terms of manipulation accuracy and\nphoto-realism on various domains including human faces, cats, and birds, as\nshown by our qualitative and quantitative results.\n","authors":["Ahmet Canberk Baykal","Abdul Basit Anees","Duygu Ceylan","Erkut Erdem","Aykut Erdem","Deniz Yuret"],"pdf_url":"https://arxiv.org/pdf/2307.08397v2.pdf","comment":"Accepted for publication in ACM Transactions on Graphics"},{"id":"http://arxiv.org/abs/2207.03824v3","updated":"2023-07-18T06:57:23Z","published":"2022-07-08T11:05:35Z","title":"Boosting Zero-shot Learning via Contrastive Optimization of Attribute\n Representations","summary":" Zero-shot learning (ZSL) aims to recognize classes that do not have samples\nin the training set. One representative solution is to directly learn an\nembedding function associating visual features with corresponding class\nsemantics for recognizing new classes. Many methods extend upon this solution,\nand recent ones are especially keen on extracting rich features from images,\ne.g. attribute features. These attribute features are normally extracted within\neach individual image; however, the common traits for features across images\nyet belonging to the same attribute are not emphasized. In this paper, we\npropose a new framework to boost ZSL by explicitly learning attribute\nprototypes beyond images and contrastively optimizing them with attribute-level\nfeatures within images. Besides the novel architecture, two elements are\nhighlighted for attribute representations: a new prototype generation module is\ndesigned to generate attribute prototypes from attribute semantics; a hard\nexample-based contrastive optimization scheme is introduced to reinforce\nattribute-level features in the embedding space. We explore two alternative\nbackbones, CNN-based and transformer-based, to build our framework and conduct\nexperiments on three standard benchmarks, CUB, SUN, AwA2. Results on these\nbenchmarks demonstrate that our method improves the state of the art by a\nconsiderable margin. Our codes will be available at\nhttps://github.com/dyabel/CoAR-ZSL.git\n","authors":["Yu Du","Miaojing Shi","Fangyun Wei","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2207.03824v3.pdf","comment":"Accepted to TNNLS"},{"id":"http://arxiv.org/abs/2307.09008v1","updated":"2023-07-18T06:54:42Z","published":"2023-07-18T06:54:42Z","title":"Soft-IntroVAE for Continuous Latent space Image Super-Resolution","summary":" Continuous image super-resolution (SR) recently receives a lot of attention\nfrom researchers, for its practical and flexible image scaling for various\ndisplays. Local implicit image representation is one of the methods that can\nmap the coordinates and 2D features for latent space interpolation. 
Inspired by\nVariational AutoEncoder, we propose a Soft-introVAE for continuous latent space\nimage super-resolution (SVAE-SR). A novel latent space adversarial training is\nachieved for photo-realistic image restoration. To further improve the quality,\na positional encoding scheme is used to extend the original pixel coordinates\nby aggregating frequency information over the pixel areas. We show the\neffectiveness of the proposed SVAE-SR through quantitative and qualitative\ncomparisons, and further, illustrate its generalization in denoising and\nreal-image super-resolution.\n","authors":["Zhi-Song Liu","Zijia Wang","Zhen Jia"],"pdf_url":"https://arxiv.org/pdf/2307.09008v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.09005v1","updated":"2023-07-18T06:44:45Z","published":"2023-07-18T06:44:45Z","title":"Frequency-mixed Single-source Domain Generalization for Medical Image\n Segmentation","summary":" The annotation scarcity of medical image segmentation poses challenges in\ncollecting sufficient training data for deep learning models. Specifically,\nmodels trained on limited data may not generalize well to other unseen data\ndomains, resulting in a domain shift issue. Consequently, domain generalization\n(DG) is developed to boost the performance of segmentation models on unseen\ndomains. However, the DG setup requires multiple source domains, which impedes\nthe efficient deployment of segmentation algorithms in clinical scenarios. To\naddress this challenge and improve the segmentation model's generalizability,\nwe propose a novel approach called the Frequency-mixed Single-source Domain\nGeneralization method (FreeSDG). By analyzing the frequency's effect on domain\ndiscrepancy, FreeSDG leverages a mixed frequency spectrum to augment the\nsingle-source domain. Additionally, self-supervision is constructed in the\ndomain augmentation to learn robust context-aware representations for the\nsegmentation task. Experimental results on five datasets of three modalities\ndemonstrate the effectiveness of the proposed algorithm. FreeSDG outperforms\nstate-of-the-art methods and significantly improves the segmentation model's\ngeneralizability. Therefore, FreeSDG provides a promising solution for\nenhancing the generalization of medical image segmentation models, especially\nwhen annotated data is scarce. The code is available at\nhttps://github.com/liamheng/Non-IID_Medical_Image_Segmentation.\n","authors":["Heng Li","Haojin Li","Wei Zhao","Huazhu Fu","Xiuyun Su","Yan Hu","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09004v1","updated":"2023-07-18T06:44:20Z","published":"2023-07-18T06:44:20Z","title":"Ord2Seq: Regard Ordinal Regression as Label Sequence Prediction","summary":" Ordinal regression refers to classifying object instances into ordinal\ncategories. It has been widely studied in many scenarios, such as medical\ndisease grading, movie rating, etc. Known methods focused only on learning\ninter-class ordinal relationships, but still incur limitations in\ndistinguishing adjacent categories thus far. In this paper, we propose a simple\nsequence prediction framework for ordinal regression called Ord2Seq, which, for\nthe first time, transforms each ordinal category label into a special label\nsequence and thus regards an ordinal regression task as a sequence prediction\nprocess. 
In this way, we decompose an ordinal regression task into a series of\nrecursive binary classification steps, so as to subtly distinguish adjacent\ncategories. Comprehensive experiments show the effectiveness of distinguishing\nadjacent categories for performance improvement and our new approach exceeds\nstate-of-the-art performances in four different scenarios. Codes will be\navailable upon acceptance.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Tingting Chen","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09004v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.09000v1","updated":"2023-07-18T06:35:12Z","published":"2023-07-18T06:35:12Z","title":"TractCloud: Registration-free tractography parcellation with a novel\n local-global streamline point cloud representation","summary":" Diffusion MRI tractography parcellation classifies streamlines into\nanatomical fiber tracts to enable quantification and visualization for clinical\nand scientific applications. Current tractography parcellation methods rely\nheavily on registration, but registration inaccuracies can affect parcellation\nand the computational cost of registration is high for large-scale datasets.\nRecently, deep-learning-based methods have been proposed for tractography\nparcellation using various types of representations for streamlines. However,\nthese methods only focus on the information from a single streamline, ignoring\ngeometric relationships between the streamlines in the brain. We propose\nTractCloud, a registration-free framework that performs whole-brain\ntractography parcellation directly in individual subject space. We propose a\nnovel, learnable, local-global streamline representation that leverages\ninformation from neighboring and whole-brain streamlines to describe the local\nanatomy and global pose of the brain. We train our framework on a large-scale\nlabeled tractography dataset, which we augment by applying synthetic transforms\nincluding rotation, scaling, and translations. We test our framework on five\nindependently acquired datasets across populations and health conditions.\nTractCloud significantly outperforms several state-of-the-art methods on all\ntesting datasets. TractCloud achieves efficient and consistent whole-brain\nwhite matter parcellation across the lifespan (from neonates to elderly\nsubjects, including brain tumor patients) without the need for registration.\nThe robustness and high inference speed of TractCloud make it suitable for\nlarge-scale tractography data analysis. Our project page is available at\nhttps://tractcloud.github.io/.\n","authors":["Tengfei Xue","Yuqian Chen","Chaoyi Zhang","Alexandra J. Golby","Nikos Makris","Yogesh Rathi","Weidong Cai","Fan Zhang","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2307.09000v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08996v1","updated":"2023-07-18T06:31:01Z","published":"2023-07-18T06:31:01Z","title":"Towards Authentic Face Restoration with Iterative Diffusion Models and\n Beyond","summary":" An authentic face restoration system is becoming increasingly demanding in\nmany computer vision applications, e.g., image enhancement, video\ncommunication, and taking portrait. Most of the advanced face restoration\nmodels can recover high-quality faces from low-quality ones but usually fail to\nfaithfully generate realistic and high-frequency details that are favored by\nusers. 
To achieve authentic restoration, we propose $\\textbf{IDM}$, an\n$\\textbf{I}$teratively learned face restoration system based on denoising\n$\\textbf{D}$iffusion $\\textbf{M}$odels (DDMs). We define the criterion of an\nauthentic face restoration system, and argue that denoising diffusion models\nare naturally endowed with this property from two aspects: intrinsic iterative\nrefinement and extrinsic iterative enhancement. Intrinsic learning can preserve\nthe content well and gradually refine the high-quality details, while extrinsic\nenhancement helps clean the data and improve the restoration task one step\nfurther. We demonstrate superior performance on blind face restoration tasks.\nBeyond restoration, we find the authentically cleaned data by the proposed\nrestoration system is also helpful to image generation tasks in terms of\ntraining stabilization and sample quality. Without modifying the models, we\nachieve better quality than state-of-the-art on FFHQ and ImageNet generation\nusing either GANs or diffusion models.\n","authors":["Yang Zhao","Tingbo Hou","Yu-Chuan Su","Xuhui Jia. Yandong Li","Matthias Grundmann"],"pdf_url":"https://arxiv.org/pdf/2307.08996v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.08995v1","updated":"2023-07-18T06:27:44Z","published":"2023-07-18T06:27:44Z","title":"Revisiting Latent Space of GAN Inversion for Real Image Editing","summary":" The exploration of the latent space in StyleGANs and GAN inversion exemplify\nimpressive real-world image editing, yet the trade-off between reconstruction\nquality and editing quality remains an open problem. In this study, we revisit\nStyleGANs' hyperspherical prior $\\mathcal{Z}$ and combine it with highly\ncapable latent spaces to build combined spaces that faithfully invert real\nimages while maintaining the quality of edited images. More specifically, we\npropose $\\mathcal{F}/\\mathcal{Z}^{+}$ space consisting of two subspaces:\n$\\mathcal{F}$ space of an intermediate feature map of StyleGANs enabling\nfaithful reconstruction and $\\mathcal{Z}^{+}$ space of an extended StyleGAN\nprior supporting high editing quality. We project the real images into the\nproposed space to obtain the inverted codes, by which we then move along\n$\\mathcal{Z}^{+}$, enabling semantic editing without sacrificing image quality.\nComprehensive experiments show that $\\mathcal{Z}^{+}$ can replace the most\ncommonly-used $\\mathcal{W}$, $\\mathcal{W}^{+}$, and $\\mathcal{S}$ spaces while\npreserving reconstruction quality, resulting in reduced distortion of edited\nimages.\n","authors":["Kai Katsumata","Duc Minh Vo","Bei Liu","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2307.08995v1.pdf","comment":"10 pages, 12 figures. arXiv admin note: substantial text overlap with\n arXiv:2306.00241"},{"id":"http://arxiv.org/abs/2307.08994v1","updated":"2023-07-18T06:15:23Z","published":"2023-07-18T06:15:23Z","title":"Human Action Recognition in Still Images Using ConViT","summary":" Understanding the relationship between different parts of the image plays a\ncrucial role in many visual recognition tasks. Despite the fact that\nConvolutional Neural Networks (CNNs) have demonstrated impressive results in\ndetecting single objects, they lack the capability to extract the relationship\nbetween various regions of an image, which is a crucial factor in human action\nrecognition. To address this problem, this paper proposes a new module that\nfunctions like a convolutional layer using Vision Transformer (ViT). 
The\nproposed action recognition model comprises two components: the first part is a\ndeep convolutional network that extracts high-level spatial features from the\nimage, and the second component of the model utilizes a Vision Transformer that\nextracts the relationship between various regions of the image using the\nfeature map generated by the CNN output. The proposed model has been evaluated\non the Stanford40 and PASCAL VOC 2012 action datasets and has achieved 95.5%\nmAP and 91.5% mAP results, respectively, which are promising compared to other\nstate-of-the-art methods.\n","authors":["Seyed Rohollah Hosseyni","Hasan Taheri","Sanaz Seyedin","Ali Ahmad Rahmani"],"pdf_url":"https://arxiv.org/pdf/2307.08994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08992v1","updated":"2023-07-18T06:11:09Z","published":"2023-07-18T06:11:09Z","title":"Arbitrary point cloud upsampling via Dual Back-Projection Network","summary":" Point clouds acquired from 3D sensors are usually sparse and noisy. Point\ncloud upsampling is an approach to increase the density of the point cloud so\nthat detailed geometric information can be restored. In this paper, we propose\na Dual Back-Projection network for point cloud upsampling (DBPnet). A Dual\nBack-Projection is formulated in an up-down-up manner for point cloud\nupsampling. It not only back projects feature residues but also coordinates\nresidues so that the network better captures the point correlations in the\nfeature and space domains, achieving lower reconstruction errors on both\nuniform and non-uniform sparse point clouds. Our proposed method is also\ngeneralizable for arbitrary upsampling tasks (e.g. 4x, 5.5x). Experimental\nresults show that the proposed method achieves the lowest point set matching\nlosses with respect to the benchmark. In addition, the success of our approach\ndemonstrates that generative networks are not necessarily needed for\nnon-uniform point clouds.\n","authors":["Zhi-Song Liu","Zijia Wang","Zhen Jia"],"pdf_url":"https://arxiv.org/pdf/2307.08992v1.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.08991v1","updated":"2023-07-18T06:07:25Z","published":"2023-07-18T06:07:25Z","title":"EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized\n Maps","summary":" Accurate and reliable ego-localization is critical for autonomous driving. In\nthis paper, we present EgoVM, an end-to-end localization network that achieves\ncomparable localization accuracy to prior state-of-the-art methods, but uses\nlightweight vectorized maps instead of heavy point-based maps. To begin with,\nwe extract BEV features from online multi-view images and LiDAR point cloud.\nThen, we employ a set of learnable semantic embeddings to encode the semantic\ntypes of map elements and supervise them with semantic segmentation, to make\ntheir feature representation consistent with BEV features. After that, we feed\nmap queries, composed of learnable semantic embeddings and coordinates of map\nelements, into a transformer decoder to perform cross-modality matching with\nBEV features. Finally, we adopt a robust histogram-based pose solver to\nestimate the optimal pose by searching exhaustively over candidate poses. We\ncomprehensively validate the effectiveness of our method using both the\nnuScenes dataset and a newly collected dataset. 
The experimental results show\nthat our method achieves centimeter-level localization accuracy, and\noutperforms existing methods using vectorized maps by a large margin.\nFurthermore, our model has been extensively tested in a large fleet of\nautonomous vehicles under various challenging urban scenes.\n","authors":["Yuzhe He","Shuang Liang","Xiaofei Rui","Chengying Cai","Guowei Wan"],"pdf_url":"https://arxiv.org/pdf/2307.08991v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.08988v1","updated":"2023-07-18T05:59:27Z","published":"2023-07-18T05:59:27Z","title":"EVIL: Evidential Inference Learning for Trustworthy Semi-supervised\n Medical Image Segmentation","summary":" Recently, uncertainty-aware methods have attracted increasing attention in\nsemi-supervised medical image segmentation. However, current methods usually\nsuffer from the drawback that it is difficult to balance the computational\ncost, estimation accuracy, and theoretical support in a unified framework. To\nalleviate this problem, we introduce the Dempster-Shafer Theory of Evidence\n(DST) into semi-supervised medical image segmentation, dubbed Evidential\nInference Learning (EVIL). EVIL provides a theoretically guaranteed solution to\ninfer accurate uncertainty quantification in a single forward pass. Trustworthy\npseudo labels on unlabeled data are generated after uncertainty estimation. The\nrecently proposed consistency regularization-based training paradigm is adopted\nin our framework, which enforces the consistency on the perturbed predictions\nto enhance the generalization with few labeled data. Experimental results show\nthat EVIL achieves competitive performance in comparison with several\nstate-of-the-art methods on the public dataset.\n","authors":["Yingyu Chen","Ziyuan Yang","Chenyu Shen","Zhiwen Wang","Yang Qin","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08984v1","updated":"2023-07-18T05:42:01Z","published":"2023-07-18T05:42:01Z","title":"In Defense of Clip-based Video Relation Detection","summary":" Video Visual Relation Detection (VidVRD) aims to detect visual relationship\ntriplets in videos using spatial bounding boxes and temporal boundaries.\nExisting VidVRD methods can be broadly categorized into bottom-up and top-down\nparadigms, depending on their approach to classifying relations. Bottom-up\nmethods follow a clip-based approach where they classify relations of short\nclip tubelet pairs and then merge them into long video relations. On the other\nhand, top-down methods directly classify long video tubelet pairs. While recent\nvideo-based methods utilizing video tubelets have shown promising results, we\nargue that the effective modeling of spatial and temporal context plays a more\nsignificant role than the choice between clip tubelets and video tubelets. This\nmotivates us to revisit the clip-based paradigm and explore the key success\nfactors in VidVRD. In this paper, we propose a Hierarchical Context Model (HCM)\nthat enriches the object-based spatial context and relation-based temporal\ncontext based on clips. We demonstrate that using clip tubelets can achieve\nsuperior performance compared to most video-based methods. Additionally, using\nclip tubelets offers more flexibility in model designs and helps alleviate the\nlimitations associated with video tubelets, such as the challenging long-term\nobject tracking problem and the loss of temporal information in long-term\ntubelet feature compression. 
Extensive experiments conducted on two challenging\nVidVRD benchmarks validate that our HCM achieves a new state-of-the-art\nperformance, highlighting the effectiveness of incorporating advanced spatial\nand temporal context modeling within the clip-based paradigm.\n","authors":["Meng Wei","Long Chen","Wei Ji","Xiaoyu Yue","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2307.08984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08978v1","updated":"2023-07-18T05:22:25Z","published":"2023-07-18T05:22:25Z","title":"Learned Scalable Video Coding For Humans and Machines","summary":" Video coding has traditionally been developed to support services such as\nvideo streaming, videoconferencing, digital TV, and so on. The main intent was\nto enable human viewing of the encoded content. However, with the advances in\ndeep neural networks (DNNs), encoded video is increasingly being used for\nautomatic video analytics performed by machines. In applications such as\nautomatic traffic monitoring, analytics such as vehicle detection, tracking and\ncounting, would run continuously, while human viewing could be required\noccasionally to review potential incidents. To support such applications, a new\nparadigm for video coding is needed that will facilitate efficient\nrepresentation and compression of video for both machine and human use in a\nscalable manner. In this manuscript, we introduce the first end-to-end\nlearnable video codec that supports a machine vision task in its base layer,\nwhile its enhancement layer supports input reconstruction for human viewing.\nThe proposed system is constructed based on the concept of conditional coding\nto achieve better compression gains. Comprehensive experimental evaluations\nconducted on four standard video datasets demonstrate that our framework\noutperforms both state-of-the-art learned and conventional video codecs in its\nbase layer, while maintaining comparable performance on the human vision task\nin its enhancement layer. We will provide the implementation of the proposed\nsystem at www.github.com upon completion of the review process.\n","authors":["Hadi Hadizadeh","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2307.08978v1.pdf","comment":"14 pages, 16 figures"},{"id":"http://arxiv.org/abs/2305.16656v3","updated":"2023-07-18T04:23:26Z","published":"2023-05-26T05:58:14Z","title":"Clustering Method for Time-Series Images Using Quantum-Inspired\n Computing Technology","summary":" Time-series clustering serves as a powerful data mining technique for\ntime-series data in the absence of prior knowledge about clusters. A large\namount of time-series data with large size has been acquired and used in\nvarious research fields. Hence, clustering method with low computational cost\nis required. Given that a quantum-inspired computing technology, such as a\nsimulated annealing machine, surpasses conventional computers in terms of fast\nand accurately solving combinatorial optimization problems, it holds promise\nfor accomplishing clustering tasks that are challenging to achieve using\nexisting methods. This study proposes a novel time-series clustering method\nthat leverages an annealing machine. The proposed method facilitates an even\nclassification of time-series data into clusters close to each other while\nmaintaining robustness against outliers. Moreover, its applicability extends to\ntime-series images. We compared the proposed method with a standard existing\nmethod for clustering an online distributed dataset. 
In the existing method,\nthe distances between each data are calculated based on the Euclidean distance\nmetric, and the clustering is performed using the k-means++ method. We found\nthat both methods yielded comparable results. Furthermore, the proposed method\nwas applied to a flow measurement image dataset containing noticeable noise\nwith a signal-to-noise ratio of approximately 1. Despite a small signal\nvariation of approximately 2%, the proposed method effectively classified the\ndata without any overlap among the clusters. In contrast, the clustering\nresults by the standard existing method and the conditional image sampling\n(CIS) method, a specialized technique for flow measurement data, displayed\noverlapping clusters. Consequently, the proposed method provides better results\nthan the other two methods, demonstrating its potential as a superior\nclustering method.\n","authors":["Tomoki Inoue","Koyo Kubota","Tsubasa Ikami","Yasuhiro Egami","Hiroki Nagai","Takahiro Kashikawa","Koichi Kimura","Yu Matsuda"],"pdf_url":"https://arxiv.org/pdf/2305.16656v3.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2306.13074v2","updated":"2023-07-18T04:00:22Z","published":"2023-06-22T17:47:08Z","title":"Iterative Scale-Up ExpansionIoU and Deep Features Association for\n Multi-Object Tracking in Sports","summary":" Multi-object tracking algorithms have made significant advancements due to\nthe recent developments in object detection. However, most existing methods\nprimarily focus on tracking pedestrians or vehicles, which exhibit relatively\nsimple and regular motion patterns. Consequently, there is a scarcity of\nalgorithms that address the tracking of targets with irregular or non-linear\nmotion, such as multi-athlete tracking. Furthermore, popular tracking\nalgorithms often rely on the Kalman filter for object motion modeling, which\nfails to track objects when their motion contradicts the linear motion\nassumption of the Kalman filter. Due to this reason, we proposed a novel online\nand robust multi-object tracking approach, named Iterative Scale-Up\nExpansionIoU and Deep Features for multi-object tracking. Unlike conventional\nmethods, we abandon the use of the Kalman filter and propose utilizing the\niterative scale-up expansion IoU. This approach achieves superior tracking\nperformance without requiring additional training data or adopting a more\nrobust detector, all while maintaining a lower computational cost compared to\nother appearance-based methods. Our proposed method demonstrates remarkable\neffectiveness in tracking irregular motion objects, achieving a score of 76.9%\nin HOTA. It outperforms all state-of-the-art tracking algorithms on the\nSportsMOT dataset, covering various kinds of sport scenarios.\n","authors":["Hsiang-Wei Huang","Cheng-Yen Yang","Jiacheng Sun","Jenq-Neng Hwang","Chung-I Huang"],"pdf_url":"https://arxiv.org/pdf/2306.13074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08950v1","updated":"2023-07-18T03:37:10Z","published":"2023-07-18T03:37:10Z","title":"Deep Physics-Guided Unrolling Generalization for Compressed Sensing","summary":" By absorbing the merits of both the model- and data-driven methods, deep\nphysics-engaged learning scheme achieves high-accuracy and interpretable image\nreconstruction. It has attracted growing attention and become the mainstream\nfor inverse imaging tasks. 
Focusing on the image compressed sensing (CS)\nproblem, we find the intrinsic defect of this emerging paradigm, widely\nimplemented by deep algorithm-unrolled networks, in which more plain iterations\ninvolving real physics will bring enormous computation cost and long inference\ntime, hindering their practical application. A novel deep\n$\\textbf{P}$hysics-guided un$\\textbf{R}$olled recovery $\\textbf{L}$earning\n($\\textbf{PRL}$) framework is proposed by generalizing the traditional\niterative recovery model from image domain (ID) to the high-dimensional feature\ndomain (FD). A compact multiscale unrolling architecture is then developed to\nenhance the network capacity and keep real-time inference speeds. Taking two\ndifferent perspectives of optimization and range-nullspace decomposition,\ninstead of building an algorithm-specific unrolled network, we provide two\nimplementations: $\\textbf{PRL-PGD}$ and $\\textbf{PRL-RND}$. Experiments exhibit\nthe significant performance and efficiency leading of PRL networks over other\nstate-of-the-art methods with a large potential for further improvement and\nreal application to other inverse imaging problems or optimization models.\n","authors":["Bin Chen","Jiechong Song","Jingfen Xie","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08950v1.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV) 2023"},{"id":"http://arxiv.org/abs/2303.02885v2","updated":"2023-07-18T03:35:20Z","published":"2023-03-06T04:32:34Z","title":"Improving Transformer-based Image Matching by Cascaded Capturing\n Spatially Informative Keypoints","summary":" Learning robust local image feature matching is a fundamental low-level\nvision task, which has been widely explored in the past few years. Recently,\ndetector-free local feature matchers based on transformers have shown promising\nresults, which largely outperform pure Convolutional Neural Network (CNN) based\nones. But correlations produced by transformer-based methods are spatially\nlimited to the center of source views' coarse patches, because of the costly\nattention learning. In this work, we rethink this issue and find that such\nmatching formulation degrades pose estimation, especially for low-resolution\nimages. So we propose a transformer-based cascade matching model -- Cascade\nfeature Matching TRansformer (CasMTR), to efficiently learn dense feature\ncorrelations, which allows us to choose more reliable matching pairs for the\nrelative pose estimation. Instead of re-training a new detector, we use a\nsimple yet effective Non-Maximum Suppression (NMS) post-process to filter\nkeypoints through the confidence map, and largely improve the matching\nprecision. CasMTR achieves state-of-the-art performance in indoor and outdoor\npose estimation as well as visual localization. Moreover, thorough ablations\nshow the efficacy of the proposed components and techniques.\n","authors":["Chenjie Cao","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2303.02885v2.pdf","comment":"Accepted by ICCV2023, Codes will be released in\n https://github.com/ewrfcas/CasMTR"},{"id":"http://arxiv.org/abs/2303.02401v4","updated":"2023-07-18T03:21:11Z","published":"2023-03-04T12:26:47Z","title":"Open-Vocabulary Affordance Detection in 3D Point Clouds","summary":" Affordance detection is a challenging problem with a wide variety of robotic\napplications. 
Traditional affordance detection methods are limited to a\npredefined set of affordance labels, hence potentially restricting the\nadaptability of intelligent robots in complex and dynamic environments. In this\npaper, we present the Open-Vocabulary Affordance Detection (OpenAD) method,\nwhich is capable of detecting an unbounded number of affordances in 3D point\nclouds. By simultaneously learning the affordance text and the point feature,\nOpenAD successfully exploits the semantic relationships between affordances.\nTherefore, our proposed method enables zero-shot detection and is able to\ndetect previously unseen affordances without a single annotation example.\nIntensive experimental results show that OpenAD works effectively on a wide\nrange of affordance detection setups and outperforms other baselines by a large\nmargin. Additionally, we demonstrate the practicality of the proposed OpenAD in\nreal-world robotic applications with a fast inference speed (~100ms). Our\nproject is available at https://openad2023.github.io.\n","authors":["Toan Nguyen","Minh Nhat Vu","An Vuong","Dzung Nguyen","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2303.02401v4.pdf","comment":"Accepted to The 2023 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2023)"},{"id":"http://arxiv.org/abs/2307.08939v1","updated":"2023-07-18T03:12:03Z","published":"2023-07-18T03:12:03Z","title":"Experimental Security Analysis of DNN-based Adaptive Cruise Control\n under Context-Aware Perception Attacks","summary":" Adaptive Cruise Control (ACC) is a widely used driver assistance feature for\nmaintaining desired speed and safe distance to the leading vehicles. This paper\nevaluates the security of the deep neural network (DNN) based ACC systems under\nstealthy perception attacks that strategically inject perturbations into camera\ndata to cause forward collisions. We present a combined\nknowledge-and-data-driven approach to design a context-aware strategy for the\nselection of the most critical times for triggering the attacks and a novel\noptimization-based method for the adaptive generation of image perturbations at\nrun-time. We evaluate the effectiveness of the proposed attack using an actual\ndriving dataset and a realistic simulation platform with the control software\nfrom a production ACC system and a physical-world driving simulator while\nconsidering interventions by the driver and safety features such as Automatic\nEmergency Braking (AEB) and Forward Collision Warning (FCW). Experimental\nresults show that the proposed attack achieves 142.9x higher success rate in\ncausing accidents than random attacks and is mitigated 89.6% less by the safety\nfeatures while being stealthy and robust to real-world factors and dynamic\nchanges in the environment. 
This study provides insights into the role of human\noperators and basic safety interventions in preventing attacks.\n","authors":["Xugui Zhou","Anqi Chen","Maxfield Kouzel","Haotian Ren","Morgan McCarty","Cristina Nita-Rotaru","Homa Alemzadeh"],"pdf_url":"https://arxiv.org/pdf/2307.08939v1.pdf","comment":"18 pages, 14 figures, 8 tables"},{"id":"http://arxiv.org/abs/2303.15932v5","updated":"2023-07-18T03:06:31Z","published":"2023-03-28T12:42:12Z","title":"Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology\n Report Generation","summary":" Automatic radiology report generation has attracted enormous research\ninterest due to its practical value in reducing the workload of radiologists.\nHowever, simultaneously establishing global correspondences between the image\n(e.g., Chest X-ray) and its related report and local alignments between image\npatches and keywords remains challenging. To this end, we propose an Unify,\nAlign and then Refine (UAR) approach to learn multi-level cross-modal\nalignments and introduce three novel modules: Latent Space Unifier (LSU),\nCross-modal Representation Aligner (CRA) and Text-to-Image Refiner (TIR).\nSpecifically, LSU unifies multimodal data into discrete tokens, making it\nflexible to learn common knowledge among modalities with a shared network. The\nmodality-agnostic CRA learns discriminative features via a set of orthonormal\nbasis and a dual-gate mechanism first and then globally aligns visual and\ntextual representations under a triplet contrastive loss. TIR boosts\ntoken-level local alignment via calibrating text-to-image attention with a\nlearnable mask. Additionally, we design a two-stage training procedure to make\nUAR gradually grasp cross-modal alignments at different levels, which imitates\nradiologists' workflow: writing sentence by sentence first and then checking\nword by word. Extensive experiments and analyses on IU-Xray and MIMIC-CXR\nbenchmark datasets demonstrate the superiority of our UAR against varied\nstate-of-the-art methods.\n","authors":["Yaowei Li","Bang Yang","Xuxin Cheng","Zhihong Zhu","Hongxiang Li","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2303.15932v5.pdf","comment":"1)Reassessment of author contributions. 2)Try to solve the problem\n that Google Scholar does not display the all authors"},{"id":"http://arxiv.org/abs/2307.08930v1","updated":"2023-07-18T02:35:01Z","published":"2023-07-18T02:35:01Z","title":"Unsupervised Deep Graph Matching Based on Cycle Consistency","summary":" We contribute to the sparsely populated area of unsupervised deep graph\nmatching with application to keypoint matching in images. Contrary to the\nstandard \\emph{supervised} approach, our method does not require ground truth\ncorrespondences between keypoint pairs. Instead, it is self-supervised by\nenforcing consistency of matchings between images of the same object category.\nAs the matching and the consistency loss are discrete, their derivatives cannot\nbe straightforwardly used for learning. We address this issue in a principled\nway by building our method upon the recent results on black-box differentiation\nof combinatorial solvers. 
This makes our method exceptionally flexible, as it\nis compatible with arbitrary network architectures and combinatorial solvers.\nOur experimental evaluation suggests that our technique sets a new\nstate-of-the-art for unsupervised graph matching.\n","authors":["Siddharth Tourani","Carsten Rother","Muhammad Haris Khan","Bogdan Savchynskkyy"],"pdf_url":"https://arxiv.org/pdf/2307.08930v1.pdf","comment":"12 pages, 5 figures, 3 papers. ICCV 2023 reject"},{"id":"http://arxiv.org/abs/2307.07916v2","updated":"2023-07-18T02:26:30Z","published":"2023-07-16T01:45:00Z","title":"On the Robustness of Split Learning against Adversarial Attacks","summary":" Split learning enables collaborative deep learning model training while\npreserving data privacy and model security by avoiding direct sharing of raw\ndata and model details (i.e., server and clients only hold partial sub-networks\nand exchange intermediate computations). However, existing research has mainly\nfocused on examining its reliability for privacy protection, with little\ninvestigation into model security. Specifically, by exploring full models,\nattackers can launch adversarial attacks, and split learning can mitigate this\nsevere threat by only disclosing part of models to untrusted servers. This paper\naims to evaluate the robustness of split learning against adversarial attacks,\nparticularly in the most challenging setting where untrusted servers only have\naccess to the intermediate layers of the model. Existing adversarial attacks\nmostly focus on the centralized setting instead of the collaborative setting,\nthus, to better evaluate the robustness of split learning, we develop a\ntailored attack called SPADV, which comprises two stages: 1) shadow model\ntraining that addresses the issue of lacking part of the model and 2) local\nadversarial attack that produces adversarial examples to evaluate. The first\nstage only requires a few unlabeled non-IID data, and, in the second stage,\nSPADV perturbs the intermediate output of natural samples to craft the\nadversarial ones. The overall cost of the proposed attack process is relatively\nlow, yet the empirical attack effectiveness is significantly high,\ndemonstrating the surprising vulnerability of split learning to adversarial\nattacks.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Wenmeng Zhou","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.07916v2.pdf","comment":"accepted by ECAI 2023, camera-ready version"},{"id":"http://arxiv.org/abs/2306.07615v5","updated":"2023-07-18T02:15:43Z","published":"2023-06-13T08:19:14Z","title":"UOD: Universal One-shot Detection of Anatomical Landmarks","summary":" One-shot medical landmark detection gains much attention and achieves great\nsuccess for its label-efficient training process. However, existing one-shot\nlearning methods are highly specialized in a single domain and suffer domain\npreference heavily in the situation of multi-domain unlabeled data. Moreover,\none-shot learning is not robust in that it faces a performance drop when annotating\na sub-optimal image. To tackle these issues, we resort to developing a\ndomain-adaptive one-shot landmark detection framework for handling multi-domain\nmedical images, named Universal One-shot Detection (UOD). UOD consists of two\nstages and two corresponding universal models which are designed as\ncombinations of domain-specific modules and domain-shared modules. In the first\nstage, a domain-adaptive convolution model is self-supervised learned to\ngenerate pseudo landmark labels. 
In the second stage, we design a\ndomain-adaptive transformer to eliminate domain preference and build the global\ncontext for multi-domain data. Even though only one annotated sample from each\ndomain is available for training, the domain-shared modules help UOD aggregate\nall one-shot samples to detect more robust and accurate landmarks. We\ninvestigated both qualitatively and quantitatively the proposed UOD on three\nwidely-used public X-ray datasets in different anatomical domains (i.e., head,\nhand, chest) and obtained state-of-the-art performances in each domain. The\ncode is available at\nhttps://github.com/heqin-zhu/UOD_universal_oneshot_detection.\n","authors":["Heqin Zhu","Quan Quan","Qingsong Yao","Zaiyi Liu","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.07615v5.pdf","comment":"Eealy accepted by MICCAI 2023. 11pages, 4 figures, 2 tables. arXiv\n admin note: text overlap with arXiv:2203.06433"},{"id":"http://arxiv.org/abs/2307.08924v1","updated":"2023-07-18T01:53:18Z","published":"2023-07-18T01:53:18Z","title":"Learning to Sample Tasks for Meta Learning","summary":" Through experiments on various meta-learning methods, task samplers, and\nfew-shot learning tasks, this paper arrives at three conclusions. Firstly,\nthere are no universal task sampling strategies to guarantee the performance of\nmeta-learning models. Secondly, task diversity can cause the models to either\nunderfit or overfit during training. Lastly, the generalization performance of\nthe models are influenced by task divergence, task entropy, and task\ndifficulty. In response to these findings, we propose a novel task sampler\ncalled Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes\ntask divergence, task entropy, and task difficulty to sample tasks. To optimize\nASr, we rethink and propose a simple and general meta-learning algorithm.\nFinally, a large number of empirical experiments demonstrate the effectiveness\nof the proposed ASr.\n","authors":["Jingyao Wang","Zeen Song","Xingzhe Su","Lingyu Si","Hongwei Dong","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08924v1.pdf","comment":"10 pages, 7 tables, 3 figures"},{"id":"http://arxiv.org/abs/2209.15304v3","updated":"2023-07-18T01:40:57Z","published":"2022-09-30T08:23:26Z","title":"Hiding Visual Information via Obfuscating Adversarial Perturbations","summary":" Growing leakage and misuse of visual information raise security and privacy\nconcerns, which promotes the development of information protection. Existing\nadversarial perturbations-based methods mainly focus on the de-identification\nagainst deep learning models. However, the inherent visual information of the\ndata has not been well protected. In this work, inspired by the Type-I\nadversarial attack, we propose an adversarial visual information hiding method\nto protect the visual privacy of data. Specifically, the method generates\nobfuscating adversarial perturbations to obscure the visual information of the\ndata. Meanwhile, it maintains the hidden objectives to be correctly predicted\nby models. In addition, our method does not modify the parameters of the\napplied model, which makes it flexible for different scenarios. Experimental\nresults on the recognition and classification tasks demonstrate that the\nproposed method can effectively hide visual information and hardly affect the\nperformances of models. 
The code is available in the supplementary material.\n","authors":["Zhigang Su","Dawei Zhou","Decheng Liu","Nannan Wang","Zhen Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2209.15304v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08919v1","updated":"2023-07-18T01:31:47Z","published":"2023-07-18T01:31:47Z","title":"Accuracy versus time frontiers of semi-supervised and self-supervised\n learning on medical images","summary":" For many applications of classifiers to medical images, a trustworthy label\nfor each image can be difficult or expensive to obtain. In contrast, images\nwithout labels are more readily available. Two major research directions both\npromise that additional unlabeled data can improve classifier performance:\nself-supervised learning pretrains useful representations on unlabeled data\nonly, then fine-tunes a classifier on these representations via the labeled\nset; semi-supervised learning directly trains a classifier on labeled and\nunlabeled data simultaneously. Recent methods from both directions have claimed\nsignificant gains on non-medical tasks, but do not systematically assess\nmedical images and mostly compare only to methods in the same direction. This\nstudy contributes a carefully-designed benchmark to help answer a\npractitioner's key question: given a small labeled dataset and a limited budget\nof hours to spend on training, what gains from additional unlabeled images are\npossible and which methods best achieve them? Unlike previous benchmarks, ours\nuses realistic-sized validation sets to select hyperparameters, assesses\nruntime-performance tradeoffs, and bridges two research fields. By comparing 6\nsemi-supervised methods and 5 self-supervised methods to strong labeled-only\nbaselines on 3 medical datasets with 30-1000 labels per class, we offer\ninsights to resource-constrained, results-focused practitioners: MixMatch,\nSimCLR, and BYOL represent strong choices that were not surpassed by more\nrecent methods. After much effort selecting hyperparameters on one dataset, we\npublish settings that enable strong methods to perform well on new medical\ntasks within a few hours, with further search over dozens of hours delivering\nmodest additional gains.\n","authors":["Zhe Huang","Ruijie Jiang","Shuchin Aeron","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2307.08919v1.pdf","comment":"Semi-supervised Learning; Self-supervised Learning; Medical Imaging"},{"id":"http://arxiv.org/abs/2307.08913v1","updated":"2023-07-18T01:16:23Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. 
Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approach. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.08908v1","updated":"2023-07-18T00:48:56Z","published":"2023-07-18T00:48:56Z","title":"What Can Simple Arithmetic Operations Do for Temporal Modeling?","summary":" Temporal modeling plays a crucial role in understanding video content. To\ntackle this problem, previous studies built complicated temporal relations\nthrough time sequence thanks to the development of computationally powerful\ndevices. In this work, we explore the potential of four simple arithmetic\noperations for temporal modeling. Specifically, we first capture auxiliary\ntemporal cues by computing addition, subtraction, multiplication, and division\nbetween pairs of extracted frame features. Then, we extract corresponding\nfeatures from these cues to benefit the original temporal-irrespective domain.\nWe term such a simple pipeline the Arithmetic Temporal Module (ATM), which\noperates on the stem of a visual backbone in a plug-and-play style. We conduct\ncomprehensive ablation studies on the instantiation of ATMs and demonstrate\nthat this module provides powerful temporal modeling capability at a low\ncomputational cost. Moreover, the ATM is compatible with both CNN- and\nViT-based architectures. Our results show that ATM achieves superior\nperformance on several popular video benchmarks. Specifically, on\nSomething-Something V1, V2 and Kinetics-400, we reach top-1 accuracy of 65.6%,\n74.6%, and 89.4%, respectively. The code is available at\nhttps://github.com/whwu95/ATM.\n","authors":["Wenhao Wu","Yuxin Song","Zhun Sun","Jingdong Wang","Chang Xu","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2307.08908v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09676v1","updated":"2023-07-18T23:06:47Z","published":"2023-07-18T23:06:47Z","title":"Domain Adaptation for Enhanced Object Detection in Foggy and Rainy\n Weather for Autonomous Driving","summary":" Most object detection models for autonomous driving may experience a\nsignificant drop in performance when deployed in real-world applications, due\nto the well-known domain shift issue. Supervised object detection methods for\nautonomous driving usually assume a consistent feature distribution between\ntraining and testing data; however, such assumptions may not always hold\nwhen weather conditions differ significantly. For example, an object detection\nmodel trained under clear weather may not perform well in foggy or rainy\nweather, due to the domain gap. Overcoming detection bottlenecks in foggy or\nrainy weather scenarios is a significant challenge for autonomous vehicles\ndeployed in the wild. To address the domain gap in different weather\nconditions, this paper proposes a novel domain adaptive object detection\nframework for autonomous driving in foggy and rainy weather. 
Our method\nleverages both image-level and object-level adaptation to reduce the domain\ndiscrepancy in image style and object appearance. Additionally, to enhance the\nmodel's performance on challenging samples, we introduce a new adversarial\ngradient reversal layer that performs adversarial mining on hard examples\nalongside domain adaptation. Moreover, we propose to generate an auxiliary\ndomain by data augmentation to enforce a new domain-level metric\nregularization. Experimental results on public benchmarks demonstrate that\nobject detection performance is significantly improved when using our proposed\nmethod in domain shift scenarios for autonomous driving applications.\n","authors":["Jinlong Li","Runsheng Xu","Jin Ma","Qin Zou","Jiaqi Ma","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07935v2","updated":"2023-07-18T22:33:55Z","published":"2023-07-16T03:54:10Z","title":"S2R-ViT for Multi-Agent Cooperative Perception: Bridging the Gap from\n Simulation to Reality","summary":" Due to the lack of real multi-agent data and the time-consuming nature of labeling,\nexisting multi-agent cooperative perception algorithms usually select\nsimulated sensor data for training and validation. However, the perception\nperformance is degraded when these simulation-trained models are deployed to\nthe real world, due to the significant domain gap between the simulated and\nreal data. In this paper, we propose the first Simulation-to-Reality transfer\nlearning framework for multi-agent cooperative perception using a novel Vision\nTransformer, named S2R-ViT, which considers both the Implementation Gap and\nFeature Gap between simulated and real data. We investigate the effects of\nthese two types of domain gaps and propose a novel uncertainty-aware vision\ntransformer to effectively relieve the Implementation Gap and an agent-based\nfeature adaptation module with inter-agent and ego-agent discriminators to\nreduce the Feature Gap. Our intensive experiments on the public multi-agent\ncooperative perception datasets OPV2V and V2V4Real demonstrate that the\nproposed S2R-ViT can effectively bridge the gap from simulation to reality and\noutperform other methods significantly for point cloud-based 3D object\ndetection.\n","authors":["Jinlong Li","Runsheng Xu","Xinyu Liu","Baolu Li","Qin Zou","Jiaqi Ma","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2307.07935v2.pdf","comment":"correct the compile error in Fig.5"},{"id":"http://arxiv.org/abs/2307.09662v1","updated":"2023-07-18T22:04:41Z","published":"2023-07-18T22:04:41Z","title":"Object-aware Gaze Target Detection","summary":" Gaze target detection aims to predict the image location where the person is\nlooking and the probability that a gaze is out of the scene. Several works have\ntackled this task by regressing a gaze heatmap centered on the gaze location;\nhowever, they overlooked decoding the relationship between the people and the\ngazed objects. This paper proposes a Transformer-based architecture that\nautomatically detects objects (including heads) in the scene to build\nassociations between every head and the gazed-head/object, resulting in a\ncomprehensive, explainable gaze analysis composed of: gaze target area, gaze\npixel point, the class and the image location of the gazed-object. 
Upon\nevaluation of the in-the-wild benchmarks, our method achieves state-of-the-art\nresults on all metrics (up to 2.91% gain in AUC, 50% reduction in gaze\ndistance, and 9% gain in out-of-frame average precision) for gaze target\ndetection and 11-13% improvement in average precision for the classification\nand the localization of the gazed-objects. The code of the proposed method is\navailable https://github.com/francescotonini/object-aware-gaze-target-detection\n","authors":["Francesco Tonini","Nicola Dall'Asen","Cigdem Beyan","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2307.09662v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2210.03189v2","updated":"2023-07-18T21:44:47Z","published":"2022-10-06T20:06:24Z","title":"FocalUNETR: A Focal Transformer for Boundary-aware Segmentation of CT\n Images","summary":" Computed Tomography (CT) based precise prostate segmentation for treatment\nplanning is challenging due to (1) the unclear boundary of the prostate derived\nfrom CT's poor soft tissue contrast and (2) the limitation of convolutional\nneural network-based models in capturing long-range global context. Here we\npropose a novel focal transformer-based image segmentation architecture to\neffectively and efficiently extract local visual features and global context\nfrom CT images. Additionally, we design an auxiliary boundary-induced label\nregression task coupled with the main prostate segmentation task to address the\nunclear boundary issue in CT images. We demonstrate that this design\nsignificantly improves the quality of the CT-based prostate segmentation task\nover other competing methods, resulting in substantially improved performance,\ni.e., higher Dice Similarity Coefficient, lower Hausdorff Distance, and Average\nSymmetric Surface Distance, on both private and public CT image datasets. Our\ncode is available at this\n\\href{https://github.com/ChengyinLee/FocalUNETR.git}{link}.\n","authors":["Chengyin Li","Yao Qiang","Rafi Ibn Sultan","Hassan Bagher-Ebadian","Prashant Khanduri","Indrin J. Chetty","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.03189v2.pdf","comment":"13 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.05151v2","updated":"2023-07-18T21:40:51Z","published":"2023-07-11T10:14:41Z","title":"ExFaceGAN: Exploring Identity Directions in GAN's Learned Latent Space\n for Synthetic Identity Generation","summary":" Deep generative models have recently presented impressive results in\ngenerating realistic face images of random synthetic identities.\n To generate multiple samples of a certain synthetic identity, previous works\nproposed to disentangle the latent space of GANs by incorporating additional\nsupervision or regularization, enabling the manipulation of certain attributes.\nOthers proposed to disentangle specific factors in unconditional pretrained\nGANs latent spaces to control their output, which also requires supervision by\nattribute classifiers. Moreover, these attributes are entangled in GAN's latent\nspace, making it difficult to manipulate them without affecting the identity\ninformation. We propose in this work a framework, ExFaceGAN, to disentangle\nidentity information in pretrained GANs latent spaces, enabling the generation\nof multiple samples of any synthetic identity. 
Given a reference latent code of\nany synthetic image and latent space of pretrained GAN, our ExFaceGAN learns an\nidentity directional boundary that disentangles the latent space into two\nsub-spaces, with latent codes of samples that are either identity similar or\ndissimilar to a reference image. By sampling from each side of the boundary,\nour ExFaceGAN can generate multiple samples of synthetic identity without the\nneed for designing a dedicated architecture or supervision from attribute\nclassifiers. We demonstrate the generalizability and effectiveness of ExFaceGAN\nby integrating it into learned latent spaces of three SOTA GAN approaches. As\nan example of the practical benefit of our ExFaceGAN, we empirically prove that\ndata generated by ExFaceGAN can be successfully used to train face recognition\nmodels (\\url{https://github.com/fdbtrs/ExFaceGAN}).\n","authors":["Fadi Boutros","Marcel Klemt","Meiling Fang","Arjan Kuijper","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2307.05151v2.pdf","comment":"Accepted at IJCB 2023"},{"id":"http://arxiv.org/abs/2110.04830v2","updated":"2023-07-18T21:13:25Z","published":"2021-10-10T15:52:38Z","title":"MARVEL: Raster Manga Vectorization via Primitive-wise Deep Reinforcement\n Learning","summary":" Manga is a fashionable Japanese-style comic form that is composed of\nblack-and-white strokes and is generally displayed as raster images on digital\ndevices. Typical mangas have simple textures, wide lines, and few color\ngradients, which are vectorizable natures to enjoy the merits of vector\ngraphics, e.g., adaptive resolutions and small file sizes. In this paper, we\npropose MARVEL (MAnga's Raster to VEctor Learning), a primitive-wise approach\nfor vectorizing raster mangas by Deep Reinforcement Learning (DRL). Unlike\nprevious learning-based methods which predict vector parameters for an entire\nimage, MARVEL introduces a new perspective that regards an entire manga as a\ncollection of basic primitives\\textemdash stroke lines, and designs a DRL model\nto decompose the target image into a primitive sequence for achieving accurate\nvectorization. To improve vectorization accuracies and decrease file sizes, we\nfurther propose a stroke accuracy reward to predict accurate stroke lines, and\na pruning mechanism to avoid generating erroneous and repeated strokes.\nExtensive subjective and objective experiments show that our MARVEL can\ngenerate impressive results and reaches the state-of-the-art level. Our code is\nopen-source at: https://github.com/SwordHolderSH/Mang2Vec.\n","authors":["Hao Su","Jianwei Niu","Xuefeng Liu","Jiahe Cui","Ji Wan"],"pdf_url":"https://arxiv.org/pdf/2110.04830v2.pdf","comment":"The name of the previous version paper was: Mang2Vec: Vectorization\n of raster manga by deep reinforcement learning"},{"id":"http://arxiv.org/abs/2307.09642v1","updated":"2023-07-18T21:10:59Z","published":"2023-07-18T21:10:59Z","title":"Skin Lesion Correspondence Localization in Total Body Photography","summary":" Longitudinal tracking of skin lesions - finding correspondence, changes in\nmorphology, and texture - is beneficial to the early detection of melanoma.\nHowever, it has not been well investigated in the context of full-body imaging.\nWe propose a novel framework combining geometric and texture information to\nlocalize skin lesion correspondence from a source scan to a target scan in\ntotal body photography (TBP). Body landmarks or sparse correspondence are first\ncreated on the source and target 3D textured meshes. 
Every vertex on each of\nthe meshes is then mapped to a feature vector characterizing the geodesic\ndistances to the landmarks on that mesh. Then, for each lesion of interest\n(LOI) on the source, its corresponding location on the target is first coarsely\nestimated using the geometric information encoded in the feature vectors and\nthen refined using the texture information. We evaluated the framework\nquantitatively on both a public and a private dataset, for which our success\nrates (at 10 mm criterion) are comparable to the only reported longitudinal\nstudy. As full-body 3D capture becomes more prevalent and has higher quality,\nwe expect the proposed method to constitute a valuable step in the longitudinal\ntracking of skin lesions.\n","authors":["Wei-Lun Huang","Davood Tashayyod","Jun Kang","Amir Gandjbakhche","Michael Kazhdan","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2307.09642v1.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2307.09636v1","updated":"2023-07-18T20:56:41Z","published":"2023-07-18T20:56:41Z","title":"Traffic-Domain Video Question Answering with Automatic Captioning","summary":" Video Question Answering (VidQA) exhibits remarkable potential in\nfacilitating advanced machine reasoning capabilities within the domains of\nIntelligent Traffic Monitoring and Intelligent Transportation Systems.\nNevertheless, the integration of urban traffic scene knowledge into VidQA\nsystems has received limited attention in previous research endeavors. In this\nwork, we present a novel approach termed Traffic-domain Video Question\nAnswering with Automatic Captioning (TRIVIA), which serves as a\nweak-supervision technique for infusing traffic-domain knowledge into large\nvideo-language models. Empirical findings obtained from the SUTD-TrafficQA task\nhighlight the substantial enhancements achieved by TRIVIA, elevating the\naccuracy of representative video-language models by a remarkable 6.5 points\n(19.88%) compared to baseline settings. This pioneering methodology holds great\npromise for driving advancements in the field, inspiring researchers and\npractitioners alike to unlock the full potential of emerging video-language\nmodels in traffic-related applications.\n","authors":["Ehsan Qasemi","Jonathan M. Francis","Alessandro Oltramari"],"pdf_url":"https://arxiv.org/pdf/2307.09636v1.pdf","comment":"Accepted in ITSC2023"},{"id":"http://arxiv.org/abs/2302.02330v2","updated":"2023-07-18T20:54:24Z","published":"2023-02-05T07:50:46Z","title":"CIPER: Combining Invariant and Equivariant Representations Using\n Contrastive and Predictive Learning","summary":" Self-supervised representation learning (SSRL) methods have shown great\nsuccess in computer vision. In recent studies, augmentation-based contrastive\nlearning methods have been proposed for learning representations that are\ninvariant or equivariant to pre-defined data augmentation operations. However,\ninvariant or equivariant features favor only specific downstream tasks\ndepending on the augmentations chosen. They may result in poor performance when\nthe learned representation does not match task requirements. Here, we consider\nan active observer that can manipulate views of an object and has knowledge of\nthe action(s) that generated each view. We introduce Contrastive Invariant and\nPredictive Equivariant Representation learning (CIPER). CIPER comprises both\ninvariant and equivariant learning objectives using one shared encoder and two\ndifferent output heads on top of the encoder. 
One output head is a projection\nhead with a state-of-the-art contrastive objective to encourage invariance to\naugmentations. The other is a prediction head estimating the augmentation\nparameters, capturing equivariant features. Both heads are discarded after\ntraining and only the encoder is used for downstream tasks. We evaluate our\nmethod on static image tasks and time-augmented image datasets. Our results\nshow that CIPER outperforms a baseline contrastive method on various tasks.\nInterestingly, CIPER encourages the formation of hierarchically structured\nrepresentations where different views of an object become systematically\norganized in the latent representation space.\n","authors":["Xia Xu","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2302.02330v2.pdf","comment":"12 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2306.09618v2","updated":"2023-07-18T20:39:43Z","published":"2023-06-16T04:18:04Z","title":"Emergent Asymmetry of Precision and Recall for Measuring Fidelity and\n Diversity of Generative Models in High Dimensions","summary":" Precision and Recall are two prominent metrics of generative performance,\nwhich were proposed to separately measure the fidelity and diversity of\ngenerative models. Given their central role in comparing and improving\ngenerative models, understanding their limitations is crucially important. To\nthat end, in this work, we identify a critical flaw in the common approximation\nof these metrics using k-nearest-neighbors, namely, that the very\ninterpretations of fidelity and diversity that are assigned to Precision and\nRecall can fail in high dimensions, resulting in very misleading conclusions.\nSpecifically, we empirically and theoretically show that as the number of\ndimensions grows, two model distributions with supports at equal point-wise\ndistance from the support of the real distribution can have vastly different\nPrecision and Recall regardless of their respective distributions, hence an\nemergent asymmetry in high dimensions. Based on our theoretical insights, we\nthen provide simple yet effective modifications to these metrics to construct\nsymmetric metrics regardless of the number of dimensions. Finally, we provide\nexperiments on real-world datasets to illustrate that the identified flaw is\nnot merely a pathological case, and that our proposed metrics are effective in\nalleviating its impact.\n","authors":["Mahyar Khayatkhoei","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2306.09618v2.pdf","comment":"To appear in ICML 2023. Updated proof in Appendix B"},{"id":"http://arxiv.org/abs/2307.09621v1","updated":"2023-07-18T20:28:31Z","published":"2023-07-18T20:28:31Z","title":"Conditional 360-degree Image Synthesis for Immersive Indoor Scene\n Decoration","summary":" In this paper, we address the problem of conditional scene decoration for\n360-degree images. Our method takes a 360-degree background photograph of an\nindoor scene and generates decorated images of the same scene in the panorama\nview. To do this, we develop a 360-aware object layout generator that learns\nlatent object vectors in the 360-degree view to enable a variety of furniture\narrangements for an input 360-degree background image. We use this object\nlayout to condition a generative adversarial network to synthesize images of an\ninput scene. 
To further reinforce the generation capability of our model, we\ndevelop a simple yet effective scene emptier that removes the generated\nfurniture and produces an emptied scene for our model to learn a cyclic\nconstraint. We train the model on the Structure3D dataset and show that our\nmodel can generate diverse decorations with controllable object layout. Our\nmethod achieves state-of-the-art performance on the Structure3D dataset and\ngeneralizes well to the Zillow indoor scene dataset. Our user study confirms\nthe immersive experiences provided by the realistic image quality and furniture\nlayout in our generation results. Our implementation will be made available.\n","authors":["Ka Chun Shum","Hong-Wing Pang","Binh-Son Hua","Duc Thanh Nguyen","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2307.09621v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09591v1","updated":"2023-07-18T19:56:20Z","published":"2023-07-18T19:56:20Z","title":"Gradient strikes back: How filtering out high frequencies improves\n explanations","summary":" Recent years have witnessed an explosion in the development of novel\nprediction-based attribution methods, which have slowly been supplanting older\ngradient-based methods to explain the decisions of deep neural networks.\nHowever, it is still not clear why prediction-based methods outperform\ngradient-based ones. Here, we start with an empirical observation: these two\napproaches yield attribution maps with very different power spectra, with\ngradient-based methods revealing more high-frequency content than\nprediction-based methods. This observation raises multiple questions: What is\nthe source of this high-frequency information, and does it truly reflect\ndecisions made by the system? Lastly, why would the absence of high-frequency\ninformation in prediction-based methods yield better explainability scores\nalong multiple metrics? We analyze the gradient of three representative visual\nclassification models and observe that it contains noisy information emanating\nfrom high-frequencies. Furthermore, our analysis reveals that the operations\nused in Convolutional Neural Networks (CNNs) for downsampling appear to be a\nsignificant source of this high-frequency content -- suggesting aliasing as a\npossible underlying basis. We then apply an optimal low-pass filter for\nattribution maps and demonstrate that it improves gradient-based attribution\nmethods. We show that (i) removing high-frequency noise yields significant\nimprovements in the explainability scores obtained with gradient-based methods\nacross multiple models -- leading to (ii) a novel ranking of state-of-the-art\nmethods with gradient-based methods at the top. We believe that our results\nwill spur renewed interest in simpler and computationally more efficient\ngradient-based methods for explainability.\n","authors":["Sabine Muzellec","Leo Andeol","Thomas Fel","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09588v1","updated":"2023-07-18T19:51:28Z","published":"2023-07-18T19:51:28Z","title":"Automating Wood Species Detection and Classification in Microscopic\n Images of Fibrous Materials with Deep Learning","summary":" We have developed a methodology for the systematic generation of a large\nimage dataset of macerated wood references, which we used to generate image\ndata for nine hardwood genera. 
This is the basis for a substantial approach to\nautomate, for the first time, the identification of hardwood species in\nmicroscopic images of fibrous materials by deep learning. Our methodology\nincludes a flexible pipeline for easy annotation of vessel elements. We compare\nthe performance of different neural network architectures and hyperparameters.\nOur proposed method performs similarly well to human experts. In the future,\nthis will improve controls on global wood fiber product flows to protect\nforests.\n","authors":["Lars Nieradzik","Jördis Sieburg-Rockel","Stephanie Helmling","Janis Keuper","Thomas Weibel","Andrea Olbrich","Henrike Stephani"],"pdf_url":"https://arxiv.org/pdf/2307.09588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09548v1","updated":"2023-07-18T18:47:48Z","published":"2023-07-18T18:47:48Z","title":"Surgical Action Triplet Detection by Mixed Supervised Learning of\n Instrument-Tissue Interactions","summary":" Surgical action triplets describe instrument-tissue interactions as\n(instrument, verb, target) combinations, thereby supporting a detailed analysis\nof surgical scene activities and workflow. This work focuses on surgical action\ntriplet detection, which is challenging but more precise than the traditional\ntriplet recognition task as it consists of joint (1) localization of surgical\ninstruments and (2) recognition of the surgical action triplet associated with\nevery localized instrument. Triplet detection is highly complex due to the lack\nof spatial triplet annotation. We analyze how the amount of instrument spatial\nannotations affects triplet detection and observe that accurate instrument\nlocalization does not guarantee better triplet detection due to the risk of\nerroneous associations with the verbs and targets. To solve the two tasks, we\npropose MCIT-IG, a two-stage network, that stands for Multi-Class\nInstrument-aware Transformer-Interaction Graph. The MCIT stage of our network\nmodels per class embedding of the targets as additional features to reduce the\nrisk of misassociating triplets. Furthermore, the IG stage constructs a\nbipartite dynamic graph to model the interaction between the instruments and\ntargets, cast as the verbs. We utilize a mixed-supervised learning strategy\nthat combines weak target presence labels for MCIT and pseudo triplet labels\nfor IG to train our network. We observed that complementing minimal instrument\nspatial annotations with target embeddings results in better triplet detection.\nWe evaluate our model on the CholecT50 dataset and show improved performance on\nboth instrument localization and triplet detection, topping the leaderboard of\nthe CholecTriplet challenge in MICCAI 2022.\n","authors":["Saurav Sharma","Chinedu Innocent Nwoye","Didier Mutter","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2307.09548v1.pdf","comment":"Accepted at MICCAI, 2023. Project Page:\n https://github.com/CAMMA-public/mcit-ig"},{"id":"http://arxiv.org/abs/2307.09542v1","updated":"2023-07-18T18:36:29Z","published":"2023-07-18T18:36:29Z","title":"Can Neural Network Memorization Be Localized?","summary":" Recent efforts at explaining the interplay of memorization and generalization\nin deep overparametrized networks have posited that neural networks\n$\\textit{memorize}$ \"hard\" examples in the final few layers of the model.\nMemorization refers to the ability to correctly predict on $\\textit{atypical}$\nexamples of the training set. 
In this work, we show that rather than being\nconfined to individual layers, memorization is a phenomenon confined to a small\nset of neurons in various layers of the model. First, via three experimental\nsources of converging evidence, we find that most layers are redundant for the\nmemorization of examples and the layers that contribute to example memorization\nare, in general, not the final layers. The three sources are $\\textit{gradient\naccounting}$ (measuring the contribution to the gradient norms from memorized\nand clean examples), $\\textit{layer rewinding}$ (replacing specific model\nweights of a converged model with previous training checkpoints), and\n$\\textit{retraining}$ (training rewound layers only on clean examples). Second,\nwe ask a more generic question: can memorization be localized\n$\\textit{anywhere}$ in a model? We discover that memorization is often confined\nto a small number of neurons or channels (around 5) of the model. Based on\nthese insights we propose a new form of dropout -- $\\textit{example-tied\ndropout}$ that enables us to direct the memorization of examples to an apriori\ndetermined set of neurons. By dropping out these neurons, we are able to reduce\nthe accuracy on memorized examples from $100\\%\\to3\\%$, while also reducing the\ngeneralization gap.\n","authors":["Pratyush Maini","Michael C. Mozer","Hanie Sedghi","Zachary C. Lipton","J. Zico Kolter","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09542v1.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2307.09520v1","updated":"2023-07-18T18:01:30Z","published":"2023-07-18T18:01:30Z","title":"Adversarial Bayesian Augmentation for Single-Source Domain\n Generalization","summary":" Generalizing to unseen image domains is a challenging problem primarily due\nto the lack of diverse training data, inaccessible target data, and the large\ndomain shift that may exist in many real-world settings. As such data\naugmentation is a critical component of domain generalization methods that seek\nto address this problem. We present Adversarial Bayesian Augmentation (ABA), a\nnovel algorithm that learns to generate image augmentations in the challenging\nsingle-source domain generalization setting. ABA draws on the strengths of\nadversarial learning and Bayesian neural networks to guide the generation of\ndiverse data augmentations -- these synthesized image domains aid the\nclassifier in generalizing to unseen domains. We demonstrate the strength of\nABA on several types of domain shift including style shift, subpopulation\nshift, and shift in the medical imaging setting. ABA outperforms all previous\nstate-of-the-art methods, including pre-specified augmentations, pixel-based\nand convolutional-based augmentations.\n","authors":["Sheng Cheng","Tejas Gokhale","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2307.09520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02307v2","updated":"2023-07-18T17:29:16Z","published":"2023-01-05T21:43:19Z","title":"What You Say Is What You Show: Visual Narration Detection in\n Instructional Videos","summary":" Narrated ''how-to'' videos have emerged as a promising data source for a wide\nrange of learning problems, from learning visual representations to training\nrobot policies. However, this data is extremely noisy, as the narrations do not\nalways describe the actions demonstrated in the video. 
To address this problem\nwe introduce the novel task of visual narration detection, which entails\ndetermining whether a narration is visually depicted by the actions in the\nvideo. We propose What You Say is What You Show (WYS^2), a method that\nleverages multi-modal cues and pseudo-labeling to learn to detect visual\nnarrations with only weakly labeled data. Our model successfully detects visual\nnarrations in in-the-wild videos, outperforming strong baselines, and we\ndemonstrate its impact for state-of-the-art summarization and temporal\nalignment of instructional videos.\n","authors":["Kumar Ashutosh","Rohit Girdhar","Lorenzo Torresani","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2301.02307v2.pdf","comment":"Technical Report"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09447v1","updated":"2023-07-18T17:22:19Z","published":"2023-07-18T17:22:19Z","title":"Deep Neural Aggregation for Recommending Items to Group of Users","summary":" Modern society devotes a significant amount of time to digital interaction.\nMany of our daily actions are carried out through digital means. This has led\nto the emergence of numerous Artificial Intelligence tools that assist us in\nvarious aspects of our lives. One key tool for the digital society is\nRecommender Systems, intelligent systems that learn from our past actions to\npropose new ones that align with our interests. Some of these systems have\nspecialized in learning from the behavior of user groups to make\nrecommendations to a group of individuals who want to perform a joint task. In\nthis article, we analyze the current state of Group Recommender Systems and\npropose two new models that use emerging Deep Learning architectures.\nExperimental results demonstrate the improvement achieved by employing the\nproposed models compared to the state-of-the-art models using four different\ndatasets. The source code of the models, as well as that of all the experiments\nconducted, is available in a public repository.\n","authors":["Jorge Dueñas-Lerín","Raúl Lara-Cabrera","Fernando Ortega","Jesús Bobadilla"],"pdf_url":"https://arxiv.org/pdf/2307.09447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09384v1","updated":"2023-07-18T16:05:25Z","published":"2023-07-18T16:05:25Z","title":"Zero-shot Query Reformulation for Conversational Search","summary":" As the popularity of voice assistants continues to surge, conversational\nsearch has gained increased attention in Information Retrieval. However, data\nsparsity issues in conversational search significantly hinder the progress of\nsupervised conversational search methods. Consequently, researchers are\nfocusing more on zero-shot conversational search approaches. Nevertheless,\nexisting zero-shot methods face three primary limitations: they are not\nuniversally applicable to all retrievers, their effectiveness lacks sufficient\nexplainability, and they struggle to resolve common conversational ambiguities\ncaused by omission. To address these limitations, we introduce a novel\nZero-shot Query Reformulation (ZeQR) framework that reformulates queries based\non previous dialogue contexts without requiring supervision from conversational\nsearch data. Specifically, our framework utilizes language models designed for\nmachine reading comprehension tasks to explicitly resolve two common\nambiguities: coreference and omission, in raw queries. In comparison to\nexisting zero-shot methods, our approach is universally applicable to any\nretriever without additional adaptation or indexing. 
It also provides greater\nexplainability and effectively enhances query intent understanding because\nambiguities are explicitly and proactively resolved. Through extensive\nexperiments on four TREC conversational datasets, we demonstrate the\neffectiveness of our method, which consistently outperforms state-of-the-art\nbaselines.\n","authors":["Dayu Yang","Yue Zhang","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09384v1.pdf","comment":"Accepted by ICTIR 2023"},{"id":"http://arxiv.org/abs/2307.08669v2","updated":"2023-07-18T14:39:07Z","published":"2023-07-17T17:32:30Z","title":"Leveraging Recommender Systems to Reduce Content Gaps on Peer Production\n Platforms","summary":" Peer production platforms like Wikipedia commonly suffer from content gaps.\nPrior research suggests recommender systems can help solve this problem, by\nguiding editors towards underrepresented topics. However, it remains unclear\nwhether this approach would result in less relevant recommendations, leading to\nreduced overall engagement with recommended items. To answer this question, we\nfirst conducted offline analyses (Study 1) on SuggestBot, a task-routing\nrecommender system for Wikipedia, then did a three-month controlled experiment\n(Study 2). Our results show that presenting users with articles from\nunderrepresented topics increased the proportion of work done on those articles\nwithout significantly reducing overall recommendation uptake. We discuss the\nimplications of our results, including how ignoring the article discovery\nprocess can artificially narrow recommendations. We draw parallels between this\nphenomenon and the common issue of \"filter bubbles\" to show how any platform\nthat employs recommender systems is susceptible to it.\n","authors":["Mo Houtti","Isaac Johnson","Loren Terveen"],"pdf_url":"https://arxiv.org/pdf/2307.08669v2.pdf","comment":"To appear at the 18th International AAAI Conference on Web and Social\n Media (ICWSM 2024)"},{"id":"http://arxiv.org/abs/2307.09193v1","updated":"2023-07-18T12:25:40Z","published":"2023-07-18T12:25:40Z","title":"ESMC: Entire Space Multi-Task Model for Post-Click Conversion Rate via\n Parameter Constraint","summary":" Large-scale online recommender system spreads all over the Internet being in\ncharge of two basic tasks: Click-Through Rate (CTR) and Post-Click Conversion\nRate (CVR) estimations. However, traditional CVR estimators suffer from\nwell-known Sample Selection Bias and Data Sparsity issues. Entire space models\nwere proposed to address the two issues via tracing the decision-making path of\n\"exposure_click_purchase\". Further, some researchers observed that there are\npurchase-related behaviors between click and purchase, which can better draw\nthe user's decision-making intention and improve the recommendation\nperformance. Thus, the decision-making path has been extended to\n\"exposure_click_in-shop action_purchase\" and can be modeled with conditional\nprobability approach. Nevertheless, we observe that the chain rule of\nconditional probability does not always hold. We report Probability Space\nConfusion (PSC) issue and give a derivation of difference between ground-truth\nand estimation mathematically. 
We propose a novel Entire Space Multi-Task Model\nfor Post-Click Conversion Rate via Parameter Constraint (ESMC) and two\nalternatives: Entire Space Multi-Task Model with Siamese Network (ESMS) and\nEntire Space Multi-Task Model in Global Domain (ESMG) to address the PSC issue.\nSpecifically, we handle \"exposure_click_in-shop action\" and \"in-shop\naction_purchase\" separately in the light of characteristics of in-shop action.\nThe first path is still treated with conditional probability while the second\none is treated with parameter constraint strategy. Experiments on both offline\nand online environments in a large-scale recommendation system illustrate the\nsuperiority of our proposed methods over state-of-the-art models. The\nreal-world datasets will be released.\n","authors":["Zhenhao Jiang","Biao Zeng","Hao Feng","Jin Liu","Jicong Fan","Jie Zhang","Jia Jia","Ning Hu","Xingyu Chen","Xuguang Lan"],"pdf_url":"https://arxiv.org/pdf/2307.09193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09172v1","updated":"2023-07-18T11:49:40Z","published":"2023-07-18T11:49:40Z","title":"Jean-Luc Picard at Touché 2023: Comparing Image Generation, Stance\n Detection and Feature Matching for Image Retrieval for Arguments","summary":" Participating in the shared task \"Image Retrieval for arguments\", we used\ndifferent pipelines for image retrieval containing Image Generation, Stance\nDetection, Preselection and Feature Matching. We submitted four different runs\nwith different pipeline layout and compare them to given baseline. Our\npipelines perform similarly to the baseline.\n","authors":["Max Moebius","Maximilian Enderling","Sarah T. Bachinger"],"pdf_url":"https://arxiv.org/pdf/2307.09172v1.pdf","comment":"7 pages, 1 figure, 1 table, conference: CLEF"},{"id":"http://arxiv.org/abs/2307.09089v1","updated":"2023-07-18T09:16:35Z","published":"2023-07-18T09:16:35Z","title":"Modeling Orders of User Behaviors via Differentiable Sorting: A\n Multi-task Framework to Predicting User Post-click Conversion","summary":" User post-click conversion prediction is of high interest to researchers and\ndevelopers. Recent studies employ multi-task learning to tackle the selection\nbias and data sparsity problem, two severe challenges in post-click behavior\nprediction, by incorporating click data. However, prior works mainly focused on\npointwise learning and the orders of labels (i.e., click and post-click) are\nnot well explored, which naturally poses a listwise learning problem. Inspired\nby recent advances on differentiable sorting, in this paper, we propose a novel\nmulti-task framework that leverages orders of user behaviors to predict user\npost-click conversion in an end-to-end approach. Specifically, we define an\naggregation operator to combine predicted outputs of different tasks to a\nunified score, then we use the computed scores to model the label relations via\ndifferentiable sorting. 
Extensive experiments on public and industrial datasets\nshow the superiority of our proposed model against competitive baselines.\n","authors":["Menghan Wang","Jinming Yang","Yuchen Guo","Yuming Shen","Mengying Zhu","Yanlin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09089v1.pdf","comment":"The paper is accepted as a short research paper by SIGIR 2023"},{"id":"http://arxiv.org/abs/2305.01548v2","updated":"2023-07-18T09:12:14Z","published":"2023-05-02T15:52:53Z","title":"Explainable Conversational Question Answering over Heterogeneous Sources\n via Iterative Graph Neural Networks","summary":" In conversational question answering, users express their information needs\nthrough a series of utterances with incomplete context. Typical ConvQA methods\nrely on a single source (a knowledge base (KB), or a text corpus, or a set of\ntables), thus being unable to benefit from increased answer coverage and\nredundancy of multiple sources. Our method EXPLAIGNN overcomes these\nlimitations by integrating information from a mixture of sources with\nuser-comprehensible explanations for answers. It constructs a heterogeneous\ngraph from entities and evidence snippets retrieved from a KB, a text corpus,\nweb tables, and infoboxes. This large graph is then iteratively reduced via\ngraph neural networks that incorporate question-level attention, until the best\nanswers and their explanations are distilled. Experiments show that EXPLAIGNN\nimproves performance over state-of-the-art baselines. A user study demonstrates\nthat derived answers are understandable by end users.\n","authors":["Philipp Christmann","Rishiraj Saha Roy","Gerhard Weikum"],"pdf_url":"https://arxiv.org/pdf/2305.01548v2.pdf","comment":"Accepted at SIGIR 2023 (extended version)"},{"id":"http://arxiv.org/abs/2307.08989v1","updated":"2023-07-18T06:01:37Z","published":"2023-07-18T06:01:37Z","title":"GraphCL-DTA: a graph contrastive learning with molecular semantics for\n drug-target binding affinity prediction","summary":" Drug-target binding affinity prediction plays an important role in the early\nstages of drug discovery, which can infer the strength of interactions between\nnew drugs and new targets. However, the performance of previous computational\nmodels is limited by the following drawbacks. The learning of drug\nrepresentation relies only on supervised data, without taking into account the\ninformation contained in the molecular graph itself. Moreover, most previous\nstudies tended to design complicated representation learning module, while\nuniformity, which is used to measure representation quality, is ignored. In\nthis study, we propose GraphCL-DTA, a graph contrastive learning with molecular\nsemantics for drug-target binding affinity prediction. In GraphCL-DTA, we\ndesign a graph contrastive learning framework for molecular graphs to learn\ndrug representations, so that the semantics of molecular graphs are preserved.\nThrough this graph contrastive framework, a more essential and effective drug\nrepresentation can be learned without additional supervised data. Next, we\ndesign a new loss function that can be directly used to smoothly adjust the\nuniformity of drug and target representations. By directly optimizing the\nuniformity of representations, the representation quality of drugs and targets\ncan be improved. The effectiveness of the above innovative elements is verified\non two real datasets, KIBA and Davis. 
The excellent performance of GraphCL-DTA\non the above datasets suggests its superiority to the state-of-the-art model.\n","authors":["Xinxing Yang","Genke Yang","Jian Chu"],"pdf_url":"https://arxiv.org/pdf/2307.08989v1.pdf","comment":"13 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2306.05118v3","updated":"2023-07-18T03:48:28Z","published":"2023-06-08T11:39:57Z","title":"Controllable Multi-Objective Re-ranking with Policy Hypernetworks","summary":" Multi-stage ranking pipelines have become widely used strategies in modern\nrecommender systems, where the final stage aims to return a ranked list of\nitems that balances a number of requirements such as user preference,\ndiversity, novelty etc. Linear scalarization is arguably the most widely used\ntechnique to merge multiple requirements into one optimization objective, by\nsumming up the requirements with certain preference weights. Existing\nfinal-stage ranking methods often adopt a static model where the preference\nweights are determined during offline training and kept unchanged during online\nserving. Whenever a modification of the preference weights is needed, the model\nhas to be re-trained, which is time and resources inefficient. Meanwhile, the\nmost appropriate weights may vary greatly for different groups of targeting\nusers or at different time periods (e.g., during holiday promotions). In this\npaper, we propose a framework called controllable multi-objective re-ranking\n(CMR) which incorporates a hypernetwork to generate parameters for a re-ranking\nmodel according to different preference weights. In this way, CMR is enabled to\nadapt the preference weights according to the environment changes in an online\nmanner, without retraining the models. Moreover, we classify practical\nbusiness-oriented tasks into four main categories and seamlessly incorporate\nthem in a new proposed re-ranking model based on an Actor-Evaluator framework,\nwhich serves as a reliable real-world testbed for CMR. Offline experiments\nbased on the dataset collected from Taobao App showed that CMR improved several\npopular re-ranking models by using them as underlying models. Online A/B tests\nalso demonstrated the effectiveness and trustworthiness of CMR.\n","authors":["Sirui Chen","Yuan Wang","Zijing Wen","Zhiyu Li","Changshuo Zhang","Xiao Zhang","Quan Lin","Cheng Zhu","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2306.05118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10632v5","updated":"2023-07-18T03:12:59Z","published":"2023-02-21T12:44:17Z","title":"Multi-Modal Self-Supervised Learning for Recommendation","summary":" The online emergence of multi-modal sharing platforms (eg, TikTok, Youtube)\nis powering personalized recommender systems to incorporate various modalities\n(eg, visual, textual and acoustic) into the latent user representations. While\nexisting works on multi-modal recommendation exploit multimedia content\nfeatures in enhancing item embeddings, their model representation capability is\nlimited by heavy label reliance and weak robustness on sparse user behavior\ndata. Inspired by the recent progress of self-supervised learning in\nalleviating label scarcity issue, we explore deriving self-supervision signals\nwith effectively learning of modality-aware user preference and cross-modal\ndependencies. To this end, we propose a new Multi-Modal Self-Supervised\nLearning (MMSSL) method which tackles two key challenges. 
Specifically, to\ncharacterize the inter-dependency between the user-item collaborative view and\nitem multi-modal semantic view, we design a modality-aware interactive\nstructure learning paradigm via adversarial perturbations for data\naugmentation. In addition, to capture the effects that user's modality-aware\ninteraction pattern would interweave with each other, a cross-modal contrastive\nlearning approach is introduced to jointly preserve the inter-modal semantic\ncommonality and user preference diversity. Experiments on real-world datasets\nverify the superiority of our method in offering great potential for multimedia\nrecommendation over various state-of-the-art baselines. The implementation is\nreleased at: https://github.com/HKUDS/MMSSL.\n","authors":["Wei Wei","Chao Huang","Lianghao Xia","Chuxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.10632v5.pdf","comment":"This paper has been published as a full paper at WWW 2023"},{"id":"http://arxiv.org/abs/2306.07528v2","updated":"2023-07-18T02:18:17Z","published":"2023-06-13T03:46:22Z","title":"Unified Off-Policy Learning to Rank: a Reinforcement Learning\n Perspective","summary":" Off-policy Learning to Rank (LTR) aims to optimize a ranker from data\ncollected by a deployed logging policy. However, existing off-policy learning\nto rank methods often make strong assumptions about how users generate the\nclick data, i.e., the click model, and hence need to tailor their methods\nspecifically under different click models. In this paper, we unified the\nranking process under general stochastic click models as a Markov Decision\nProcess (MDP), and the optimal ranking could be learned with offline\nreinforcement learning (RL) directly. Building upon this, we leverage offline\nRL techniques for off-policy LTR and propose the Click Model-Agnostic Unified\nOff-policy Learning to Rank (CUOLR) method, which could be easily applied to a\nwide range of click models. Through a dedicated formulation of the MDP, we show\nthat offline RL algorithms can adapt to various click models without complex\ndebiasing techniques and prior knowledge of the model. Results on various\nlarge-scale datasets demonstrate that CUOLR consistently outperforms the\nstate-of-the-art off-policy learning to rank algorithms while maintaining\nconsistency and robustness under different click models.\n","authors":["Zeyu Zhang","Yi Su","Hui Yuan","Yiran Wu","Rishab Balasubramanian","Qingyun Wu","Huazheng Wang","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.07528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08910v1","updated":"2023-07-18T01:02:20Z","published":"2023-07-18T01:02:20Z","title":"Sharpness-Aware Graph Collaborative Filtering","summary":" Graph Neural Networks (GNNs) have achieved impressive performance in\ncollaborative filtering. However, GNNs tend to yield inferior performance when\nthe distributions of training and test data are not aligned well. Also,\ntraining GNNs requires optimizing non-convex neural networks with an abundance\nof local and global minima, which may differ widely in their performance at\ntest time. Thus, it is essential to choose the minima carefully. Here we\npropose an effective training schema, called {gSAM}, under the principle that\nthe \\textit{flatter} minima has a better generalization ability than the\n\\textit{sharper} ones. 
To achieve this goal, gSAM regularizes the flatness of\nthe weight loss landscape by forming a bi-level optimization: the outer problem\nconducts the standard model training while the inner problem helps the model\njump out of the sharp minima. Experimental results show the superiority of our\ngSAM.\n","authors":["Huiyuan Chen","Chin-Chia Michael Yeh","Yujie Fan","Yan Zheng","Junpeng Wang","Vivian Lai","Mahashweta Das","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09683v1","updated":"2023-07-18T23:35:53Z","published":"2023-07-18T23:35:53Z","title":"PubMed and Beyond: Recent Advances and Best Practices in Biomedical\n Literature Search","summary":" Biomedical research yields a wealth of information, much of which is only\naccessible through the literature. Consequently, literature search is an\nessential tool for building on prior knowledge in clinical and biomedical\nresearch. Although recent improvements in artificial intelligence have expanded\nfunctionality beyond keyword-based search, these advances may be unfamiliar to\nclinicians and researchers. In response, we present a survey of literature\nsearch tools tailored to both general and specific information needs in\nbiomedicine, with the objective of helping readers efficiently fulfill their\ninformation needs. We first examine the widely used PubMed search engine,\ndiscussing recent improvements and continued challenges. We then describe\nliterature search tools catering to five specific information needs: 1.\nIdentifying high-quality clinical research for evidence-based medicine. 2.\nRetrieving gene-related information for precision medicine and genomics. 3.\nSearching by meaning, including natural language questions. 4. Locating related\narticles with literature recommendation. 5. Mining literature to discover\nassociations between concepts such as diseases and genetic variants.\nAdditionally, we cover practical considerations and best practices for choosing\nand using these tools. Finally, we provide a perspective on the future of\nliterature search engines, considering recent breakthroughs in large language\nmodels such as ChatGPT. In summary, our survey provides a comprehensive view of\nbiomedical literature search functionalities with 36 publicly available tools.\n","authors":["Qiao Jin","Robert Leaman","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.09683v1.pdf","comment":"27 pages, 6 figures, 36 tools"},{"id":"http://arxiv.org/abs/2307.11772v1","updated":"2023-07-18T04:43:24Z","published":"2023-07-18T04:43:24Z","title":"AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment\n enabled by Large Language Models","summary":" The task of entity alignment between knowledge graphs (KGs) aims to identify\nevery pair of entities from two different KGs that represent the same entity.\nMany machine learning-based methods have been proposed for this task. However,\nto our best knowledge, existing methods all require manually crafted seed\nalignments, which are expensive to obtain. In this paper, we propose the first\nfully automatic alignment method named AutoAlign, which does not require any\nmanually crafted seed alignments. Specifically, for predicate embeddings,\nAutoAlign constructs a predicate-proximity-graph with the help of large\nlanguage models to automatically capture the similarity between predicates\nacross two KGs. 
For entity embeddings, AutoAlign first computes the entity\nembeddings of each KG independently using TransE, and then shifts the two KGs'\nentity embeddings into the same vector space by computing the similarity\nbetween entities based on their attributes. Thus, both predicate alignment and\nentity alignment can be done without manually crafted seed alignments.\nAutoAlign is not only fully automatic, but also highly effective. Experiments\nusing real-world KGs show that AutoAlign improves the performance of entity\nalignment significantly compared to state-of-the-art methods.\n","authors":["Rui Zhang","Yixin Su","Bayu Distiawan Trisedya","Xiaoyan Zhao","Min Yang","Hong Cheng","Jianzhong Qi"],"pdf_url":"https://arxiv.org/pdf/2307.11772v1.pdf","comment":"14 pages, 5 figures, 4 tables. arXiv admin note: substantial text\n overlap with arXiv:2210.08540"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.09483v1","updated":"2023-07-18T17:59:25Z","published":"2023-07-18T17:59:25Z","title":"Forecasting the steam mass flow in a powerplant using the parallel\n hybrid network","summary":" Efficient and sustainable power generation is a crucial concern in the energy\nsector. In particular, thermal power plants grapple with accurately predicting\nsteam mass flow, which is crucial for operational efficiency and cost\nreduction. In this study, we use a parallel hybrid neural network architecture\nthat combines a parametrized quantum circuit and a conventional feed-forward\nneural network specifically designed for time-series prediction in industrial\nsettings to enhance predictions of steam mass flow 15 minutes into the future.\nOur results show that the parallel hybrid model outperforms standalone\nclassical and quantum models, achieving more than 5.7 and 4.9 times lower mean\nsquared error (MSE) loss on the test set after training compared to pure\nclassical and pure quantum networks, respectively. Furthermore, the hybrid\nmodel demonstrates smaller relative errors between the ground truth and the\nmodel predictions on the test set, up to 2 times better than the pure classical\nmodel. These findings contribute to the broader scientific understanding of how\nintegrating quantum and classical machine learning techniques can be applied to\nreal-world challenges faced by the energy sector, ultimately leading to\noptimized power plant operations.\n","authors":["Andrii Kurkin","Jonas Hegemann","Mo Kordzanganeh","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2307.09483v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09476v1","updated":"2023-07-18T17:56:50Z","published":"2023-07-18T17:56:50Z","title":"Overthinking the Truth: Understanding how Language Models Process False\n Demonstrations","summary":" Modern language models can imitate complex patterns through few-shot\nlearning, enabling them to complete challenging tasks without fine-tuning.\nHowever, imitation can also lead models to reproduce inaccuracies or harmful\ncontent if present in the context. We study harmful imitation through the lens\nof a model's internal representations, and identify two related phenomena:\noverthinking and false induction heads. The first phenomenon, overthinking,\nappears when we decode predictions from intermediate layers, given correct vs.\nincorrect few-shot demonstrations. At early layers, both demonstrations induce\nsimilar model behavior, but the behavior diverges sharply at some \"critical\nlayer\", after which the accuracy given incorrect demonstrations progressively\ndecreases. 
The second phenomenon, false induction heads, are a possible\nmechanistic cause of overthinking: these are heads in late layers that attend\nto and copy false information from previous demonstrations, and whose ablation\nreduces overthinking. Beyond scientific understanding, our results suggest that\nstudying intermediate model computations could be a promising avenue for\nunderstanding and guarding against harmful model behaviors.\n","authors":["Danny Halawi","Jean-Stanislas Denain","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2307.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15656v3","updated":"2023-07-18T17:52:28Z","published":"2023-06-27T17:50:26Z","title":"SparseOptimizer: Sparsify Language Models through Moreau-Yosida\n Regularization and Accelerate via Compiler Co-design","summary":" This paper introduces SparseOptimizer, a novel deep learning optimizer that\nexploits Moreau-Yosida regularization to naturally induce sparsity in large\nlanguage models such as BERT, ALBERT and GPT. Key to the design of\nSparseOptimizer is an embedded shrinkage operator, which imparts sparsity\ndirectly within the optimization process. This operator, backed by a sound\ntheoretical framework, includes an analytical solution, thereby reinforcing the\noptimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play\nfunctionality eradicates the need for code modifications, making it a\nuniversally adaptable tool for a wide array of large language models. Empirical\nevaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2\nconfirm that SparseBERT and SparseALBERT, when sparsified using\nSparseOptimizer, achieve performance comparable to their dense counterparts,\nBERT and ALBERT, while significantly reducing their parameter count. Further,\nthis work proposes an innovative optimizer-compiler co-design strategy,\ndemonstrating the potential of inference acceleration (\\textbf{3.37x},\n\\textbf{6.30x}, and \\textbf{7.15x} in comparison with Pytorch, TensorFlow, and\nLLVM generic compile, respectively) in SparseBERT when paired with an\nappropriately designed compiler. This study represents a significant step\nforward in the evolution of efficient, scalable, and high-performing large\nlanguage models, setting a precedent for future exploration and optimization in\nthis domain. The SparseOptimizer code and SparseALBERT model will be publicly\navailable upon paper acceptance.\n","authors":["Fu-Ming Guo"],"pdf_url":"https://arxiv.org/pdf/2306.15656v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03181v2","updated":"2023-07-18T17:50:22Z","published":"2022-11-30T19:38:21Z","title":"Funnel-based Reward Shaping for Signal Temporal Logic Tasks in\n Reinforcement Learning","summary":" Signal Temporal Logic (STL) is a powerful framework for describing the\ncomplex temporal and logical behaviour of the dynamical system. Numerous\nstudies have attempted to employ reinforcement learning to learn a controller\nthat enforces STL specifications; however, they have been unable to effectively\ntackle the challenges of ensuring robust satisfaction in continuous state space\nand maintaining tractability. In this paper, leveraging the concept of funnel\nfunctions, we propose a tractable reinforcement learning algorithm to learn a\ntime-dependent policy for robust satisfaction of STL specification in\ncontinuous state space. 
We demonstrate the utility of our approach on several\nSTL tasks using different environments.\n","authors":["Naman Saxena","Gorantla Sandeep","Pushpak Jagtap"],"pdf_url":"https://arxiv.org/pdf/2212.03181v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.09463v1","updated":"2023-07-18T17:46:33Z","published":"2023-07-18T17:46:33Z","title":"A Cryogenic Memristive Neural Decoder for Fault-tolerant Quantum Error\n Correction","summary":" Neural decoders for quantum error correction (QEC) rely on neural networks to\nclassify syndromes extracted from error correction codes and find appropriate\nrecovery operators to protect logical information against errors. Despite the\ngood performance of neural decoders, important practical requirements remain to\nbe achieved, such as minimizing the decoding time to meet typical rates of\nsyndrome generation in repeated error correction schemes, and ensuring the\nscalability of the decoding approach as the code distance increases. Designing\na dedicated integrated circuit to perform the decoding task in co-integration\nwith a quantum processor appears necessary to reach these decoding time and\nscalability requirements, as routing signals in and out of a cryogenic\nenvironment to be processed externally leads to unnecessary delays and an\neventual wiring bottleneck. In this work, we report the design and performance\nanalysis of a neural decoder inference accelerator based on an in-memory\ncomputing (IMC) architecture, where crossbar arrays of resistive memory devices\nare employed to both store the synaptic weights of the decoder neural network\nand perform analog matrix-vector multiplications during inference. In\nproof-of-concept numerical experiments supported by experimental measurements,\nwe investigate the impact of TiO$_\\textrm{x}$-based memristive devices'\nnon-idealities on decoding accuracy. Hardware-aware training methods are\ndeveloped to mitigate the loss in accuracy, allowing the memristive neural\ndecoders to achieve a pseudo-threshold of $9.23\\times 10^{-4}$ for the\ndistance-three surface code, whereas the equivalent digital neural decoder\nachieves a pseudo-threshold of $1.01\\times 10^{-3}$. This work provides a\npathway to scalable, fast, and low-power cryogenic IMC hardware for integrated\nQEC.\n","authors":["Frédéric Marcotte","Pierre-Antoine Mouny","Victor Yon","Gebremedhin A. Dagnew","Bohdan Kulchytskyy","Sophie Rochette","Yann Beilliard","Dominique Drouin","Pooya Ronagh"],"pdf_url":"https://arxiv.org/pdf/2307.09463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06289v3","updated":"2023-07-18T17:39:49Z","published":"2023-03-11T02:56:29Z","title":"Machine Learning Enhanced Hankel Dynamic-Mode Decomposition","summary":" While the acquisition of time series has become more straightforward,\ndeveloping dynamical models from time series is still a challenging and\nevolving problem domain. Within the last several years, to address this\nproblem, there has been a merging of machine learning tools with what is called\nthe dynamic mode decomposition (DMD). This general approach has been shown to\nbe an especially promising avenue for accurate model development. Building on\nthis prior body of work, we develop a deep learning DMD based method which\nmakes use of the fundamental insight of Takens' Embedding Theorem to build an\nadaptive learning scheme that better approximates higher dimensional and\nchaotic dynamics. We call this method the Deep Learning Hankel DMD (DLHDMD). 
We\nlikewise explore how our method learns mappings which tend, after successful\ntraining, to significantly change the mutual information between dimensions in\nthe dynamics. This appears to be a key feature in enhancing the DMD overall,\nand it should help provide further insight for developing other deep learning\nmethods for time series analysis and model generation.\n","authors":["Christopher W. Curtis","D. Jay Alford-Lago","Erik Bollt","Andrew Tuma"],"pdf_url":"https://arxiv.org/pdf/2303.06289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09458v1","updated":"2023-07-18T17:39:04Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09457v1","updated":"2023-07-18T17:38:04Z","published":"2023-07-18T17:38:04Z","title":"Smooth Attention for Deep Multiple Instance Learning: Application to CT\n Intracranial Hemorrhage Detection","summary":" Multiple Instance Learning (MIL) has been widely applied to medical imaging\ndiagnosis, where bag labels are known and instance labels inside bags are\nunknown. Traditional MIL assumes that instances in each bag are independent\nsamples from a given distribution. However, instances are often spatially or\nsequentially ordered, and one would expect similar diagnostic importance for\nneighboring instances. To address this, in this study, we propose a smooth\nattention deep MIL (SA-DMIL) model. Smoothness is achieved by the introduction\nof first and second order constraints on the latent function encoding the\nattention paid to each instance in a bag. The method is applied to the\ndetection of intracranial hemorrhage (ICH) on head CT scans. 
The results show\nthat this novel SA-DMIL: (a) achieves better performance than the non-smooth\nattention MIL at both scan (bag) and slice (instance) levels; (b) learns\nspatial dependencies between slices; and (c) outperforms current\nstate-of-the-art MIL methods on the same ICH test set.\n","authors":["Yunan Wu","Francisco M. Castro-Macías","Pablo Morales-Álvarez","Rafael Molina","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2307.09457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06915v2","updated":"2023-07-18T17:28:02Z","published":"2023-07-13T17:29:01Z","title":"Weighted Averaged Stochastic Gradient Descent: Asymptotic Normality and\n Optimality","summary":" Stochastic Gradient Descent (SGD) is one of the simplest and most popular\nalgorithms in modern statistical and machine learning due to its computational\nand memory efficiency. Various averaging schemes have been proposed to\naccelerate the convergence of SGD in different settings. In this paper, we\nexplore a general averaging scheme for SGD. Specifically, we establish the\nasymptotic normality of a broad range of weighted averaged SGD solutions and\nprovide asymptotically valid online inference approaches. Furthermore, we\npropose an adaptive averaging scheme that exhibits both optimal statistical\nrate and favorable non-asymptotic convergence, drawing insights from the\noptimal weight for the linear model in terms of non-asymptotic mean squared\nerror (MSE).\n","authors":["Ziyang Wei","Wanrong Zhu","Wei Biao Wu"],"pdf_url":"https://arxiv.org/pdf/2307.06915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09441v1","updated":"2023-07-18T17:16:08Z","published":"2023-07-18T17:16:08Z","title":"Convergent regularization in inverse problems and linear plug-and-play\n denoisers","summary":" Plug-and-play (PnP) denoising is a popular iterative framework for solving\nimaging inverse problems using off-the-shelf image denoisers. Their empirical\nsuccess has motivated a line of research that seeks to understand the\nconvergence of PnP iterates under various assumptions on the denoiser. While a\nsignificant amount of research has gone into establishing the convergence of\nthe PnP iteration for different regularity conditions on the denoisers, not\nmuch is known about the asymptotic properties of the converged solution as the\nnoise level in the measurement tends to zero, i.e., whether PnP methods are\nprovably convergent regularization schemes under reasonable assumptions on the\ndenoiser. This paper serves two purposes: first, we provide an overview of the\nclassical regularization theory in inverse problems and survey a few notable\nrecent data-driven methods that are provably convergent regularization schemes.\nWe then continue to discuss PnP algorithms and their established convergence\nguarantees. Subsequently, we consider PnP algorithms with linear denoisers and\npropose a novel spectral filtering technique to control the strength of\nregularization arising from the denoiser. Further, by relating the implicit\nregularization of the denoiser to an explicit regularization functional, we\nrigorously show that PnP with linear denoisers leads to a convergent\nregularization scheme. More specifically, we prove that in the limit as the\nnoise vanishes, the PnP reconstruction converges to the minimizer of a\nregularization potential subject to the solution satisfying the noiseless\noperator equation. 
The theoretical analysis is corroborated by numerical\nexperiments for the classical inverse problem of tomographic image\nreconstruction.\n","authors":["Andreas Hauptmann","Subhadip Mukherjee","Carola-Bibiane Schönlieb","Ferdia Sherry"],"pdf_url":"https://arxiv.org/pdf/2307.09441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09437v1","updated":"2023-07-18T17:11:55Z","published":"2023-07-18T17:11:55Z","title":"Unsupervised Conditional Slot Attention for Object Centric Learning","summary":" Extracting object-level representations for downstream reasoning tasks is an\nemerging area in AI. Learning object-centric representations in an unsupervised\nsetting presents multiple challenges, a key one being binding an arbitrary\nnumber of object instances to a specialized object slot. Recent object-centric\nrepresentation methods like Slot Attention utilize iterative attention to learn\ncomposable representations with dynamic inference level binding but fail to\nachieve specialized slot level binding. To address this, in this paper we\npropose Unsupervised Conditional Slot Attention using a novel Probabilistic\nSlot Dictionary (PSD). We define PSD with (i) abstract object-level property\nvectors as key and (ii) parametric Gaussian distribution as its corresponding\nvalue. We demonstrate the benefits of the learnt specific object-level\nconditioning distributions in multiple downstream tasks, namely object\ndiscovery, compositional scene generation, and compositional visual reasoning.\nWe show that our method provides scene composition capabilities and a\nsignificant boost in a few shot adaptability tasks of compositional visual\nreasoning, while performing similarly or better than slot attention in object\ndiscovery tasks\n","authors":["Avinash Kori","Francesco Locatello","Francesca Toni","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2307.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13816v3","updated":"2023-07-18T16:49:52Z","published":"2023-01-31T18:02:26Z","title":"Execution-based Code Generation using Deep Reinforcement Learning","summary":" The utilization of programming language (PL) models, pre-trained on\nlarge-scale code corpora, as a means of automating software engineering\nprocesses has demonstrated considerable potential in streamlining various code\ngeneration tasks such as code completion, code translation, and program\nsynthesis. However, current approaches mainly rely on supervised fine-tuning\nobjectives borrowed from text generation, neglecting unique sequence-level\ncharacteristics of code, including but not limited to compilability as well as\nsyntactic and functional correctness. To address this limitation, we propose\nPPOCoder, a new framework for code generation that synergistically combines\npre-trained PL models with Proximal Policy Optimization (PPO) which is a widely\nused deep reinforcement learning technique. By utilizing non-differentiable\nfeedback from code execution and structure alignment, PPOCoder seamlessly\nintegrates external code-specific knowledge into the model optimization\nprocess. It's important to note that PPOCoder is a task-agnostic and\nmodel-agnostic framework that can be used across different code generation\ntasks and PLs. 
Extensive experiments on three code generation tasks demonstrate\nthe effectiveness of our proposed approach compared to SOTA methods, achieving\nsignificant improvements in compilation success rates and functional\ncorrectness across different PLs.\n","authors":["Parshin Shojaee","Aneesh Jain","Sindhu Tipirneni","Chandan K. Reddy"],"pdf_url":"https://arxiv.org/pdf/2301.13816v3.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR), 2023"},{"id":"http://arxiv.org/abs/2307.09423v1","updated":"2023-07-18T16:43:03Z","published":"2023-07-18T16:43:03Z","title":"Scaling Laws for Imitation Learning in NetHack","summary":" Imitation Learning (IL) is one of the most widely used methods in machine\nlearning. Yet, while powerful, many works find it is often not able to fully\nrecover the underlying expert behavior. However, none of these works deeply\ninvestigate the role of scaling up the model and data size. Inspired by recent\nwork in Natural Language Processing (NLP) where \"scaling up\" has resulted in\nincreasingly more capable LLMs, we investigate whether carefully scaling up\nmodel and data size can bring similar improvements in the imitation learning\nsetting. To demonstrate our findings, we focus on the game of NetHack, a\nchallenging environment featuring procedural generation, stochasticity,\nlong-term dependencies, and partial observability. We find IL loss and mean\nreturn scale smoothly with the compute budget and are strongly correlated,\nresulting in power laws for training compute-optimal IL agents with respect to\nmodel size and number of samples. We forecast and train several NetHack agents\nwith IL and find they outperform prior state-of-the-art by at least 2x in all\nsettings. Our work both demonstrates the scaling behavior of imitation learning\nin a challenging domain, as well as the viability of scaling up current\napproaches for increasingly capable agents in NetHack, a game that remains\nelusively hard for current AI systems.\n","authors":["Jens Tuyls","Dhruv Madeka","Kari Torkkola","Dean Foster","Karthik Narasimhan","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2307.09423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06849v2","updated":"2023-07-18T16:20:43Z","published":"2023-06-12T03:47:43Z","title":"Mitigating Transformer Overconfidence via Lipschitz Regularization","summary":" Though Transformers have achieved promising results in many computer vision\ntasks, they tend to be over-confident in predictions, as the standard Dot\nProduct Self-Attention (DPSA) can barely preserve distance for the unbounded\ninput domain. In this work, we fill this gap by proposing a novel Lipschitz\nRegularized Transformer (LRFormer). Specifically, we present a new similarity\nfunction with the distance within Banach Space to ensure the Lipschitzness and\nalso regularize the term by a contractive Lipschitz Bound. The proposed method\nis analyzed with a theoretical guarantee, providing a rigorous basis for its\neffectiveness and reliability. Extensive experiments conducted on standard\nvision benchmarks demonstrate that our method outperforms the state-of-the-art\nsingle forward pass approaches in prediction, calibration, and uncertainty\nestimation.\n","authors":["Wenqian Ye","Yunsheng Ma","Xu Cao","Kun Tang"],"pdf_url":"https://arxiv.org/pdf/2306.06849v2.pdf","comment":"Accepted by UAI 2023. 
(https://proceedings.mlr.press/v216/ye23a.html)"},{"id":"http://arxiv.org/abs/2307.09388v1","updated":"2023-07-18T16:13:35Z","published":"2023-07-18T16:13:35Z","title":"Online Learning with Costly Features in Non-stationary Environments","summary":" Maximizing long-term rewards is the primary goal in sequential\ndecision-making problems. The majority of existing methods assume that side\ninformation is freely available, enabling the learning agent to observe all\nfeatures' states before making a decision. In real-world problems, however,\ncollecting beneficial information is often costly. That implies that, besides\nindividual arms' reward, learning the observations of the features' states is\nessential to improve the decision-making strategy. The problem is aggravated in\na non-stationary environment where reward and cost distributions undergo abrupt\nchanges over time. To address the aforementioned dual learning problem, we\nextend the contextual bandit setting and allow the agent to observe subsets of\nfeatures' states. The objective is to maximize the long-term average gain,\nwhich is the difference between the accumulated rewards and the paid costs on\naverage. Therefore, the agent faces a trade-off between minimizing the cost of\ninformation acquisition and possibly improving the decision-making process\nusing the obtained information. To this end, we develop an algorithm that\nguarantees a sublinear regret in time. Numerical results demonstrate the\nsuperiority of our proposed policy in a real-world scenario.\n","authors":["Saeed Ghoorchian","Evgenii Kortukov","Setareh Maghsudi"],"pdf_url":"https://arxiv.org/pdf/2307.09388v1.pdf","comment":"31 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.03829v3","updated":"2023-07-18T16:04:42Z","published":"2023-05-05T20:08:40Z","title":"Improving Image-Based Precision Medicine with Uncertainty-Aware Causal\n Models","summary":" Image-based precision medicine aims to personalize treatment decisions based\non an individual's unique imaging features so as to improve their clinical\noutcome. Machine learning frameworks that integrate uncertainty estimation as\npart of their treatment recommendations would be safer and more reliable.\nHowever, little work has been done in adapting uncertainty estimation\ntechniques and validation metrics for precision medicine. In this paper, we use\nBayesian deep learning for estimating the posterior distribution over factual\nand counterfactual outcomes on several treatments. This allows for estimating\nthe uncertainty for each treatment option and for the individual treatment\neffects (ITE) between any two treatments. We train and evaluate this model to\npredict future new and enlarging T2 lesion counts on a large, multi-center\ndataset of MR brain images of patients with multiple sclerosis, exposed to\nseveral treatments during randomized controlled trials. We evaluate the\ncorrelation of the uncertainty estimate with the factual error, and, given the\nlack of ground truth counterfactual outcomes, demonstrate how uncertainty for\nthe ITE prediction relates to bounds on the ITE error. Lastly, we demonstrate\nhow knowledge of uncertainty could modify clinical decision-making to improve\nindividual patient and clinical trial outcomes.\n","authors":["Joshua Durso-Finley","Jean-Pierre Falet","Raghav Mehta","Douglas L. 
Arnold","Nick Pawlowski","Tal Arbel"],"pdf_url":"https://arxiv.org/pdf/2305.03829v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09379v1","updated":"2023-07-18T16:01:01Z","published":"2023-07-18T16:01:01Z","title":"Batched Predictors Generalize within Distribution","summary":" We study the generalization properties of batched predictors, i.e., models\ntasked with predicting the mean label of a small set (or batch) of examples.\nThe batched prediction paradigm is particularly relevant for models deployed to\ndetermine the quality of a group of compounds in preparation for offline\ntesting. By utilizing a suitable generalization of the Rademacher complexity,\nwe prove that batched predictors come with exponentially stronger\ngeneralization guarantees as compared to the standard per-sample approach.\nSurprisingly, the proposed bound holds independently of overparametrization.\nOur theoretical insights are validated experimentally for various tasks,\narchitectures, and applications.\n","authors":["Andreas Loukas","Pan Kessel"],"pdf_url":"https://arxiv.org/pdf/2307.09379v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.09377v1","updated":"2023-07-18T16:00:02Z","published":"2023-07-18T16:00:02Z","title":"Data Cross-Segmentation for Improved Generalization in Reinforcement\n Learning Based Algorithmic Trading","summary":" The use of machine learning in algorithmic trading systems is increasingly\ncommon. In a typical set-up, supervised learning is used to predict the future\nprices of assets, and those predictions drive a simple trading and execution\nstrategy. This is quite effective when the predictions have sufficient signal,\nmarkets are liquid, and transaction costs are low. However, those conditions\noften do not hold in thinly traded financial markets and markets for\ndifferentiated assets such as real estate or vehicles. In these markets, the\ntrading strategy must consider the long-term effects of taking positions that\nare relatively more difficult to change. In this work, we propose a\nReinforcement Learning (RL) algorithm that trades based on signals from a\nlearned predictive model and addresses these challenges. We test our algorithm\non 20+ years of equity data from Bursa Malaysia.\n","authors":["Vikram Duvvur","Aashay Mehta","Edward Sun","Bo Wu","Ken Yew Chan","Jeff Schneider"],"pdf_url":"https://arxiv.org/pdf/2307.09377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09372v1","updated":"2023-07-18T15:56:39Z","published":"2023-07-18T15:56:39Z","title":"Enhancing Pattern Classification in Support Vector Machines through\n Matrix Formulation","summary":" Support Vector Machines (SVM) have gathered significant acclaim as\nclassifiers due to their successful implementation of Statistical Learning\nTheory. However, in the context of multiclass and multilabel settings, the\nreliance on vector-based formulations in existing SVM-based models poses\nlimitations regarding flexibility and ease of incorporating additional terms to\nhandle specific challenges. To overcome these limitations, our research paper\nfocuses on introducing a matrix formulation for SVM that effectively addresses\nthese constraints. 
By employing the Accelerated Gradient Descent method in the\ndual, we notably enhance the efficiency of solving the Matrix-SVM problem.\nExperimental evaluations on multilabel and multiclass datasets demonstrate that\nMatrix SVM achieves superior time efficacy while delivering similar results to\nBinary Relevance SVM.\n Moreover, our matrix formulation unveils crucial insights and advantages that\nmay not be readily apparent in traditional vector-based notations. We emphasize\nthat numerous multilabel models can be viewed as extensions of SVM, with\ncustomised modifications to meet specific requirements. The matrix formulation\npresented in this paper establishes a solid foundation for developing more\nsophisticated models capable of effectively addressing the distinctive\nchallenges encountered in multilabel learning.\n","authors":["Sambhav Jain Reshma Rastogi"],"pdf_url":"https://arxiv.org/pdf/2307.09372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09366v1","updated":"2023-07-18T15:49:02Z","published":"2023-07-18T15:49:02Z","title":"Sparse Gaussian Graphical Models with Discrete Optimization:\n Computational and Statistical Perspectives","summary":" We consider the problem of learning a sparse graph underlying an undirected\nGaussian graphical model, a key problem in statistical machine learning. Given\n$n$ samples from a multivariate Gaussian distribution with $p$ variables, the\ngoal is to estimate the $p \\times p$ inverse covariance matrix (aka precision\nmatrix), assuming it is sparse (i.e., has a few nonzero entries). We propose\nGraphL0BnB, a new estimator based on an $\\ell_0$-penalized version of the\npseudolikelihood function, while most earlier approaches are based on the\n$\\ell_1$-relaxation. Our estimator can be formulated as a convex mixed integer\nprogram (MIP) which can be difficult to compute at scale using off-the-shelf\ncommercial solvers. To solve the MIP, we propose a custom nonlinear\nbranch-and-bound (BnB) framework that solves node relaxations with tailored\nfirst-order methods. As a by-product of our BnB framework, we propose\nlarge-scale solvers for obtaining good primal solutions that are of independent\ninterest. We derive novel statistical guarantees (estimation and variable\nselection) for our estimator and discuss how our approach improves upon\nexisting estimators. Our numerical experiments on real/synthetic datasets\nsuggest that our method can solve, to near-optimality, problem instances with\n$p = 10^4$ -- corresponding to a symmetric matrix of size $p \\times p$ with\n$p^2/2$ binary variables. We demonstrate the usefulness of GraphL0BnB versus\nvarious state-of-the-art approaches on a range of datasets.\n","authors":["Kayhan Behdin","Wenyu Chen","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2307.09366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09365v1","updated":"2023-07-18T15:48:53Z","published":"2023-07-18T15:48:53Z","title":"An Evaluation of Zero-Cost Proxies -- from Neural Architecture\n Performance to Model Robustness","summary":" Zero-cost proxies are nowadays frequently studied and used to search for\nneural architectures. They show an impressive ability to predict the\nperformance of architectures by making use of their untrained weights. These\ntechniques allow for immense search speed-ups. So far the joint search for\nwell-performing and robust architectures has received much less attention in\nthe field of NAS. 
Therefore, the main focus of zero-cost proxies is the clean\naccuracy of architectures, whereas the model robustness should play an evenly\nimportant part. In this paper, we analyze the ability of common zero-cost\nproxies to serve as performance predictors for robustness in the popular\nNAS-Bench-201 search space. We are interested in the single prediction task for\nrobustness and the joint multi-objective of clean and robust accuracy. We\nfurther analyze the feature importance of the proxies and show that predicting\nthe robustness makes the prediction task from existing zero-cost proxies more\nchallenging. As a result, the joint consideration of several proxies becomes\nnecessary to predict a model's robustness while the clean accuracy can be\nregressed from a single such feature.\n","authors":["Jovita Lukasik","Michael Moeller","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.09365v1.pdf","comment":"Accepted at DAGM GCPR 2023"},{"id":"http://arxiv.org/abs/2307.09361v1","updated":"2023-07-18T15:46:20Z","published":"2023-07-18T15:46:20Z","title":"MOCA: Self-supervised Representation Learning by Predicting Masked\n Online Codebook Assignments","summary":" Self-supervised learning can be used for mitigating the greedy needs of\nVision Transformer networks for very large fully-annotated datasets. Different\nclasses of self-supervised learning offer representations with either good\ncontextual reasoning properties, e.g., using masked image modeling strategies,\nor invariance to image perturbations, e.g., with contrastive methods. In this\nwork, we propose a single-stage and standalone method, MOCA, which unifies both\ndesired properties using novel mask-and-predict objectives defined with\nhigh-level features (instead of pixel-level details). Moreover, we show how to\neffectively employ both learning paradigms in a synergistic and\ncomputation-efficient way. Doing so, we achieve new state-of-the-art results on\nlow-shot settings and strong experimental results in various evaluation\nprotocols with a training that is at least 3 times faster than prior methods.\n","authors":["Spyros Gidaris","Andrei Bursuc","Oriane Simeoni","Antonin Vobecky","Nikos Komodakis","Matthieu Cord","Patrick Pérez"],"pdf_url":"https://arxiv.org/pdf/2307.09361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09357v1","updated":"2023-07-18T15:44:24Z","published":"2023-07-18T15:44:24Z","title":"Using the IBM Analog In-Memory Hardware Acceleration Kit for Neural\n Network Training and Inference","summary":" Analog In-Memory Computing (AIMC) is a promising approach to reduce the\nlatency and energy consumption of Deep Neural Network (DNN) inference and\ntraining. However, the noisy and non-linear device characteristics, and the\nnon-ideal peripheral circuitry in AIMC chips, require adapting DNNs to be\ndeployed on such hardware to achieve equivalent accuracy to digital computing.\nIn this tutorial, we provide a deep dive into how such adaptations can be\nachieved and evaluated using the recently released IBM Analog Hardware\nAcceleration Kit (AIHWKit), freely available at https://github.com/IBM/aihwkit.\nThe AIHWKit is a Python library that simulates inference and training of DNNs\nusing AIMC. We present an in-depth description of the AIHWKit design,\nfunctionality, and best practices to properly perform inference and training.\nWe also present an overview of the Analog AI Cloud Composer, that provides the\nbenefits of using the AIHWKit simulation platform in a fully managed cloud\nsetting. 
Finally, we show examples on how users can expand and customize\nAIHWKit for their own needs. This tutorial is accompanied by comprehensive\nJupyter Notebook code examples that can be run using AIHWKit, which can be\ndownloaded from https://github.com/IBM/aihwkit/tree/master/notebooks/tutorial.\n","authors":["Manuel Le Gallo","Corey Lammie","Julian Buechel","Fabio Carta","Omobayode Fagbohungbe","Charles Mackin","Hsinyu Tsai","Vijay Narayanan","Abu Sebastian","Kaoutar El Maghraoui","Malte J. Rasch"],"pdf_url":"https://arxiv.org/pdf/2307.09357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00422v6","updated":"2023-07-18T15:31:45Z","published":"2023-02-01T13:14:26Z","title":"Robust online active learning","summary":" In many industrial applications, obtaining labeled observations is not\nstraightforward as it often requires the intervention of human experts or the\nuse of expensive testing equipment. In these circumstances, active learning can\nbe highly beneficial in suggesting the most informative data points to be used\nwhen fitting a model. Reducing the number of observations needed for model\ndevelopment alleviates both the computational burden required for training and\nthe operational expenses related to labeling. Online active learning, in\nparticular, is useful in high-volume production processes where the decision\nabout the acquisition of the label for a data point needs to be taken within an\nextremely short time frame. However, despite the recent efforts to develop\nonline active learning strategies, the behavior of these methods in the\npresence of outliers has not been thoroughly examined. In this work, we\ninvestigate the performance of online active linear regression in contaminated\ndata streams. Our study shows that the currently available query strategies are\nprone to sample outliers, whose inclusion in the training set eventually\ndegrades the predictive performance of the models. To address this issue, we\npropose a solution that bounds the search area of a conditional D-optimal\nalgorithm and uses a robust estimator. Our approach strikes a balance between\nexploring unseen regions of the input space and protecting against outliers.\nThrough numerical simulations, we show that the proposed method is effective in\nimproving the performance of online active learning in the presence of\noutliers, thus expanding the potential applications of this powerful tool.\n","authors":["Davide Cacciarelli","Murat Kulahci","John Sølve Tyssedal"],"pdf_url":"https://arxiv.org/pdf/2302.00422v6.pdf","comment":"Published in Quality and Reliability Engineering International (2023)"},{"id":"http://arxiv.org/abs/2304.12906v2","updated":"2023-07-18T15:31:25Z","published":"2023-04-25T15:21:12Z","title":"The Score-Difference Flow for Implicit Generative Modeling","summary":" Implicit generative modeling (IGM) aims to produce samples of synthetic data\nmatching the characteristics of a target data distribution. Recent work (e.g.\nscore-matching networks, diffusion models) has approached the IGM problem from\nthe perspective of pushing synthetic source data toward the target distribution\nvia dynamical perturbations or flows in the ambient space. In this direction,\nwe present the score difference (SD) between arbitrary target and source\ndistributions as a flow that optimally reduces the Kullback-Leibler divergence\nbetween them while also solving the Schroedinger bridge problem. 
We apply the\nSD flow to convenient proxy distributions, which are aligned if and only if the\noriginal distributions are aligned. We demonstrate the formal equivalence of\nthis formulation to denoising diffusion models under certain conditions. We\nalso show that the training of generative adversarial networks includes a\nhidden data-optimization sub-problem, which induces the SD flow under certain\nchoices of loss function when the discriminator is optimal. As a result, the SD\nflow provides a theoretical link between model classes that individually\naddress the three challenges of the \"generative modeling trilemma\" -- high\nsample quality, mode coverage, and fast sampling -- thereby setting the stage\nfor a unified approach.\n","authors":["Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2304.12906v2.pdf","comment":"25 pages, 5 figures, 4 tables. To appear in Transactions on Machine\n Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2307.04550v2","updated":"2023-07-18T15:30:30Z","published":"2023-07-10T13:29:23Z","title":"Gradient Surgery for One-shot Unlearning on Generative Model","summary":" Recent regulation on right-to-be-forgotten emerges tons of interest in\nunlearning pre-trained machine learning models. While approximating a\nstraightforward yet expensive approach of retrain-from-scratch, recent machine\nunlearning methods unlearn a sample by updating weights to remove its influence\non the weight parameters. In this paper, we introduce a simple yet effective\napproach to remove a data influence on the deep generative model. Inspired by\nworks in multi-task learning, we propose to manipulate gradients to regularize\nthe interplay of influence among samples by projecting gradients onto the\nnormal plane of the gradients to be retained. Our work is agnostic to\nstatistics of the removal samples, outperforming existing baselines while\nproviding theoretical analysis for the first time in unlearning a generative\nmodel.\n","authors":["Seohui Bae","Seoyoon Kim","Hyemin Jung","Woohyung Lim"],"pdf_url":"https://arxiv.org/pdf/2307.04550v2.pdf","comment":"ICML 2023 Workshop on Generative AI & Law"},{"id":"http://arxiv.org/abs/2307.08674v2","updated":"2023-07-18T15:29:00Z","published":"2023-07-17T17:36:09Z","title":"TableGPT: Towards Unifying Tables, Nature Language and Commands into One\n GPT","summary":" Tables are prevalent in real-world databases, requiring significant time and\neffort for humans to analyze and manipulate. The advancements in large language\nmodels (LLMs) have made it possible to interact with tables using natural\nlanguage input, bringing this capability closer to reality. In this paper, we\npresent TableGPT, a unified fine-tuned framework that enables LLMs to\nunderstand and operate on tables using external functional commands. It\nintroduces the capability to seamlessly interact with tables, enabling a wide\nrange of functionalities such as question answering, data manipulation (e.g.,\ninsert, delete, query, and modify operations), data visualization, analysis\nreport generation, and automated prediction. TableGPT aims to provide\nconvenience and accessibility to users by empowering them to effortlessly\nleverage tabular data. At the core of TableGPT lies the novel concept of global\ntabular representations, which empowers LLMs to gain a comprehensive\nunderstanding of the entire table beyond meta-information. 
By jointly training\nLLMs on both table and text modalities, TableGPT achieves a deep understanding\nof tabular data and the ability to perform complex operations on tables through\nchain-of-command instructions. Importantly, TableGPT offers the advantage of\nbeing a self-contained system rather than relying on external API interfaces.\nMoreover, it supports efficient data process flow, query rejection (when\nappropriate) and private deployment, enabling faster domain data fine-tuning\nand ensuring data privacy, which enhances the framework's adaptability to\nspecific use cases.\n","authors":["Liangyu Zha","Junlin Zhou","Liyao Li","Rui Wang","Qingyi Huang","Saisai Yang","Jing Yuan","Changbao Su","Xiang Li","Aofeng Su","Tao Zhang","Chen Zhou","Kaizhe Shou","Miao Wang","Wufang Zhu","Guoshan Lu","Chao Ye","Yali Ye","Wentao Ye","Yiming Zhang","Xinglong Deng","Jie Xu","Haobo Wang","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.08674v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2204.03719v2","updated":"2023-07-18T15:28:39Z","published":"2022-04-07T20:13:55Z","title":"A survey on learning from imbalanced data streams: taxonomy, challenges,\n empirical study, and reproducible experimental framework","summary":" Class imbalance poses new challenges when it comes to classifying data\nstreams. Many algorithms recently proposed in the literature tackle this\nproblem using a variety of data-level, algorithm-level, and ensemble\napproaches. However, there is a lack of standardized and agreed-upon procedures\nand benchmarks on how to evaluate these algorithms. This work proposes a\nstandardized, exhaustive, and comprehensive experimental framework to evaluate\nalgorithms in a collection of diverse and challenging imbalanced data stream\nscenarios. The experimental study evaluates 24 state-of-the-art data streams\nalgorithms on 515 imbalanced data streams that combine static and dynamic class\nimbalance ratios, instance-level difficulties, concept drift, real-world and\nsemi-synthetic datasets in binary and multi-class scenarios. This leads to a\nlarge-scale experimental study comparing state-of-the-art classifiers in the\ndata stream mining domain. We discuss the advantages and disadvantages of\nstate-of-the-art classifiers in each of these scenarios and we provide general\nrecommendations to end-users for selecting the best algorithms for imbalanced\ndata streams. Additionally, we formulate open challenges and future directions\nfor this domain. Our experimental framework is fully reproducible and easy to\nextend with new methods. This way, we propose a standardized approach to\nconducting experiments in imbalanced data streams that can be used by other\nresearchers to create complete, trustworthy, and fair evaluation of newly\nproposed methods. Our experimental framework can be downloaded from\nhttps://github.com/canoalberto/imbalanced-streams.\n","authors":["Gabriel Aguiar","Bartosz Krawczyk","Alberto Cano"],"pdf_url":"https://arxiv.org/pdf/2204.03719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09342v1","updated":"2023-07-18T15:26:46Z","published":"2023-07-18T15:26:46Z","title":"Learning to Select SAT Encodings for Pseudo-Boolean and Linear Integer\n Constraints","summary":" Many constraint satisfaction and optimisation problems can be solved\neffectively by encoding them as instances of the Boolean Satisfiability problem\n(SAT). 
However, even the simplest types of constraints have many encodings in\nthe literature with widely varying performance, and the problem of selecting\nsuitable encodings for a given problem instance is not trivial. We explore the\nproblem of selecting encodings for pseudo-Boolean and linear constraints using\na supervised machine learning approach. We show that it is possible to select\nencodings effectively using a standard set of features for constraint problems;\nhowever we obtain better performance with a new set of features specifically\ndesigned for the pseudo-Boolean and linear constraints. In fact, we achieve\ngood results when selecting encodings for unseen problem classes. Our results\ncompare favourably to AutoFolio when using the same feature set. We discuss the\nrelative importance of instance features to the task of selecting the best\nencodings, and compare several variations of the machine learning method.\n","authors":["Felix Ulrich-Oltean","Peter Nightingale","James Alfred Walker"],"pdf_url":"https://arxiv.org/pdf/2307.09342v1.pdf","comment":"24 pages, 10 figures, submitted to Constraints Journal (Springer)"},{"id":"http://arxiv.org/abs/2307.08558v2","updated":"2023-07-18T15:23:48Z","published":"2023-07-17T15:15:47Z","title":"Deep Learning with Passive Optical Nonlinear Mapping","summary":" Deep learning has fundamentally transformed artificial intelligence, but the\never-increasing complexity in deep learning models calls for specialized\nhardware accelerators. Optical accelerators can potentially offer enhanced\nperformance, scalability, and energy efficiency. However, achieving nonlinear\nmapping, a critical component of neural networks, remains challenging\noptically. Here, we introduce a design that leverages multiple scattering in a\nreverberating cavity to passively induce optical nonlinear random mapping,\nwithout the need for additional laser power. A key advantage emerging from our\nwork is that we show we can perform optical data compression, facilitated by\nmultiple scattering in the cavity, to efficiently compress and retain vital\ninformation while also decreasing data dimensionality. This allows rapid\noptical information processing and generation of low dimensional mixtures of\nhighly nonlinear features. These are particularly useful for applications\ndemanding high-speed analysis and responses such as in edge computing devices.\nUtilizing rapid optical information processing capabilities, our optical\nplatforms could potentially offer more efficient and real-time processing\nsolutions for a broad range of applications. We demonstrate the efficacy of our\ndesign in improving computational performance across tasks, including\nclassification, image reconstruction, key-point detection, and object\ndetection, all achieved through optical data compression combined with a\ndigital decoder. Notably, we observed high performance, at an extreme\ncompression ratio, for real-time pedestrian detection. 
Our findings pave the\nway for novel algorithms and architectural designs for optical computing.\n","authors":["Fei Xia","Kyungduk Kim","Yaniv Eliezer","Liam Shaughnessy","Sylvain Gigan","Hui Cao"],"pdf_url":"https://arxiv.org/pdf/2307.08558v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.15764v2","updated":"2023-07-18T15:07:01Z","published":"2023-06-27T19:28:41Z","title":"High Fidelity Image Counterfactuals with Probabilistic Causal Models","summary":" We present a general causal generative modelling framework for accurate\nestimation of high fidelity image counterfactuals with deep structural causal\nmodels. Estimation of interventional and counterfactual queries for\nhigh-dimensional structured variables, such as images, remains a challenging\ntask. We leverage ideas from causal mediation analysis and advances in\ngenerative modelling to design new deep causal mechanisms for structured\nvariables in causal models. Our experiments demonstrate that our proposed\nmechanisms are capable of accurate abduction and estimation of direct, indirect\nand total effects as measured by axiomatic soundness of counterfactuals.\n","authors":["Fabio De Sousa Ribeiro","Tian Xia","Miguel Monteiro","Nick Pawlowski","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2306.15764v2.pdf","comment":"ICML2023 publication"},{"id":"http://arxiv.org/abs/2305.16189v2","updated":"2023-07-18T15:05:03Z","published":"2023-05-25T15:49:38Z","title":"Martian time-series unraveled: A multi-scale nested approach with\n factorial variational autoencoders","summary":" Unsupervised source separation involves unraveling an unknown set of source\nsignals recorded through a mixing operator, with limited prior knowledge about\nthe sources, and only access to a dataset of signal mixtures. This problem is\ninherently ill-posed and is further challenged by the variety of time-scales\nexhibited by sources in time series data. Existing methods typically rely on a\npreselected window size that limits their capacity to handle multi-scale\nsources. To address this issue, instead of operating in the time domain, we\npropose an unsupervised multi-scale clustering and source separation framework\nby leveraging wavelet scattering covariances that provide a low-dimensional\nrepresentation of stochastic processes, capable of distinguishing between\ndifferent non-Gaussian stochastic processes. Nested within this representation\nspace, we develop a factorial Gaussian-mixture variational autoencoder that is\ntrained to (1) probabilistically cluster sources at different time-scales and\n(2) independently sample scattering covariance representations associated with\neach cluster. Using samples from each cluster as prior information, we\nformulate source separation as an optimization problem in the wavelet\nscattering covariance representation space, resulting in separated sources in\nthe time domain. When applied to seismic data recorded during the NASA InSight\nmission on Mars, our multi-scale nested approach proves to be a powerful tool\nfor discriminating between sources varying greatly in time-scale, e.g.,\nminute-long transient one-sided pulses (known as ``glitches'') and structured\nambient noises resulting from atmospheric activities that typically last for\ntens of minutes. 
These results provide an opportunity to conduct further\ninvestigations into the isolated sources related to atmospheric-surface\ninteractions, thermal relaxations, and other complex phenomena.\n","authors":["Ali Siahkoohi","Rudy Morel","Randall Balestriero","Erwan Allys","Grégory Sainton","Taichi Kawamura","Maarten V. de Hoop"],"pdf_url":"https://arxiv.org/pdf/2305.16189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09321v1","updated":"2023-07-18T15:03:56Z","published":"2023-07-18T15:03:56Z","title":"Exploiting Field Dependencies for Learning on Categorical Data","summary":" Traditional approaches for learning on categorical data underexploit the\ndependencies between columns (\\aka fields) in a dataset because they rely on\nthe embedding of data points driven alone by the classification/regression\nloss. In contrast, we propose a novel method for learning on categorical data\nwith the goal of exploiting dependencies between fields. Instead of modelling\nstatistics of features globally (i.e., by the covariance matrix of features),\nwe learn a global field dependency matrix that captures dependencies between\nfields and then we refine the global field dependency matrix at the\ninstance-wise level with different weights (so-called local dependency\nmodelling) w.r.t. each field to improve the modelling of the field\ndependencies. Our algorithm exploits the meta-learning paradigm, i.e., the\ndependency matrices are refined in the inner loop of the meta-learning\nalgorithm without the use of labels, whereas the outer loop intertwines the\nupdates of the embedding matrix (the matrix performing projection) and global\ndependency matrix in a supervised fashion (with the use of labels). Our method\nis simple yet it outperforms several state-of-the-art methods on six popular\ndataset benchmarks. Detailed ablation studies provide additional insights into\nour method.\n","authors":["Zhibin Li","Piotr Koniusz","Lu Zhang","Daniel Edward Pagendam","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2307.09321v1.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (submitted June 2022, accepted July 2023)"},{"id":"http://arxiv.org/abs/2307.09320v1","updated":"2023-07-18T15:03:40Z","published":"2023-07-18T15:03:40Z","title":"Biomaker CA: a Biome Maker project using Cellular Automata","summary":" We introduce Biomaker CA: a Biome Maker project using Cellular Automata (CA).\nIn Biomaker CA, morphogenesis is a first class citizen and small seeds need to\ngrow into plant-like organisms to survive in a nutrient starved environment and\neventually reproduce with variation so that a biome survives for long\ntimelines. We simulate complex biomes by means of CA rules in 2D grids and\nparallelize all of its computation on GPUs through the Python JAX framework. We\nshow how this project allows for several different kinds of environments and\nlaws of 'physics', alongside different model architectures and mutation\nstrategies. We further analyze some configurations to show how plant agents can\ngrow, survive, reproduce, and evolve, forming stable and unstable biomes. We\nthen demonstrate how one can meta-evolve models to survive in a harsh\nenvironment either through end-to-end meta-evolution or by a more surgical and\nefficient approach, called Petri dish meta-evolution. Finally, we show how to\nperform interactive evolution, where the user decides how to evolve a plant\nmodel interactively and then deploys it in a larger environment. 
We open source\nBiomaker CA at: https://tinyurl.com/2x8yu34s .\n","authors":["Ettore Randazzo","Alexander Mordvintsev"],"pdf_url":"https://arxiv.org/pdf/2307.09320v1.pdf","comment":"20 pages, 23 figures. For code base, see https://tinyurl.com/2x8yu34s"},{"id":"http://arxiv.org/abs/2307.09312v1","updated":"2023-07-18T14:57:12Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks. In contrast to traditional text-only methods, our approach to\nlabelling a comment as hate speech centers around the holistic analysis of text\nand images. This is done by leveraging graph transformers to capture the\ncontextual relationships in the entire discussion that surrounds a comment,\nwith interwoven fusion layers to combine text and image embeddings instead of\nprocessing different modalities separately. We compare the performance of our\nmodel to baselines that only process text; we also conduct extensive ablation\nstudies. We conclude with future work for multimodal solutions to deliver\nsocial value in online contexts, arguing that capturing a holistic view of a\nconversation greatly advances the effort to detect anti-social behavior.\n","authors":["Liam Hebert","Gaurav Sahu","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v1.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2307.09311v1","updated":"2023-07-18T14:56:12Z","published":"2023-07-18T14:56:12Z","title":"Automatic Differentiation for Inverse Problems with Applications in\n Quantum Transport","summary":" A neural solver and differentiable simulation of the quantum transmitting\nboundary model is presented for the inverse quantum transport problem. The\nneural solver is used to engineer continuous transmission properties and the\ndifferentiable simulation is used to engineer current-voltage characteristics.\n","authors":["Ivan Williams","Eric Polizzi"],"pdf_url":"https://arxiv.org/pdf/2307.09311v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09306v1","updated":"2023-07-18T14:52:08Z","published":"2023-07-18T14:52:08Z","title":"EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory\n Forecasting","summary":" Capturing high-dimensional social interactions and feasible futures is\nessential for predicting trajectories. To address this complex nature, several\nattempts have been devoted to reducing the dimensionality of the output\nvariables via parametric curve fitting such as the B\\'ezier curve and B-spline\nfunction. However, these functions, which originate in computer graphics\nfields, are not suitable to account for socially acceptable human dynamics. In\nthis paper, we present EigenTrajectory ($\\mathbb{ET}$), a trajectory prediction\napproach that uses a novel trajectory descriptor to form a compact space, known\nhere as $\\mathbb{ET}$ space, in place of Euclidean space, for representing\npedestrian movements. We first reduce the complexity of the trajectory\ndescriptor via a low-rank approximation. We transform the pedestrians' history\npaths into our $\\mathbb{ET}$ space represented by spatio-temporal principle\ncomponents, and feed them into off-the-shelf trajectory forecasting models. 
The\ninputs and outputs of the models as well as social interactions are all\ngathered and aggregated in the corresponding $\\mathbb{ET}$ space. Lastly, we\npropose a trajectory anchor-based refinement method to cover all possible\nfutures in the proposed $\\mathbb{ET}$ space. Extensive experiments demonstrate\nthat our EigenTrajectory predictor can significantly improve both the\nprediction accuracy and reliability of existing trajectory forecasting models\non public benchmarks, indicating that the proposed descriptor is suited to\nrepresent pedestrian behaviors. Code is publicly available at\nhttps://github.com/inhwanbae/EigenTrajectory .\n","authors":["Inhwan Bae","Jean Oh","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2307.09306v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2304.02011v2","updated":"2023-07-18T14:45:44Z","published":"2023-04-04T17:59:09Z","title":"FakET: Simulating Cryo-Electron Tomograms with Neural Style Transfer","summary":" Particle localization and -classification constitute two of the most\nfundamental problems in computational microscopy. In recent years, deep\nlearning based approaches have been introduced for these tasks with great\nsuccess. A key shortcoming of these supervised learning methods is their need\nfor large training data sets, typically generated from particle models in\nconjunction with complex numerical forward models simulating the physics of\ntransmission electron microscopes. Computer implementations of such forward\nmodels are computationally extremely demanding and limit the scope of their\napplicability. In this paper we propose a method for simulating the forward\noperator of an electron microscope based on additive noise and Neural Style\nTransfer techniques. We evaluate the method on localization and classification\ntasks using one of the established state-of-the-art architectures showing\nperformance on par with the benchmark. In contrast to previous approaches, our\nmethod accelerates the data generation process by a factor of 750 while using\n33 times less memory and scales well to typical transmission electron\nmicroscope detector sizes. It utilizes GPU acceleration and parallel\nprocessing. It can be used to adapt a synthetic training data set according to\nreference data from any transmission electron microscope. The source code is\navailable at https://gitlab.com/deepet/faket.\n","authors":["Pavol Harar","Lukas Herrmann","Philipp Grohs","David Haselbach"],"pdf_url":"https://arxiv.org/pdf/2304.02011v2.pdf","comment":"18 pages, 1 table, 16 figures. Included fine-tuning, ablation, and\n noiseless experiments"},{"id":"http://arxiv.org/abs/2307.09302v1","updated":"2023-07-18T14:40:48Z","published":"2023-07-18T14:40:48Z","title":"Conformal prediction under ambiguous ground truth","summary":" In safety-critical classification tasks, conformal prediction allows to\nperform rigorous uncertainty quantification by providing confidence sets\nincluding the true class with a user-specified probability. This generally\nassumes the availability of a held-out calibration set with access to ground\ntruth labels. Unfortunately, in many domains, such labels are difficult to\nobtain and usually approximated by aggregating expert opinions. In fact, this\nholds true for almost all datasets, including well-known ones such as CIFAR and\nImageNet. Applying conformal prediction using such labels underestimates\nuncertainty. Indeed, when expert opinions are not resolvable, there is inherent\nambiguity present in the labels. 
That is, we do not have ``crisp'', definitive\nground truth labels and this uncertainty should be taken into account during\ncalibration. In this paper, we develop a conformal prediction framework for\nsuch ambiguous ground truth settings which relies on an approximation of the\nunderlying posterior distribution of labels given inputs. We demonstrate our\nmethodology on synthetic and real datasets, including a case study of skin\ncondition classification in dermatology.\n","authors":["David Stutz","Abhijit Guha Roy","Tatiana Matejovicova","Patricia Strachan","Ali Taylan Cemgil","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2307.09302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09286v1","updated":"2023-07-18T14:30:47Z","published":"2023-07-18T14:30:47Z","title":"FlexiAST: Flexibility is What AST Needs","summary":" The objective of this work is to give patch-size flexibility to Audio\nSpectrogram Transformers (AST). Recent advancements in ASTs have shown superior\nperformance in various audio-based tasks. However, the performance of standard\nASTs degrades drastically when evaluated using different patch sizes from that\nused during training. As a result, AST models are typically re-trained to\naccommodate changes in patch sizes. To overcome this limitation, this paper\nproposes a training procedure to provide flexibility to standard AST models\nwithout architectural changes, allowing them to work with various patch sizes\nat the inference stage - FlexiAST. This proposed training approach simply\nutilizes random patch size selection and resizing of patch and positional\nembedding weights. Our experiments show that FlexiAST gives similar performance\nto standard AST models while maintaining its evaluation ability at various\npatch sizes on different datasets for audio classification tasks.\n","authors":["Jiu Feng","Mehmet Hamza Erol","Joon Son Chung","Arda Senocak"],"pdf_url":"https://arxiv.org/pdf/2307.09286v1.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2302.00999v2","updated":"2023-07-18T14:19:07Z","published":"2023-02-02T10:37:23Z","title":"High-Probability Bounds for Stochastic Optimization and Variational\n Inequalities: the Case of Unbounded Variance","summary":" During recent years the interest of optimization and machine learning\ncommunities in high-probability convergence of stochastic optimization methods\nhas been growing. One of the main reasons for this is that high-probability\ncomplexity bounds are more accurate and less studied than in-expectation ones.\nHowever, SOTA high-probability non-asymptotic convergence results are derived\nunder strong assumptions such as the boundedness of the gradient noise variance\nor of the objective's gradient itself. In this paper, we propose several\nalgorithms with high-probability convergence results under less restrictive\nassumptions. In particular, we derive new high-probability convergence results\nunder the assumption that the gradient/operator noise has bounded central\n$\\alpha$-th moment for $\\alpha \\in (1,2]$ in the following setups: (i) smooth\nnon-convex / Polyak-Lojasiewicz / convex / strongly convex / quasi-strongly\nconvex minimization problems, (ii) Lipschitz / star-cocoercive and monotone /\nquasi-strongly monotone variational inequalities. 
These results justify the\nusage of the considered methods for solving problems that do not fit standard\nfunctional classes studied in stochastic optimization.\n","authors":["Abdurakhmon Sadiev","Marina Danilova","Eduard Gorbunov","Samuel Horváth","Gauthier Gidel","Pavel Dvurechensky","Alexander Gasnikov","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2302.00999v2.pdf","comment":"ICML 2023. 86 pages. Changes in v2: ICML formatting was applied along\n with minor edits of the text"},{"id":"http://arxiv.org/abs/2307.08535v2","updated":"2023-07-18T14:11:18Z","published":"2023-07-17T14:52:52Z","title":"Multi-class point cloud completion networks for 3D cardiac anatomy\n reconstruction from cine magnetic resonance images","summary":" Cine magnetic resonance imaging (MRI) is the current gold standard for the\nassessment of cardiac anatomy and function. However, it typically only acquires\na set of two-dimensional (2D) slices of the underlying three-dimensional (3D)\nanatomy of the heart, thus limiting the understanding and analysis of both\nhealthy and pathological cardiac morphology and physiology. In this paper, we\npropose a novel fully automatic surface reconstruction pipeline capable of\nreconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI\nacquisitions. Its key component is a multi-class point cloud completion network\n(PCCN) capable of correcting both the sparsity and misalignment issues of the\n3D reconstruction task in a unified model. We first evaluate the PCCN on a\nlarge synthetic dataset of biventricular anatomies and observe Chamfer\ndistances between reconstructed and gold standard anatomies below or similar to\nthe underlying image resolution for multiple levels of slice misalignment.\nFurthermore, we find a reduction in reconstruction error compared to a\nbenchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean\nsurface distance, respectively. We then apply the PCCN as part of our automated\nreconstruction pipeline to 1000 subjects from the UK Biobank study in a\ncross-domain transfer setting and demonstrate its ability to reconstruct\naccurate and topologically plausible biventricular heart meshes with clinical\nmetrics comparable to the previous literature. Finally, we investigate the\nrobustness of our proposed approach and observe its capacity to successfully\nhandle multiple common outlier conditions.\n","authors":["Marcel Beetz","Abhirup Banerjee","Julius Ossenberg-Engels","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.08535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06534v3","updated":"2023-07-18T14:10:16Z","published":"2023-06-10T22:24:42Z","title":"K-Tensors: Clustering Positive Semi-Definite Matrices","summary":" This paper introduces a novel self-consistency clustering algorithm\n($K$-Tensors) designed for {partitioning a distribution of}\npositive-semidefinite matrices based on their eigenstructures. As positive\nsemi-definite matrices can be represented as ellipsoids in $\\mathbb R^p$, $p\n\\ge 2$, it is critical to maintain their structural information to perform\neffective clustering. However, traditional clustering algorithms {applied to\nmatrices} often {involve vectorization of} the matrices, resulting in a loss of\nessential structural information. To address this issue, we propose a distance\nmetric {for clustering} that is specifically based on the structural\ninformation of positive semi-definite matrices. 
This distance metric enables\nthe clustering algorithm to consider the differences between positive\nsemi-definite matrices and their projections onto a common space spanned by\northonormal vectors defined from a set of positive semi-definite\nmatrices. This innovative approach to clustering positive semi-definite\nmatrices has broad applications in several domains including financial and\nbiomedical research, such as analyzing functional connectivity data. By\nmaintaining the structural information of positive semi-definite matrices, our\nproposed algorithm promises to cluster the positive semi-definite matrices in a\nmore meaningful way, thereby facilitating deeper insights into the underlying\ndata in various applications.\n","authors":["Hanchao Zhang","Thaddeus Tarpey"],"pdf_url":"https://arxiv.org/pdf/2306.06534v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10381v3","updated":"2023-07-18T13:55:14Z","published":"2022-06-21T13:28:57Z","title":"TabText: A Flexible and Contextual Approach to Tabular Data\n Representation","summary":" Tabular data is essential for applying machine learning tasks across various\nindustries. However, traditional data processing methods do not fully utilize\nall the information available in the tables, ignoring important contextual\ninformation such as column header descriptions. In addition, pre-processing\ndata into a tabular format can remain a labor-intensive bottleneck in model\ndevelopment. This work introduces TabText, a processing and feature extraction\nframework that extracts contextual information from tabular data structures.\nTabText addresses processing difficulties by converting the content into\nlanguage and utilizing pre-trained large language models (LLMs). We evaluate\nour framework on nine healthcare prediction tasks ranging from patient\ndischarge, ICU admission, and mortality. We show that 1) applying our TabText\nframework enables the generation of high-performing and simple machine learning\nbaseline models with minimal data pre-processing, and 2) augmenting\npre-processed tabular data with TabText representations improves the average\nand worst-case AUC performance of standard machine learning models by as much\nas 6%.\n","authors":["Kimberly Villalobos Carballo","Liangyuan Na","Yu Ma","Léonard Boussioux","Cynthia Zeng","Luis R. Soenksen","Dimitris Bertsimas"],"pdf_url":"https://arxiv.org/pdf/2206.10381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09269v1","updated":"2023-07-18T13:52:12Z","published":"2023-07-18T13:52:12Z","title":"End-to-End Neural Network Training for Hyperbox-Based Classification","summary":" Hyperbox-based classification has been seen as a promising technique in which\ndecisions on the data are represented as a series of orthogonal,\nmultidimensional boxes (i.e., hyperboxes) that are often interpretable and\nhuman-readable. However, existing methods are no longer capable of efficiently\nhandling the increasing volume of data many application domains face nowadays.\nWe address this gap by proposing a novel, fully differentiable framework for\nhyperbox-based classification via neural networks. 
In contrast to previous\nwork, our hyperbox models can be efficiently trained in an end-to-end fashion,\nwhich leads to significantly reduced training times and superior classification\nresults.\n","authors":["Denis Mayr Lima Martins","Christian Lülf","Fabian Gieseke"],"pdf_url":"https://arxiv.org/pdf/2307.09269v1.pdf","comment":"6 pages, accepted for poster presentation at ESANN 2023"},{"id":"http://arxiv.org/abs/2307.09263v1","updated":"2023-07-18T13:48:05Z","published":"2023-07-18T13:48:05Z","title":"Mobility-Aware Joint User Scheduling and Resource Allocation for Low\n Latency Federated Learning","summary":" As an efficient distributed machine learning approach, Federated learning\n(FL) can obtain a shared model by iterative local model training at the user\nside and global model aggregating at the central server side, thereby\nprotecting privacy of users. Mobile users in FL systems typically communicate\nwith base stations (BSs) via wireless channels, where training performance\ncould be degraded due to unreliable access caused by user mobility. However,\nexisting work only investigates a static scenario or random initialization of\nuser locations, which fail to capture mobility in real-world networks. To\ntackle this issue, we propose a practical model for user mobility in FL across\nmultiple BSs, and develop a user scheduling and resource allocation method to\nminimize the training delay with constrained communication resources.\nSpecifically, we first formulate an optimization problem with user mobility\nthat jointly considers user selection, BS assignment to users, and bandwidth\nallocation to minimize the latency in each communication round. This\noptimization problem turned out to be NP-hard and we proposed a delay-aware\ngreedy search algorithm (DAGSA) to solve it. Simulation results show that the\nproposed algorithm achieves better performance than the state-of-the-art\nbaselines and a certain level of user mobility could improve training\nperformance.\n","authors":["Kecheng Fan","Wen Chen","Jun Li","Xiumei Deng","Xuefeng Han","Ming Ding"],"pdf_url":"https://arxiv.org/pdf/2307.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09259v1","updated":"2023-07-18T13:43:53Z","published":"2023-07-18T13:43:53Z","title":"Adaptive Topological Feature via Persistent Homology: Filtration\n Learning for Point Clouds","summary":" Machine learning for point clouds has been attracting much attention, with\nmany applications in various fields, such as shape recognition and material\nscience. To enhance the accuracy of such machine learning methods, it is known\nto be effective to incorporate global topological features, which are typically\nextracted by persistent homology. In the calculation of persistent homology for\na point cloud, we need to choose a filtration for the point clouds, an\nincreasing sequence of spaces. Because the performance of machine learning\nmethods combined with persistent homology is highly affected by the choice of a\nfiltration, we need to tune it depending on data and tasks. In this paper, we\npropose a framework that learns a filtration adaptively with the use of neural\nnetworks. In order to make the resulting persistent homology\nisometry-invariant, we develop a neural network architecture with such\ninvariance. Additionally, we theoretically show a finite-dimensional\napproximation result that justifies our architecture. 
Experimental results\ndemonstrated the efficacy of our framework in several classification tasks.\n","authors":["Naoki Nishikawa","Yuichi Ike","Kenji Yamanishi"],"pdf_url":"https://arxiv.org/pdf/2307.09259v1.pdf","comment":"17 pages with 4 figures"},{"id":"http://arxiv.org/abs/2307.09254v1","updated":"2023-07-18T13:36:24Z","published":"2023-07-18T13:36:24Z","title":"PAC Neural Prediction Set Learning to Quantify the Uncertainty of\n Generative Language Models","summary":" Uncertainty learning and quantification of models are crucial tasks to\nenhance the trustworthiness of the models. Importantly, the recent surge of\ngenerative language models (GLMs) emphasizes the need for reliable uncertainty\nquantification due to the concerns on generating hallucinated facts. In this\npaper, we propose to learn neural prediction set models that comes with the\nprobably approximately correct (PAC) guarantee for quantifying the uncertainty\nof GLMs. Unlike existing prediction set models, which are parameterized by a\nscalar value, we propose to parameterize prediction sets via neural networks,\nwhich achieves more precise uncertainty quantification but still satisfies the\nPAC guarantee. We demonstrate the efficacy of our method on four types of\nlanguage datasets and six types of models by showing that our method improves\nthe quantified uncertainty by $63\\%$ on average, compared to a standard\nbaseline method.\n","authors":["Sangdon Park","Taesoo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05118v2","updated":"2023-07-18T13:29:53Z","published":"2023-03-09T08:57:01Z","title":"SLCA: Slow Learner with Classifier Alignment for Continual Learning on a\n Pre-trained Model","summary":" The goal of continual learning is to improve the performance of recognition\nmodels in learning sequentially arrived data. Although most existing works are\nestablished on the premise of learning from scratch, growing efforts have been\ndevoted to incorporating the benefits of pre-training. However, how to\nadaptively exploit the pre-trained knowledge for each incremental task while\nmaintaining its generalizability remains an open question. In this work, we\npresent an extensive analysis for continual learning on a pre-trained model\n(CLPM), and attribute the key challenge to a progressive overfitting problem.\nObserving that selectively reducing the learning rate can almost resolve this\nissue in the representation layer, we propose a simple but extremely effective\napproach named Slow Learner with Classifier Alignment (SLCA), which further\nimproves the classification layer by modeling the class-wise distributions and\naligning the classification layers in a post-hoc fashion. Across a variety of\nscenarios, our proposal provides substantial improvements for CLPM (e.g., up to\n49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split\nCUB-200 and Split Cars-196, respectively), and thus outperforms\nstate-of-the-art approaches by a large margin. 
Based on such a strong baseline,\ncritical factors and promising directions are analyzed in-depth to facilitate\nsubsequent research.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2303.05118v2.pdf","comment":"11 pages, 8 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09249v1","updated":"2023-07-18T13:28:31Z","published":"2023-07-18T13:28:31Z","title":"UniTabE: Pretraining a Unified Tabular Encoder for Heterogeneous Tabular\n Data","summary":" Recent advancements in Natural Language Processing (NLP) have witnessed the\ngroundbreaking impact of pretrained models, yielding impressive outcomes across\nvarious tasks. This study seeks to extend the power of pretraining\nmethodologies to tabular data, a domain traditionally overlooked, yet\ninherently challenging due to the plethora of table schemas intrinsic to\ndifferent tasks. The primary research questions underpinning this work revolve\naround the adaptation to heterogeneous table structures, the establishment of a\nuniversal pretraining protocol for tabular data, the generalizability and\ntransferability of learned knowledge across tasks, the adaptation to diverse\ndownstream applications, and the incorporation of incremental columns over\ntime. In response to these challenges, we introduce UniTabE, a pioneering\nmethod designed to process tables in a uniform manner, devoid of constraints\nimposed by specific table structures. UniTabE's core concept relies on\nrepresenting each basic table element with a module, termed TabUnit. This is\nsubsequently followed by a Transformer encoder to refine the representation.\nMoreover, our model is designed to facilitate pretraining and finetuning\nthrough the utilization of free-form prompts. In order to implement the\npretraining phase, we curated an expansive tabular dataset comprising\napproximately 13 billion samples, meticulously gathered from the Kaggle\nplatform. Rigorous experimental testing and analyses were performed under a\nmyriad of scenarios to validate the effectiveness of our methodology. The\nexperimental results demonstrate UniTabE's superior performance against several\nbaseline models across a multitude of benchmark datasets. This, therefore,\nunderscores UniTabE's potential to significantly enhance the semantic\nrepresentation of tabular data, thereby marking a significant stride in the\nfield of tabular data analysis.\n","authors":["Yazheng Yang","Yuqi Wang","Guang Liu","Ledell Wu","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09249v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.09248v1","updated":"2023-07-18T13:28:30Z","published":"2023-07-18T13:28:30Z","title":"Application of BERT in Wind Power Forecasting-Teletraan's Solution in\n Baidu KDD Cup 2022","summary":" Nowadays, wind energy has drawn increasing attention as its important role in\ncarbon neutrality and sustainable development. When wind power is integrated\ninto the power grid, precise forecasting is necessary for the sustainability\nand security of the system. However, the unpredictable nature and long sequence\nprediction make it especially challenging. In this technical report, we\nintroduce the BERT model applied for Baidu KDD Cup 2022, and the daily\nfluctuation is added by post-processing to make the predicted results in line\nwith daily periodicity. Our solution achieves 3rd place of 2490 teams. 
The code\nis released at https://github.com/LongxingTan/KDD2022-Baidu\n","authors":["Longxing Tan","Hongying Yue"],"pdf_url":"https://arxiv.org/pdf/2307.09248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09244v1","updated":"2023-07-18T13:23:23Z","published":"2023-07-18T13:23:23Z","title":"Towards Sustainable Deep Learning for Multi-Label Classification on NILM","summary":" Non-intrusive load monitoring (NILM) is the process of obtaining\nappliance-level data from a single metering point, measuring total electricity\nconsumption of a household or a business. Appliance-level data can be directly\nused for demand response applications and energy management systems as well as\nfor awareness raising and motivation for improvements in energy efficiency and\nreduction in the carbon footprint. Recently, classical machine learning and\ndeep learning (DL) techniques became very popular and proved as highly\neffective for NILM classification, but with the growing complexity these\nmethods are faced with significant computational and energy demands during both\ntheir training and operation. In this paper, we introduce a novel DL model\naimed at enhanced multi-label classification of NILM with improved computation\nand energy efficiency. We also propose a testing methodology for comparison of\ndifferent models using data synthesized from the measurement datasets so as to\nbetter represent real-world scenarios. Compared to the state-of-the-art, the\nproposed model has its carbon footprint reduced by more than 23% while\nproviding on average approximately 8 percentage points in performance\nimprovement when testing on data derived from REFIT and UK-DALE datasets.\n","authors":["Anže Pirnat","Blaž Bertalanič","Gregor Cerar","Mihael Mohorčič","Carolina Fortuna"],"pdf_url":"https://arxiv.org/pdf/2307.09244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09238v1","updated":"2023-07-18T13:18:52Z","published":"2023-07-18T13:18:52Z","title":"Fusing Hand and Body Skeletons for Human Action Recognition in Assembly","summary":" As collaborative robots (cobots) continue to gain popularity in industrial\nmanufacturing, effective human-robot collaboration becomes crucial. Cobots\nshould be able to recognize human actions to assist with assembly tasks and act\nautonomously. To achieve this, skeleton-based approaches are often used due to\ntheir ability to generalize across various people and environments. Although\nbody skeleton approaches are widely used for action recognition, they may not\nbe accurate enough for assembly actions where the worker's fingers and hands\nplay a significant role. To address this limitation, we propose a method in\nwhich less detailed body skeletons are combined with highly detailed hand\nskeletons. We investigate CNNs and transformers, the latter of which are\nparticularly adept at extracting and combining important information from both\nskeleton types using attention. 
This paper demonstrates the effectiveness of\nour proposed approach in enhancing action recognition in assembly scenarios.\n","authors":["Dustin Aganian","Mona Köhler","Benedict Stephan","Markus Eisenbach","Horst-Michael Gross"],"pdf_url":"https://arxiv.org/pdf/2307.09238v1.pdf","comment":"International Conference on Artificial Neural Networks (ICANN) 2023"},{"id":"http://arxiv.org/abs/2305.07848v3","updated":"2023-07-18T13:13:36Z","published":"2023-05-13T06:27:33Z","title":"Meta-Polyp: a baseline for efficient Polyp segmentation","summary":" In recent years, polyp segmentation has gained significant importance, and\nmany methods have been developed using CNN, Vision Transformer, and Transformer\ntechniques to achieve competitive results. However, these methods often face\ndifficulties when dealing with out-of-distribution datasets, missing\nboundaries, and small polyps. In 2022, Meta-Former was introduced as a new\nbaseline for vision, which not only improved the performance of multi-task\ncomputer vision but also addressed the limitations of the Vision Transformer\nand CNN family backbones. To further enhance segmentation, we propose a fusion\nof Meta-Former with UNet, along with the introduction of a Multi-scale\nUpsampling block with a level-up combination in the decoder stage to enhance\nthe texture; we also propose the Convformer block based on the idea of the\nMeta-former to enhance the crucial information of the local feature. These\nblocks enable the combination of global information, such as the overall shape\nof the polyp, with local information and boundary information, which is crucial\nfor the decision of the medical segmentation. Our proposed approach achieved\ncompetitive performance and obtained the top result in the State of the Art on\nthe CVC-300 dataset, Kvasir, and CVC-ColonDB dataset. Apart from Kvasir-SEG,\nothers are out-of-distribution datasets. The implementation can be found at:\nhttps://github.com/huyquoctrinh/MetaPolyp-CBMS2023.\n","authors":["Quoc-Huy Trinh"],"pdf_url":"https://arxiv.org/pdf/2305.07848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09230v1","updated":"2023-07-18T13:06:17Z","published":"2023-07-18T13:06:17Z","title":"Detecting Throat Cancer from Speech Signals Using Machine Learning: A\n Reproducible Literature Review","summary":" In this work we perform a scoping review of the current literature on the\ndetection of throat cancer from speech recordings using machine learning and\nartificial intelligence. We find 22 papers within this area and discuss their\nmethods and results. We split these papers into two groups - nine performing\nbinary classification, and 13 performing multi-class classification. The papers\npresent a range of methods with neural networks being most commonly\nimplemented. Many features are also extracted from the audio before\nclassification, with the most common being mel-frequency cepstral coefficients.\nNone of the papers found in this search have associated code repositories and\nas such are not reproducible. Therefore, we create a publicly available code\nrepository of our own classifiers. We use transfer learning on a multi-class\nproblem, classifying three pathologies and healthy controls. Using this\ntechnique we achieve an unweighted average recall of 53.54%, sensitivity of\n83.14%, and specificity of 64.00%. 
We compare our classifiers with the results\nobtained on the same dataset and find similar results.\n","authors":["Mary Paterson","James Moor","Luisa Cutillo"],"pdf_url":"https://arxiv.org/pdf/2307.09230v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.13115v2","updated":"2023-07-18T12:49:53Z","published":"2023-05-22T15:13:51Z","title":"Causal-Based Supervision of Attention in Graph Neural Network: A Better\n and Simpler Choice towards Powerful Attention","summary":" Recent years have witnessed the great potential of attention mechanism in\ngraph representation learning. However, while variants of attention-based GNNs\nare setting new benchmarks for numerous real-world datasets, recent works have\npointed out that their induced attentions are less robust and generalizable\nagainst noisy graphs due to lack of direct supervision. In this paper, we\npresent a new framework which utilizes the tool of causality to provide a\npowerful supervision signal for the learning process of attention functions.\nSpecifically, we estimate the direct causal effect of attention to the final\nprediction, and then maximize such effect to guide attention attending to more\nmeaningful neighbors. Our method can serve as a plug-and-play module for any\ncanonical attention-based GNNs in an end-to-end fashion. Extensive experiments\non a wide range of benchmark datasets illustrated that, by directly supervising\nattention functions, the model is able to converge faster with a clearer\ndecision boundary, and thus yields better performances.\n","authors":["Hongjun Wang","Jiyuan Chen","Lun Du","Qiang Fu","Shi Han","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2305.13115v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09212v1","updated":"2023-07-18T12:47:35Z","published":"2023-07-18T12:47:35Z","title":"How Many Neurons Does it Take to Approximate the Maximum?","summary":" We study the size of a neural network needed to approximate the maximum\nfunction over $d$ inputs, in the most basic setting of approximating with\nrespect to the $L_2$ norm, for continuous distributions, for a network that\nuses ReLU activations. We provide new lower and upper bounds on the width\nrequired for approximation across various depths. Our results establish new\ndepth separations between depth 2 and 3, and depth 3 and 5 networks, as well as\nproviding a depth $\\mathcal{O}(\\log(\\log(d)))$ and width $\\mathcal{O}(d)$\nconstruction which approximates the maximum function, significantly improving\nupon the depth requirements of the best previously known bounds for networks\nwith linearly-bounded width. Our depth separation results are facilitated by a\nnew lower bound for depth 2 networks approximating the maximum function over\nthe uniform distribution, assuming an exponential upper bound on the size of\nthe weights. Furthermore, we are able to use this depth 2 lower bound to\nprovide tight bounds on the number of neurons needed to approximate the maximum\nby a depth 3 network. 
Our lower bounds are of potentially broad interest as\nthey apply to the widely studied and used \\emph{max} function, in contrast to\nmany previous results that base their bounds on specially constructed or\npathological functions and distributions.\n","authors":["Itay Safran","Daniel Reichman","Paul Valiant"],"pdf_url":"https://arxiv.org/pdf/2307.09212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09209v1","updated":"2023-07-18T12:45:54Z","published":"2023-07-18T12:45:54Z","title":"Automated Ableism: An Exploration of Explicit Disability Biases in\n Sentiment and Toxicity Analysis Models","summary":" We analyze sentiment analysis and toxicity detection models to detect the\npresence of explicit bias against people with disability (PWD). We employ the\nbias identification framework of Perturbation Sensitivity Analysis to examine\nconversations related to PWD on social media platforms, specifically Twitter\nand Reddit, in order to gain insight into how disability bias is disseminated\nin real-world social settings. We then create the \\textit{Bias Identification\nTest in Sentiment} (BITS) corpus to quantify explicit disability bias in any\nsentiment analysis and toxicity detection models. Our study utilizes BITS to\nuncover significant biases in four open AIaaS (AI as a Service) sentiment\nanalysis tools, namely TextBlob, VADER, Google Cloud Natural Language API,\nDistilBERT and two toxicity detection models, namely two versions of\nToxic-BERT. Our findings indicate that all of these models exhibit\nstatistically significant explicit bias against PWD.\n","authors":["Pranav Narayanan Venkit","Mukund Srinath","Shomir Wilson"],"pdf_url":"https://arxiv.org/pdf/2307.09209v1.pdf","comment":"TrustNLP at ACL 2023"},{"id":"http://arxiv.org/abs/2307.09206v1","updated":"2023-07-18T12:42:59Z","published":"2023-07-18T12:42:59Z","title":"Context-Conditional Navigation with a Learning-Based Terrain- and\n Robot-Aware Dynamics Model","summary":" In autonomous navigation settings, several quantities can be subject to\nvariations. Terrain properties such as friction coefficients may vary over time\ndepending on the location of the robot. Also, the dynamics of the robot may\nchange due to, e.g., different payloads, changing the system's mass, or wear\nand tear, changing actuator gains or joint friction. An autonomous agent should\nthus be able to adapt to such variations. In this paper, we develop a novel\nprobabilistic, terrain- and robot-aware forward dynamics model, termed TRADYN,\nwhich is able to adapt to the above-mentioned variations. It builds on recent\nadvances in meta-learning forward dynamics models based on Neural Processes. We\nevaluate our method in a simulated 2D navigation setting with a unicycle-like\nrobot and different terrain layouts with spatially varying friction\ncoefficients. In our experiments, the proposed model exhibits lower prediction\nerror for the task of long-horizon trajectory prediction, compared to\nnon-adaptive ablation models. We also evaluate our model on the downstream task\nof navigation planning, which demonstrates improved performance in planning\ncontrol-efficient paths by taking robot and terrain properties into account.\n","authors":["Suresh Guttikonda","Jan Achterhold","Haolong Li","Joschka Boedecker","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2307.09206v1.pdf","comment":"\\copyright 2023 IEEE. 
To be presented at the 2023 European Conference\n on Mobile Robots (ECMR)"},{"id":"http://arxiv.org/abs/2307.09205v1","updated":"2023-07-18T12:41:28Z","published":"2023-07-18T12:41:28Z","title":"Learning Dynamic Attribute-factored World Models for Efficient\n Multi-object Reinforcement Learning","summary":" In many reinforcement learning tasks, the agent has to learn to interact with\nmany objects of different types and generalize to unseen combinations and\nnumbers of objects. Often a task is a composition of previously learned tasks\n(e.g. block stacking). These are examples of compositional generalization, in\nwhich we compose object-centric representations to solve complex tasks. Recent\nworks have shown the benefits of object-factored representations and\nhierarchical abstractions for improving sample efficiency in these settings. On\nthe other hand, these methods do not fully exploit the benefits of\nfactorization in terms of object attributes. In this paper, we address this\nopportunity and introduce the Dynamic Attribute FacTored RL (DAFT-RL)\nframework. In DAFT-RL, we leverage object-centric representation learning to\nextract objects from visual inputs. We learn to classify them in classes and\ninfer their latent parameters. For each class of object, we learn a class\ntemplate graph that describes how the dynamics and reward of an object of this\nclass factorize according to its attributes. We also learn an interaction\npattern graph that describes how objects of different classes interact with\neach other at the attribute level. Through these graphs and a dynamic\ninteraction graph that models the interactions between objects, we can learn a\npolicy that can then be directly applied in a new environment by just\nestimating the interactions and latent parameters. We evaluate DAFT-RL in three\nbenchmark datasets and show our framework outperforms the state-of-the-art in\ngeneralizing across unseen objects with varying attributes and latent\nparameters, as well as in the composition of previously learned tasks.\n","authors":["Fan Feng","Sara Magliacane"],"pdf_url":"https://arxiv.org/pdf/2307.09205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09182v1","updated":"2023-07-18T12:05:36Z","published":"2023-07-18T12:05:36Z","title":"Federated Learning for Computationally-Constrained Heterogeneous\n Devices: A Survey","summary":" With an increasing number of smart devices like internet of things (IoT)\ndevices deployed in the field, offloading training of neural networks (NNs) to a\ncentral server becomes more and more infeasible. Recent efforts to improve\nusers' privacy have led to on-device learning emerging as an alternative.\nHowever, a model trained only on a single device, using only local data, is\nunlikely to reach a high accuracy. Federated learning (FL) has been introduced\nas a solution, offering a privacy-preserving trade-off between communication\noverhead and model accuracy by sharing knowledge between devices without disclosing\nthe devices' private data. The applicability and the benefit of applying\nbaseline FL are, however, limited in many relevant use cases due to the\nheterogeneity present in such environments. In this survey, we outline the\nheterogeneity challenges FL has to overcome to be widely applicable in\nreal-world applications. We especially focus on the aspect of computation\nheterogeneity among the participating devices and provide a comprehensive\noverview of recent works on heterogeneity-aware FL. 
We discuss two groups: works\nthat adapt the NN architecture and works that approach heterogeneity on a system\nlevel, covering Federated Averaging (FedAvg), distillation, and\nsplit learning-based approaches, as well as synchronous and asynchronous\naggregation schemes.\n","authors":["Kilian Pfeiffer","Martin Rapp","Ramin Khalili","Jörg Henkel"],"pdf_url":"https://arxiv.org/pdf/2307.09182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08533v2","updated":"2023-07-18T11:54:05Z","published":"2023-07-17T14:49:06Z","title":"Nonlinear Processing with Linear Optics","summary":" Deep neural networks have achieved remarkable breakthroughs by leveraging\nmultiple layers of data processing to extract hidden representations, albeit at\nthe cost of large electronic computing power. To enhance energy efficiency and\nspeed, the optical implementation of neural networks aims to harness the\nadvantages of optical bandwidth and the energy efficiency of optical\ninterconnections. In the absence of low-power optical nonlinearities, the\nchallenge in the implementation of multilayer optical networks lies in\nrealizing multiple optical layers without resorting to electronic components.\nIn this study, we present a novel framework that uses multiple scattering that\nis capable of synthesizing programmable linear and nonlinear transformations\nconcurrently at low optical power by leveraging the nonlinear relationship\nbetween the scattering potential, represented by data, and the scattered field.\nTheoretical and experimental investigations show that repeating the data by\nmultiple scattering enables non-linear optical computing at low power\ncontinuous wave light.\n","authors":["Mustafa Yildirim","Niyazi Ulas Dinc","Ilker Oguz","Demetri Psaltis","Christophe Moser"],"pdf_url":"https://arxiv.org/pdf/2307.08533v2.pdf","comment":"20 pages, 9 figures and 1 table"},{"id":"http://arxiv.org/abs/2307.09165v1","updated":"2023-07-18T11:43:01Z","published":"2023-07-18T11:43:01Z","title":"Towards Trustworthy Dataset Distillation","summary":" Efficiency and trustworthiness are two eternal pursuits when applying deep\nlearning in real-world applications. With regard to efficiency, dataset\ndistillation (DD) endeavors to reduce training costs by distilling the large\ndataset into a tiny synthetic dataset. However, existing methods merely\nconcentrate on in-distribution (InD) classification in a closed-world setting,\ndisregarding out-of-distribution (OOD) samples. On the other hand, OOD\ndetection aims to enhance models' trustworthiness, which is always\ninefficiently achieved in full-data settings. For the first time, we\nsimultaneously consider both issues and propose a novel paradigm called\nTrustworthy Dataset Distillation (TrustDD). By distilling both InD samples and\noutliers, the condensed datasets are capable of training models competent in both\nInD classification and OOD detection. To alleviate the requirement of real\noutlier data and make OOD detection more practical, we further propose to\ncorrupt InD samples to generate pseudo-outliers and introduce Pseudo-Outlier\nExposure (POE). Comprehensive experiments on various settings demonstrate the\neffectiveness of TrustDD, and the proposed POE surpasses the state-of-the-art\nmethod Outlier Exposure (OE). Compared with the preceding DD, TrustDD is more\ntrustworthy and applicable to real open-world scenarios. 
Our code will be\npublicly available.\n","authors":["Shijie Ma","Fei Zhu","Zhen Cheng","Xu-Yao Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09165v1.pdf","comment":"20 pages, 20 figures"},{"id":"http://arxiv.org/abs/2307.05520v2","updated":"2023-07-18T10:54:51Z","published":"2023-07-07T12:07:59Z","title":"Do DL models and training environments have an impact on energy\n consumption?","summary":" Current research in the computer vision field mainly focuses on improving\nDeep Learning (DL) correctness and inference time performance. However, there\nis still little work on the huge carbon footprint that has training DL models.\nThis study aims to analyze the impact of the model architecture and training\nenvironment when training greener computer vision models. We divide this goal\ninto two research questions. First, we analyze the effects of model\narchitecture on achieving greener models while keeping correctness at optimal\nlevels. Second, we study the influence of the training environment on producing\ngreener models. To investigate these relationships, we collect multiple metrics\nrelated to energy efficiency and model correctness during the models' training.\nThen, we outline the trade-offs between the measured energy efficiency and the\nmodels' correctness regarding model architecture, and their relationship with\nthe training environment. We conduct this research in the context of a computer\nvision system for image classification. In conclusion, we show that selecting\nthe proper model architecture and training environment can reduce energy\nconsumption dramatically (up to 98.83%) at the cost of negligible decreases in\ncorrectness. Also, we find evidence that GPUs should scale with the models'\ncomputational complexity for better energy efficiency.\n","authors":["Santiago del Rey","Silverio Martínez-Fernández","Luís Cruz","Xavier Franch"],"pdf_url":"https://arxiv.org/pdf/2307.05520v2.pdf","comment":"49th Euromicro Conference Series on Software Engineering and Advanced\n Applications (SEAA). 8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.09143v1","updated":"2023-07-18T10:52:24Z","published":"2023-07-18T10:52:24Z","title":"MVA2023 Small Object Detection Challenge for Spotting Birds: Dataset,\n Methods, and Results","summary":" Small Object Detection (SOD) is an important machine vision topic because (i)\na variety of real-world applications require object detection for distant\nobjects and (ii) SOD is a challenging task due to the noisy, blurred, and\nless-informative image appearances of small objects. This paper proposes a new\nSOD dataset consisting of 39,070 images including 137,121 bird instances, which\nis called the Small Object Detection for Spotting Birds (SOD4SB) dataset. The\ndetail of the challenge with the SOD4SB dataset is introduced in this paper. In\ntotal, 223 participants joined this challenge. This paper briefly introduces\nthe award-winning methods. The dataset, the baseline code, and the website for\nevaluation on the public testset are publicly available.\n","authors":["Yuki Kondo","Norimichi Ukita","Takayuki Yamaguchi","Hao-Yu Hou","Mu-Yi Shen","Chia-Chi Hsu","En-Ming Huang","Yu-Chen Huang","Yu-Cheng Xia","Chien-Yao Wang","Chun-Yi Lee","Da Huo","Marc A. 
Kastner","Tingwei Liu","Yasutomo Kawanishi","Takatsugu Hirayama","Takahiro Komamizu","Ichiro Ide","Yosuke Shinya","Xinyao Liu","Guang Liang","Syusuke Yasui"],"pdf_url":"https://arxiv.org/pdf/2307.09143v1.pdf","comment":"This paper is included in the proceedings of the 18th International\n Conference on Machine Vision Applications (MVA2023). It will be officially\n published at a later date. Project page :\n https://www.mva-org.jp/mva2023/challenge"},{"id":"http://arxiv.org/abs/2307.09142v1","updated":"2023-07-18T10:52:08Z","published":"2023-07-18T10:52:08Z","title":"Characterization of partial wetting by CMAS droplets using multiphase\n many-body dissipative particle dynamics and data-driven discovery based on\n PINNs","summary":" The molten sand, a mixture of calcia, magnesia, alumina, and silicate, known\nas CMAS, is characterized by its high viscosity, density, and surface tension.\nThe unique properties of CMAS make it a challenging material to deal with in\nhigh-temperature applications, requiring innovative solutions and materials to\nprevent its buildup and damage to critical equipment. Here, we use multiphase\nmany-body dissipative particle dynamics (mDPD) simulations to study the wetting\ndynamics of highly viscous molten CMAS droplets. The simulations are performed\nin three dimensions, with varying initial droplet sizes and equilibrium contact\nangles. We propose a coarse parametric ordinary differential equation (ODE)\nthat captures the spreading radius behavior of the CMAS droplets. The ODE\nparameters are then identified based on the Physics-Informed Neural Network\n(PINN) framework. Subsequently, the closed form dependency of parameter values\nfound by PINN on the initial radii and contact angles are given using symbolic\nregression. Finally, we employ Bayesian PINNs (B-PINNs) to assess and quantify\nthe uncertainty associated with the discovered parameters. In brief, this study\nprovides insight into spreading dynamics of CMAS droplets by fusing simple\nparametric ODE modeling and state-of-the-art machine learning techniques.\n","authors":["Elham Kiyani","Mahdi Kooshkbaghi","Khemraj Shukla","Rahul Babu Koneru","Zhen Li","Luis Bravo","Anindya Ghoshal","George Em Karniadakis","Mikko Karttunen"],"pdf_url":"https://arxiv.org/pdf/2307.09142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15548v3","updated":"2023-07-18T10:26:58Z","published":"2023-06-27T15:18:52Z","title":"Geometric Ultrasound Localization Microscopy","summary":" Contrast-Enhanced Ultra-Sound (CEUS) has become a viable method for\nnon-invasive, dynamic visualization in medical diagnostics, yet Ultrasound\nLocalization Microscopy (ULM) has enabled a revolutionary breakthrough by\noffering ten times higher resolution. To date, Delay-And-Sum (DAS) beamformers\nare used to render ULM frames, ultimately determining the image resolution\ncapability. To take full advantage of ULM, this study questions whether\nbeamforming is the most effective processing step for ULM, suggesting an\nalternative approach that relies solely on Time-Difference-of-Arrival (TDoA)\ninformation. To this end, a novel geometric framework for micro bubble\nlocalization via ellipse intersections is proposed to overcome existing\nbeamforming limitations. 
We present a benchmark comparison based on a public\ndataset for which our geometric ULM outperforms existing baseline methods in\nterms of accuracy and robustness while only utilizing a portion of the\navailable transducer data.\n","authors":["Christopher Hahne","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2306.15548v3.pdf","comment":"Pre-print accepted for MICCAI 2023"},{"id":"http://arxiv.org/abs/2207.02149v2","updated":"2023-07-18T10:09:13Z","published":"2022-06-27T14:01:06Z","title":"Stochastic Optimal Control for Collective Variable Free Sampling of\n Molecular Transition Paths","summary":" We consider the problem of sampling transition paths between two given\nmetastable states of a molecular system, e.g. a folded and unfolded protein or\nproducts and reactants of a chemical reaction. Due to the existence of high\nenergy barriers separating the states, these transition paths are unlikely to\nbe sampled with standard Molecular Dynamics (MD) simulation. Traditional\nmethods to augment MD with a bias potential to increase the probability of the\ntransition rely on a dimensionality reduction step based on Collective\nVariables (CVs). Unfortunately, selecting appropriate CVs requires chemical\nintuition and traditional methods are therefore not always applicable to larger\nsystems. Additionally, when incorrect CVs are used, the bias potential might\nnot be minimal and bias the system along dimensions irrelevant to the\ntransition. Showing a formal relation between the problem of sampling molecular\ntransition paths, the Schr\\\"odinger bridge problem and stochastic optimal\ncontrol with neural network policies, we propose a machine learning method for\nsampling said transitions. Unlike previous non-machine learning approaches our\nmethod, named PIPS, does not depend on CVs. We show that our method successful\ngenerates low energy transitions for Alanine Dipeptide as well as the larger\nPolyproline and Chignolin proteins.\n","authors":["Lars Holdijk","Yuanqi Du","Ferry Hooft","Priyank Jaini","Bernd Ensing","Max Welling"],"pdf_url":"https://arxiv.org/pdf/2207.02149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09109v1","updated":"2023-07-18T09:58:15Z","published":"2023-07-18T09:58:15Z","title":"Mining of Single-Class by Active Learning for Semantic Segmentation","summary":" Several Active Learning (AL) policies require retraining a target model\nseveral times in order to identify the most informative samples and rarely\noffer the option to focus on the acquisition of samples from underrepresented\nclasses. Here the Mining of Single-Class by Active Learning (MiSiCAL) paradigm\nis introduced where an AL policy is constructed through deep reinforcement\nlearning and exploits quantity-accuracy correlations to build datasets on which\nhigh-performance models can be trained with regards to specific classes.\nMiSiCAL is especially helpful in the case of very large batch sizes since it\ndoes not require repeated model training sessions as is common in other AL\nmethods. This is thanks to its ability to exploit fixed representations of the\ncandidate data points. 
We find that MiSiCAL is able to outperform a random\npolicy on 150 out of 171 COCO10k classes, while the strongest baseline only\noutperforms random on 101 classes.\n","authors":["Hugues Lambert","Emma Slade"],"pdf_url":"https://arxiv.org/pdf/2307.09109v1.pdf","comment":"29 pages, 14 figures, 2 tables"},{"id":"http://arxiv.org/abs/2212.10426v5","updated":"2023-07-18T09:45:53Z","published":"2022-12-20T17:04:50Z","title":"Deep Riemannian Networks for EEG Decoding","summary":" State-of-the-art performance in electroencephalography (EEG) decoding tasks\nis currently often achieved with either Deep-Learning (DL) or\nRiemannian-Geometry-based decoders (RBDs). Recently, there is growing interest\nin Deep Riemannian Networks (DRNs) possibly combining the advantages of both\nprevious classes of methods. However, there are still a range of topics where\nadditional insight is needed to pave the way for a more widespread application\nof DRNs in EEG. These include architecture design questions such as network\nsize and end-to-end ability.How these factors affect model performance has not\nbeen explored. Additionally, it is not clear how the data within these networks\nis transformed, and whether this would correlate with traditional EEG decoding.\nOur study aims to lay the groundwork in the area of these topics through the\nanalysis of DRNs for EEG with a wide range of hyperparameters. Networks were\ntested on two public EEG datasets and compared with state-of-the-art ConvNets.\nHere we propose end-to-end EEG SPDNet (EE(G)-SPDNet), and we show that this\nwide, end-to-end DRN can outperform the ConvNets, and in doing so use\nphysiologically plausible frequency regions. We also show that the end-to-end\napproach learns more complex filters than traditional band-pass filters\ntargeting the classical alpha, beta, and gamma frequency bands of the EEG, and\nthat performance can benefit from channel specific filtering approaches.\nAdditionally, architectural analysis revealed areas for further improvement due\nto the possible loss of Riemannian specific information throughout the network.\nOur study thus shows how to design and train DRNs to infer task-related\ninformation from the raw EEG without the need of handcrafted filterbanks and\nhighlights the potential of end-to-end DRNs such as EE(G)-SPDNet for\nhigh-performance EEG decoding.\n","authors":["Daniel Wilson","Robin Tibor Schirrmeister","Lukas Alexander Wilhelm Gemein","Tonio Ball"],"pdf_url":"https://arxiv.org/pdf/2212.10426v5.pdf","comment":"27 pages, 13 Figures"},{"id":"http://arxiv.org/abs/2305.07617v2","updated":"2023-07-18T09:40:34Z","published":"2023-05-12T17:09:34Z","title":"Scalable Coupling of Deep Learning with Logical Reasoning","summary":" In the ongoing quest for hybridizing discrete reasoning with neural nets,\nthere is an increasing interest in neural architectures that can learn how to\nsolve discrete reasoning or optimization problems from natural inputs. In this\npaper, we introduce a scalable neural architecture and loss function dedicated\nto learning the constraints and criteria of NP-hard reasoning problems\nexpressed as discrete Graphical Models. Our loss function solves one of the\nmain limitations of Besag's pseudo-loglikelihood, enabling learning of high\nenergies. 
We empirically show it is able to efficiently learn how to solve\nNP-hard reasoning problems from natural inputs as the symbolic, visual or\nmany-solutions Sudoku problems as well as the energy optimization formulation\nof the protein design problem, providing data efficiency, interpretability, and\n\\textit{a posteriori} control over predictions.\n","authors":["Marianne Defresne","Sophie Barbe","Thomas Schiex"],"pdf_url":"https://arxiv.org/pdf/2305.07617v2.pdf","comment":"10 pages, 2 figures, 6 tables. Published in IJCAI'2023 proceedings"},{"id":"http://arxiv.org/abs/2307.09093v1","updated":"2023-07-18T09:22:33Z","published":"2023-07-18T09:22:33Z","title":"Non-stationary Delayed Combinatorial Semi-Bandit with Causally Related\n Rewards","summary":" Sequential decision-making under uncertainty is often associated with long\nfeedback delays. Such delays degrade the performance of the learning agent in\nidentifying a subset of arms with the optimal collective reward in the long\nrun. This problem becomes significantly challenging in a non-stationary\nenvironment with structural dependencies amongst the reward distributions\nassociated with the arms. Therefore, besides adapting to delays and\nenvironmental changes, learning the causal relations alleviates the adverse\neffects of feedback delay on the decision-making process. We formalize the\ndescribed setting as a non-stationary and delayed combinatorial semi-bandit\nproblem with causally related rewards. We model the causal relations by a\ndirected graph in a stationary structural equation model. The agent maximizes\nthe long-term average payoff, defined as a linear function of the base arms'\nrewards. We develop a policy that learns the structural dependencies from\ndelayed feedback and utilizes that to optimize the decision-making while\nadapting to drifts. We prove a regret bound for the performance of the proposed\nalgorithm. Besides, we evaluate our method via numerical analysis using\nsynthetic and real-world datasets to detect the regions that contribute the\nmost to the spread of Covid-19 in Italy.\n","authors":["Saeed Ghoorchian","Setareh Maghsudi"],"pdf_url":"https://arxiv.org/pdf/2307.09093v1.pdf","comment":"33 pages, 9 figures. arXiv admin note: text overlap with\n arXiv:2212.12923"},{"id":"http://arxiv.org/abs/2305.09211v2","updated":"2023-07-18T09:21:27Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. 
The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08572v2","updated":"2023-07-18T09:16:28Z","published":"2023-07-17T15:38:11Z","title":"Revisiting the Robustness of the Minimum Error Entropy Criterion: A\n Transfer Learning Case Study","summary":" Coping with distributional shifts is an important part of transfer learning\nmethods in order to perform well in real-life tasks. However, most of the\nexisting approaches in this area either focus on an ideal scenario in which the\ndata does not contain noises or employ a complicated training paradigm or model\ndesign to deal with distributional shifts. In this paper, we revisit the\nrobustness of the minimum error entropy (MEE) criterion, a widely used\nobjective in statistical signal processing to deal with non-Gaussian noises,\nand investigate its feasibility and usefulness in real-life transfer learning\nregression tasks, where distributional shifts are common. Specifically, we put\nforward a new theoretical result showing the robustness of MEE against\ncovariate shift. We also show that by simply replacing the mean squared error\n(MSE) loss with the MEE on basic transfer learning algorithms such as\nfine-tuning and linear probing, we can achieve competitive performance with\nrespect to state-of-the-art transfer learning algorithms. We justify our\narguments on both synthetic data and 5 real-world time-series data.\n","authors":["Luis Pedro Silvestrin","Shujian Yu","Mark Hoogendoorn"],"pdf_url":"https://arxiv.org/pdf/2307.08572v2.pdf","comment":"Manuscript accepted at ECAI-23. Code available at\n https://github.com/lpsilvestrin/mee-finetune"},{"id":"http://arxiv.org/abs/2307.08466v2","updated":"2023-07-18T09:16:10Z","published":"2023-07-17T13:21:02Z","title":"Generalizable Classification of UHF Partial Discharge Signals in\n Gas-Insulated HVDC Systems Using Neural Networks","summary":" Undetected partial discharges (PDs) are a safety critical issue in high\nvoltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC\nvoltage is well-established, the analysis of PDs under DC voltage remains an\nactive research field. 
A key focus of these investigations is the\nclassification of different PD sources to enable subsequent sophisticated\nanalysis.\n In this paper, we propose and analyze a neural network-based approach for\nclassifying PD signals caused by metallic protrusions and conductive particles\non the insulator of HVDC GIS, without relying on pulse sequence analysis\nfeatures. In contrast to previous approaches, our proposed model can\ndiscriminate the studied PD signals obtained at negative and positive\npotentials, while also generalizing to unseen operating voltage multiples.\nAdditionally, we compare the performance of time- and frequency-domain input\nsignals and explore the impact of different normalization schemes to mitigate\nthe influence of free-space path loss between the sensor and defect location.\n","authors":["Steffen Seitz","Thomas Götz","Christopher Lindenberg","Ronald Tetzlaff","Stephan Schlegel"],"pdf_url":"https://arxiv.org/pdf/2307.08466v2.pdf","comment":"8 pages, submitted to IEEE Transactions on Power Delivery"},{"id":"http://arxiv.org/abs/2208.06868v3","updated":"2023-07-18T09:00:57Z","published":"2022-08-14T15:25:41Z","title":"Frouros: A Python library for drift detection in machine learning\n systems","summary":" Frouros is an open-source Python library capable of detecting drift in\nmachine learning systems. It provides a combination of classical and more\nrecent algorithms for drift detection: both concept and data drift. We have\ndesigned it with the objective of making it compatible with any machine\nlearning framework and easily adaptable to real-world use cases. The library is\ndeveloped following a set of best development and continuous integration\npractices to ensure ease of maintenance and extensibility. The source code is\navailable at https://github.com/IFCA/frouros.\n","authors":["Jaime Céspedes-Sisniega","Álvaro López-García"],"pdf_url":"https://arxiv.org/pdf/2208.06868v3.pdf","comment":"11 pages, 1 table"},{"id":"http://arxiv.org/abs/2307.09080v1","updated":"2023-07-18T09:00:26Z","published":"2023-07-18T09:00:26Z","title":"A Federated learning model for Electric Energy management using\n Blockchain Technology","summary":" Energy shortfall and electricity load shedding are the main problems for\ndeveloping countries. The main causes are lack of management in the energy\nsector and the use of non-renewable energy sources. The improved energy\nmanagement and use of renewable sources can be significant to resolve energy\ncrisis. It is necessary to increase the use of renewable energy sources (RESs)\nto meet the increasing energy demand due to high prices of fossil-fuel based\nenergy. Federated learning (FL) is the most emerging technique in the field of\nartificial intelligence. Federated learning helps to generate global model at\nserver side by ensemble locally trained models at remote edges sites while\npreserving data privacy. The global model used to predict energy demand to\nsatisfy the needs of consumers. In this article, we have proposed Blockchain\nbased safe distributed ledger technology for transaction of data between\nprosumer and consumer to ensure their transparency, traceability and security.\nFurthermore, we have also proposed a Federated learning model to forecast the\nenergy requirements of consumer and prosumer. Moreover, Blockchain has been\nused to store excess energy data from prosumer for better management of energy\nbetween prosumer and grid. 
Lastly, the experiment results revealed that\nrenewable energy sources have produced better and comparable results to other\nnon-renewable energy resources.\n","authors":["Muhammad Shoaib Farooq","Azeen Ahmed Hayat"],"pdf_url":"https://arxiv.org/pdf/2307.09080v1.pdf","comment":"14 figures, 7 tables, 15 pages"},{"id":"http://arxiv.org/abs/2205.08790v3","updated":"2023-07-18T09:00:25Z","published":"2022-05-18T08:32:26Z","title":"On-device modeling of user's social context and familiar places from\n smartphone-embedded sensor data","summary":" Context modeling and recognition represent complex tasks that allow mobile\nand ubiquitous computing applications to adapt to the user's situation. Current\nsolutions mainly focus on limited context information generally processed on\ncentralized architectures, potentially exposing users' personal data to privacy\nleakage, and missing personalization features. For these reasons on-device\ncontext modeling and recognition represent the current research trend in this\narea. Among the different information characterizing the user's context in\nmobile environments, social interactions and visited locations remarkably\ncontribute to the characterization of daily life scenarios. In this paper we\npropose a novel, unsupervised and lightweight approach to model the user's\nsocial context and her locations based on ego networks directly on the user\nmobile device. Relying on this model, the system is able to extract high-level\nand semantic-rich context features from smartphone-embedded sensors data.\nSpecifically, for the social context it exploits data related to both physical\nand cyber social interactions among users and their devices. As far as location\ncontext is concerned, we assume that it is more relevant to model the\nfamiliarity degree of a specific location for the user's context than the raw\nlocation data, both in terms of GPS coordinates and proximity devices. By using\n5 real-world datasets, we assess the structure of the social and location ego\nnetworks, we provide a semantic evaluation of the proposed models and a\ncomplexity evaluation in terms of mobile computing performance. Finally, we\ndemonstrate the relevance of the extracted features by showing the performance\nof 3 machine learning algorithms to recognize daily-life situations, obtaining\nan improvement of 3% of AUROC, 9% of Precision, and 5% in terms of Recall with\nrespect to use only features related to physical context.\n","authors":["Mattia Giovanni Campana","Franca Delmastro"],"pdf_url":"https://arxiv.org/pdf/2205.08790v3.pdf","comment":"I request the withdrawal of the paper because it has been already\n submitted (and published) on arXiv with identifier 2306.15437"},{"id":"http://arxiv.org/abs/2307.08433v2","updated":"2023-07-18T08:56:09Z","published":"2023-07-17T12:25:52Z","title":"From random-walks to graph-sprints: a low-latency node embedding\n framework on continuous-time dynamic graphs","summary":" Many real-world datasets have an underlying dynamic graph structure, where\nentities and their interactions evolve over time. Machine learning models\nshould consider these dynamics in order to harness their full potential in\ndownstream tasks. Previous approaches for graph representation learning have\nfocused on either sampling k-hop neighborhoods, akin to breadth-first search,\nor random walks, akin to depth-first search. However, these methods are\ncomputationally expensive and unsuitable for real-time, low-latency inference\non dynamic graphs. 
To overcome these limitations, we propose graph-sprints a\ngeneral purpose feature extraction framework for continuous-time-dynamic-graphs\n(CTDGs) that has low latency and is competitive with state-of-the-art, higher\nlatency models. To achieve this, a streaming, low latency approximation to the\nrandom-walk based features is proposed. In our framework, time-aware node\nembeddings summarizing multi-hop information are computed using only single-hop\noperations on the incoming edges. We evaluate our proposed approach on three\nopen-source datasets and two in-house datasets, and compare with three\nstate-of-the-art algorithms (TGN-attn, TGN-ID, Jodie). We demonstrate that our\ngraph-sprints features, combined with a machine learning classifier, achieve\ncompetitive performance (outperforming all baselines for the node\nclassification tasks in five datasets). Simultaneously, graph-sprints\nsignificantly reduce inference latencies, achieving close to an order of\nmagnitude speed-up in our experimental setting.\n","authors":["Ahmad Naser Eddin","Jacopo Bono","David Aparício","Hugo Ferreira","João Ascensão","Pedro Ribeiro","Pedro Bizarro"],"pdf_url":"https://arxiv.org/pdf/2307.08433v2.pdf","comment":"9 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2211.04965v2","updated":"2023-07-18T08:56:00Z","published":"2022-11-09T15:29:03Z","title":"Resource frugal optimizer for quantum machine learning","summary":" Quantum-enhanced data science, also known as quantum machine learning (QML),\nis of growing interest as an application of near-term quantum computers.\nVariational QML algorithms have the potential to solve practical problems on\nreal hardware, particularly when involving quantum data. However, training\nthese algorithms can be challenging and calls for tailored optimization\nprocedures. Specifically, QML applications can require a large shot-count\noverhead due to the large datasets involved. In this work, we advocate for\nsimultaneous random sampling over both the dataset as well as the measurement\noperators that define the loss function. We consider a highly general loss\nfunction that encompasses many QML applications, and we show how to construct\nan unbiased estimator of its gradient. This allows us to propose a shot-frugal\ngradient descent optimizer called Refoqus (REsource Frugal Optimizer for\nQUantum Stochastic gradient descent). Our numerics indicate that Refoqus can\nsave several orders of magnitude in shot cost, even relative to optimizers that\nsample over measurement operators alone.\n","authors":["Charles Moussa","Max Hunter Gordon","Michal Baczyk","M. Cerezo","Lukasz Cincio","Patrick J. Coles"],"pdf_url":"https://arxiv.org/pdf/2211.04965v2.pdf","comment":"22 pages, 6 figures - extra quantum autoencoder results added"},{"id":"http://arxiv.org/abs/2307.09072v1","updated":"2023-07-18T08:45:54Z","published":"2023-07-18T08:45:54Z","title":"DiTTO: Diffusion-inspired Temporal Transformer Operator","summary":" Solving partial differential equations (PDEs) using a data-driven approach\nhas become increasingly common. The recent development of the operator learning\nparadigm has enabled the solution of a broader range of PDE-related problems.\nWe propose an operator learning method to solve time-dependent PDEs\ncontinuously in time without needing any temporal discretization. The proposed\napproach, named DiTTO, is inspired by latent diffusion models. 
While diffusion\nmodels are usually used in generative artificial intelligence tasks, their\ntime-conditioning mechanism is extremely useful for PDEs. The\ndiffusion-inspired framework is combined with elements from the Transformer\narchitecture to improve its capabilities.\n We demonstrate the effectiveness of the new approach on a wide variety of\nPDEs in multiple dimensions, namely the 1-D Burgers' equation, 2-D\nNavier-Stokes equations, and the acoustic wave equation in 2-D and 3-D. DiTTO\nachieves state-of-the-art results in terms of accuracy for these problems. We\nalso present a method to improve the performance of DiTTO by using fast\nsampling concepts from diffusion models. Finally, we show that DiTTO can\naccurately perform zero-shot super-resolution in time.\n","authors":["Oded Ovadia","Eli Turkel","Adar Kahana","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2307.09072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04964v2","updated":"2023-07-18T08:44:47Z","published":"2023-07-11T01:55:24Z","title":"Secrets of RLHF in Large Language Models Part I: PPO","summary":" Large language models (LLMs) have formulated a blueprint for the advancement\nof artificial general intelligence. Its primary objective is to function as a\nhuman-centric (helpful, honest, and harmless) assistant. Alignment with humans\nassumes paramount significance, and reinforcement learning with human feedback\n(RLHF) emerges as the pivotal technological paradigm underpinning this pursuit.\nCurrent technical routes usually include \\textbf{reward models} to measure\nhuman preferences, \\textbf{Proximal Policy Optimization} (PPO) to optimize\npolicy model outputs, and \\textbf{process supervision} to improve step-by-step\nreasoning capabilities. However, due to the challenges of reward design,\nenvironment interaction, and agent training, coupled with huge trial and error\ncost of large language models, there is a significant barrier for AI\nresearchers to motivate the development of technical alignment and safe landing\nof LLMs. The stable training of RLHF has still been a puzzle. In the first\nreport, we dissect the framework of RLHF, re-evaluate the inner workings of\nPPO, and explore how the parts comprising PPO algorithms impact policy agent\ntraining. We identify policy constraints being the key factor for the effective\nimplementation of the PPO algorithm. Therefore, we explore the PPO-max, an\nadvanced version of PPO algorithm, to efficiently improve the training\nstability of the policy model. Based on our main results, we perform a\ncomprehensive analysis of RLHF abilities compared with SFT models and ChatGPT.\nThe absence of open-source implementations has posed significant challenges to\nthe investigation of LLMs alignment. 
Therefore, we are eager to release\ntechnical reports, reward models and PPO codes, aiming to make modest\ncontributions to the advancement of LLMs.\n","authors":["Rui Zheng","Shihan Dou","Songyang Gao","Yuan Hua","Wei Shen","Binghai Wang","Yan Liu","Senjie Jin","Qin Liu","Yuhao Zhou","Limao Xiong","Lu Chen","Zhiheng Xi","Nuo Xu","Wenbin Lai","Minghao Zhu","Cheng Chang","Zhangyue Yin","Rongxiang Weng","Wensen Cheng","Haoran Huang","Tianxiang Sun","Hang Yan","Tao Gui","Qi Zhang","Xipeng Qiu","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2307.04964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09067v1","updated":"2023-07-18T08:37:58Z","published":"2023-07-18T08:37:58Z","title":"Evaluate Fine-tuning Strategies for Fetal Head Ultrasound Image\n Segmentation with U-Net","summary":" Fetal head segmentation is a crucial step in measuring the fetal head\ncircumference (HC) during gestation, an important biometric in obstetrics for\nmonitoring fetal growth. However, manual biometry generation is time-consuming\nand results in inconsistent accuracy. To address this issue, convolutional\nneural network (CNN) models have been utilized to improve the efficiency of\nmedical biometry. But training a CNN network from scratch is a challenging\ntask, we proposed a Transfer Learning (TL) method. Our approach involves\nfine-tuning (FT) a U-Net network with a lightweight MobileNet as the encoder to\nperform segmentation on a set of fetal head ultrasound (US) images with limited\neffort. This method addresses the challenges associated with training a CNN\nnetwork from scratch. It suggests that our proposed FT strategy yields\nsegmentation performance that is comparable when trained with a reduced number\nof parameters by 85.8%. And our proposed FT strategy outperforms other\nstrategies with smaller trainable parameter sizes below 4.4 million. Thus, we\ncontend that it can serve as a dependable FT approach for reducing the size of\nmodels in medical image analysis. Our key findings highlight the importance of\nthe balance between model performance and size in developing Artificial\nIntelligence (AI) applications by TL methods. Code is available at\nhttps://github.com/13204942/FT_Methods_for_Fetal_Head_Segmentation.\n","authors":["Fangyijie Wang","Guénolé Silvestre","Kathleen M. Curran"],"pdf_url":"https://arxiv.org/pdf/2307.09067v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.09065v1","updated":"2023-07-18T08:37:25Z","published":"2023-07-18T08:37:25Z","title":"Learning Adaptive Neighborhoods for Graph Neural Networks","summary":" Graph convolutional networks (GCNs) enable end-to-end learning on graph\nstructured data. However, many works assume a given graph structure. When the\ninput graph is noisy or unavailable, one approach is to construct or learn a\nlatent graph structure. These methods typically fix the choice of node degree\nfor the entire graph, which is suboptimal. Instead, we propose a novel\nend-to-end differentiable graph generator which builds graph topologies where\neach node selects both its neighborhood and its size. Our module can be readily\nintegrated into existing pipelines involving graph convolution operations,\nreplacing the predetermined or existing adjacency matrix with one that is\nlearned, and optimized, as part of the general objective. As such it is\napplicable to any GCN. 
We integrate our module into trajectory prediction,\npoint cloud classification and node classification pipelines resulting in\nimproved accuracy over other structure-learning methods across a wide range of\ndatasets and GCN backbones.\n","authors":["Avishkar Saha","Oscar Mendez","Chris Russell","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2307.09065v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09060v1","updated":"2023-07-18T08:25:14Z","published":"2023-07-18T08:25:14Z","title":"Extreme heatwave sampling and prediction with analog Markov chain and\n comparisons with deep learning","summary":" We present a data-driven emulator, stochastic weather generator (SWG),\nsuitable for estimating probabilities of prolonged heatwaves in France and\nScandinavia. This emulator is based on the method of analogs of circulation to\nwhich we add temperature and soil moisture as predictor fields. We train the\nemulator on an intermediate complexity climate model run and show that it is\ncapable of predicting conditional probabilities (forecasting) of heatwaves out\nof sample. Special attention is paid to ensuring that this prediction is evaluated using\na proper score appropriate for rare events. To accelerate the computation of\nanalogs, dimensionality reduction techniques are applied and the performance is\nevaluated. The probabilistic prediction achieved with SWG is compared with the\none achieved with a\n Convolutional Neural Network (CNN). With the availability of hundreds of\nyears of training data, CNNs perform better at the task of probabilistic\nprediction. In addition, we show that the SWG emulator trained on 80 years of\ndata is capable of estimating extreme return times of order of thousands of\nyears for heatwaves longer than several days more precisely than the fit based\non a generalised extreme value distribution. Finally, the quality of its\nsynthetic extreme teleconnection patterns obtained with the stochastic weather\ngenerator is studied. We showcase two examples of such synthetic teleconnection\npatterns for heatwaves in France and Scandinavia that compare favorably to the\nvery long climate model control run.\n","authors":["George Miloshevich","Dario Lucente","Pascal Yiou","Freddy Bouchet"],"pdf_url":"https://arxiv.org/pdf/2307.09060v1.pdf","comment":"29 pages, 13 figures, presented at Climate Informatics 2023, UK\n Cambridge"},{"id":"http://arxiv.org/abs/2307.09057v1","updated":"2023-07-18T08:20:56Z","published":"2023-07-18T08:20:56Z","title":"Globally solving the Gromov-Wasserstein problem for point clouds in low\n dimensional Euclidean spaces","summary":" This paper presents a framework for computing the Gromov-Wasserstein problem\nbetween two sets of points in low dimensional spaces, where the discrepancy is\nthe squared Euclidean norm. The Gromov-Wasserstein problem is a generalization\nof the optimal transport problem that finds the assignment between two sets\npreserving pairwise distances as much as possible. This can be used to quantify\nthe similarity between two formations or shapes, a common problem in AI and\nmachine learning. The problem can be formulated as a Quadratic Assignment\nProblem (QAP), which is in general computationally intractable even for small\nproblems. Our framework addresses this challenge by reformulating the QAP as an\noptimization problem with a low-dimensional domain, leveraging the fact that\nthe problem can be expressed as a concave quadratic optimization problem with\nlow rank. 
The method scales well with the number of points, and it can be used\nto find the global solution for large-scale problems with thousands of points.\nWe compare the computational complexity of our approach with state-of-the-art\nmethods on synthetic problems and apply it to a near-symmetrical problem which\nis of particular interest in computational biology.\n","authors":["Martin Ryner","Jan Kronqvist","Johan Karlsson"],"pdf_url":"https://arxiv.org/pdf/2307.09057v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.16044v5","updated":"2023-07-18T08:13:33Z","published":"2023-05-25T13:21:26Z","title":"Exploiting Noise as a Resource for Computation and Learning in Spiking\n Neural Networks","summary":" Networks of spiking neurons underpin the extraordinary information-processing\ncapabilities of the brain and have become pillar models in neuromorphic\nartificial intelligence. Despite extensive research on spiking neural networks\n(SNNs), most studies are established on deterministic models, overlooking the\ninherent non-deterministic, noisy nature of neural computations. This study\nintroduces the noisy spiking neural network (NSNN) and the noise-driven\nlearning rule (NDL) by incorporating noisy neuronal dynamics to exploit the\ncomputational advantages of noisy neural processing. NSNN provides a\ntheoretical framework that yields scalable, flexible, and reliable computation.\nWe demonstrate that NSNN leads to spiking neural models with competitive\nperformance, improved robustness against challenging perturbations than\ndeterministic SNNs, and better reproducing probabilistic neural computation in\nneural coding. This study offers a powerful and easy-to-use tool for machine\nlearning, neuromorphic intelligence practitioners, and computational\nneuroscience researchers.\n","authors":["Gehua Ma","Rui Yan","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2305.16044v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09055v1","updated":"2023-07-18T08:11:08Z","published":"2023-07-18T08:11:08Z","title":"Outlier-Robust Tensor Low-Rank Representation for Data Clustering","summary":" Low-rank tensor analysis has received widespread attention with many\npractical applications. However, the tensor data are often contaminated by\noutliers or sample-specific corruptions. How to recover the tensor data that\nare corrupted by outliers and perform data clustering remains a challenging\nproblem. This paper develops an outlier-robust tensor low-rank representation\n(OR-TLRR) method for simultaneous outlier detection and tensor data clustering\nbased on the tensor singular value decomposition (t-SVD) algebraic framework.\nIt is motivated by the recently proposed tensor-tensor product induced by\ninvertible linear transforms that satisfy certain conditions. For tensor\nobservations with arbitrary outlier corruptions, OR-TLRR has provable\nperformance guarantee for exactly recovering the row space of clean data and\ndetecting outliers under mild conditions. Moreover, an extension of OR-TLRR is\nalso proposed to handle the case when parts of the data are missing. 
Finally,\nextensive experimental results on both synthetic and real data demonstrate the\neffectiveness of the proposed algorithms.\n","authors":["Tong Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09055v1.pdf","comment":"12 pages, 1 figure; preprint of a journal paper"},{"id":"http://arxiv.org/abs/2305.11997v2","updated":"2023-07-18T07:54:31Z","published":"2023-05-19T20:48:05Z","title":"Robust Counterfactual Explanations for Neural Networks With\n Probabilistic Guarantees","summary":" There is an emerging interest in generating robust counterfactual\nexplanations that would remain valid if the model is updated or changed even\nslightly. Towards finding robust counterfactuals, existing literature often\nassumes that the original model $m$ and the new model $M$ are bounded in the\nparameter space, i.e., $\\|\\text{Params}(M){-}\\text{Params}(m)\\|{<}\\Delta$.\nHowever, models can often change significantly in the parameter space with\nlittle to no change in their predictions or accuracy on the given dataset. In\nthis work, we introduce a mathematical abstraction termed\n\\emph{naturally-occurring} model change, which allows for arbitrary changes in\nthe parameter space such that the change in predictions on points that lie on\nthe data manifold is limited. Next, we propose a measure -- that we call\n\\emph{Stability} -- to quantify the robustness of counterfactuals to potential\nmodel changes for differentiable models, e.g., neural networks. Our main\ncontribution is to show that counterfactuals with sufficiently high value of\n\\emph{Stability} as defined by our measure will remain valid after potential\n``naturally-occurring'' model changes with high probability (leveraging\nconcentration bounds for Lipschitz function of independent Gaussians). Since\nour quantification depends on the local Lipschitz constant around a data point\nwhich is not always available, we also examine practical relaxations of our\nproposed measure and demonstrate experimentally how they can be incorporated to\nfind robust counterfactuals for neural networks that are close, realistic, and\nremain valid after potential model changes. This work also has interesting\nconnections with model multiplicity, also known as, the Rashomon effect.\n","authors":["Faisal Hamman","Erfaun Noorani","Saumitra Mishra","Daniele Magazzeni","Sanghamitra Dutta"],"pdf_url":"https://arxiv.org/pdf/2305.11997v2.pdf","comment":"International Conference on Machine Learning (ICML), 2023"},{"id":"http://arxiv.org/abs/2210.01426v2","updated":"2023-07-18T07:49:36Z","published":"2022-10-04T07:34:06Z","title":"Continuous Monte Carlo Graph Search","summary":" In many complex sequential decision-making tasks, online planning is crucial\nfor high performance. For efficient online planning, Monte Carlo Tree Search\n(MCTS) employs a principled mechanism for trading off exploration for\nexploitation. MCTS outperforms comparison methods in many discrete\ndecision-making domains such as Go, Chess, and Shogi. Following, extensions of\nMCTS to continuous domains have been proposed. However, the inherent high\nbranching factor and the resulting explosion of search tree size are limiting\nexisting methods. To address this problem, we propose Continuous Monte Carlo\nGraph Search (CMCGS), a novel extension of MCTS to online planning in\nenvironments with continuous state and action spaces. CMCGS takes advantage of\nthe insight that, during planning, sharing the same action policy between\nseveral states can yield high performance. 
To implement this idea, at each time\nstep, CMCGS clusters similar states into a limited number of stochastic action\nbandit nodes, which produce a layered directed graph instead of an MCTS search\ntree. Experimental evaluation shows that CMCGS outperforms comparable planning\nmethods in several complex continuous DeepMind Control Suite benchmarks and a\n2D navigation task with limited sample budgets. Furthermore, CMCGS can be\nparallelized to scale up and it outperforms the Cross-Entropy Method (CEM) in\ncontinuous control with learned dynamics models.\n","authors":["Kalle Kujanpää","Amin Babadi","Yi Zhao","Juho Kannala","Alexander Ilin","Joni Pajarinen"],"pdf_url":"https://arxiv.org/pdf/2210.01426v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.09025v1","updated":"2023-07-18T07:34:02Z","published":"2023-07-18T07:34:02Z","title":"qecGPT: decoding Quantum Error-correcting Codes with Generative\n Pre-trained Transformers","summary":" We propose a general framework for decoding quantum error-correcting codes\nwith generative modeling. The model utilizes autoregressive neural networks,\nspecifically Transformers, to learn the joint probability of logical operators\nand syndromes. This training is in an unsupervised way, without the need for\nlabeled training data, and is thus referred to as pre-training. After the\npre-training, the model can efficiently compute the likelihood of logical\noperators for any given syndrome, using maximum likelihood decoding. It can\ndirectly generate the most-likely logical operators with computational\ncomplexity $\\mathcal O(2k)$ in the number of logical qubits $k$, which is\nsignificantly better than the conventional maximum likelihood decoding\nalgorithms that require $\\mathcal O(4^k)$ computation. Based on the pre-trained\nmodel, we further propose refinement to achieve more accurately the likelihood\nof logical operators for a given syndrome by directly sampling the stabilizer\noperators. We perform numerical experiments on stabilizer codes with small code\ndistances, using both depolarizing error models and error models with\ncorrelated noise. The results show that our approach provides significantly\nbetter decoding accuracy than the minimum weight perfect matching and\nbelief-propagation-based algorithms. Our framework is general and can be\napplied to any error model and quantum codes with different topologies such as\nsurface codes and quantum LDPC codes. Furthermore, it leverages the\nparallelization capabilities of GPUs, enabling simultaneous decoding of a large\nnumber of syndromes. Our approach sheds light on the efficient and accurate\ndecoding of quantum error-correcting codes using generative artificial\nintelligence and modern computational power.\n","authors":["Hanyan Cao","Feng Pan","Yijia Wang","Pan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09025v1.pdf","comment":"Comments are welcome"},{"id":"http://arxiv.org/abs/2307.07250v2","updated":"2023-07-18T07:31:34Z","published":"2023-07-14T09:51:26Z","title":"Mitigating Adversarial Vulnerability through Causal Parameter Estimation\n by Adversarial Double Machine Learning","summary":" Adversarial examples derived from deliberately crafted perturbations on\nvisual inputs can easily harm decision process of deep neural networks. 
To\nprevent potential threats, various adversarial training-based defense methods\nhave grown rapidly and become a de facto standard approach for robustness.\nDespite recent competitive achievements, we observe that adversarial\nvulnerability varies across targets and certain vulnerabilities remain\nprevalent. Intriguingly, such peculiar phenomenon cannot be relieved even with\ndeeper architectures and advanced defense methods. To address this issue, in\nthis paper, we introduce a causal approach called Adversarial Double Machine\nLearning (ADML), which allows us to quantify the degree of adversarial\nvulnerability for network predictions and capture the effect of treatments on\noutcome of interests. ADML can directly estimate causal parameter of\nadversarial perturbations per se and mitigate negative effects that can\npotentially damage robustness, bridging a causal perspective into the\nadversarial vulnerability. Through extensive experiments on various CNN and\nTransformer architectures, we corroborate that ADML improves adversarial\nrobustness with large margins and relieve the empirical observation.\n","authors":["Byung-Kwan Lee","Junho Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2307.07250v2.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09019v1","updated":"2023-07-18T07:15:26Z","published":"2023-07-18T07:15:26Z","title":"U-shaped Transformer: Retain High Frequency Context in Time Series\n Analysis","summary":" Time series prediction plays a crucial role in various industrial fields. In\nrecent years, neural networks with a transformer backbone have achieved\nremarkable success in many domains, including computer vision and NLP. In time\nseries analysis domain, some studies have suggested that even the simplest MLP\nnetworks outperform advanced transformer-based networks on time series forecast\ntasks. However, we believe these findings indicate there to be low-rank\nproperties in time series sequences. In this paper, we consider the low-pass\ncharacteristics of transformers and try to incorporate the advantages of MLP.\nWe adopt skip-layer connections inspired by Unet into traditional transformer\nbackbone, thus preserving high-frequency context from input to output, namely\nU-shaped Transformer. We introduce patch merge and split operation to extract\nfeatures with different scales and use larger datasets to fully make use of the\ntransformer backbone. Our experiments demonstrate that the model performs at an\nadvanced level across multiple datasets with relatively low cost.\n","authors":["Qingkui Chen","Yiqin Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09018v1","updated":"2023-07-18T07:12:46Z","published":"2023-07-18T07:12:46Z","title":"Multimodal LLMs for health grounded in individual-specific data","summary":" Foundation large language models (LLMs) have shown an impressive ability to\nsolve tasks across a wide range of fields including health. To effectively\nsolve personalized health tasks, LLMs need the ability to ingest a diversity of\ndata modalities that are relevant to an individual's health status. In this\npaper, we take a step towards creating multimodal LLMs for health that are\ngrounded in individual-specific data by developing a framework (HeLM: Health\nLarge Language Model for Multimodal Understanding) that enables LLMs to use\nhigh-dimensional clinical modalities to estimate underlying disease risk. 
HeLM\nencodes complex data modalities by learning an encoder that maps them into the\nLLM's token embedding space and for simple modalities like tabular data by\nserializing the data into text. Using data from the UK Biobank, we show that\nHeLM can effectively use demographic and clinical features in addition to\nhigh-dimensional time-series data to estimate disease risk. For example, HeLM\nachieves an AUROC of 0.75 for asthma prediction when combining tabular and\nspirogram data modalities compared with 0.49 when only using tabular data.\nOverall, we find that HeLM outperforms or performs at parity with classical\nmachine learning approaches across a selection of eight binary traits.\nFurthermore, we investigate the downstream uses of this model such as its\ngeneralizability to out-of-distribution traits and its ability to power\nconversations around individual health and wellness.\n","authors":["Anastasiya Belyaeva","Justin Cosentino","Farhad Hormozdiari","Cory Y. McLean","Nicholas A. Furlotte"],"pdf_url":"https://arxiv.org/pdf/2307.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09009v1","updated":"2023-07-18T06:56:08Z","published":"2023-07-18T06:56:08Z","title":"How is ChatGPT's behavior changing over time?","summary":" GPT-3.5 and GPT-4 are the two most widely used large language model (LLM)\nservices. However, when and how these models are updated over time is opaque.\nHere, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4 on\nfour diverse tasks: 1) solving math problems, 2) answering sensitive/dangerous\nquestions, 3) generating code and 4) visual reasoning. We find that the\nperformance and behavior of both GPT-3.5 and GPT-4 can vary greatly over time.\nFor example, GPT-4 (March 2023) was very good at identifying prime numbers\n(accuracy 97.6%) but GPT-4 (June 2023) was very poor on these same questions\n(accuracy 2.4%). Interestingly, GPT-3.5 (June 2023) was much better than GPT-3.5\n(March 2023) in this task. GPT-4 was less willing to answer sensitive questions\nin June than in March, and both GPT-4 and GPT-3.5 had more formatting mistakes\nin code generation in June than in March. Overall, our findings show that the\nbehavior of the same LLM service can change substantially in a relatively short\namount of time, highlighting the need for continuous monitoring of LLM quality.\n","authors":["Lingjiao Chen","Matei Zaharia","James Zou"],"pdf_url":"https://arxiv.org/pdf/2307.09009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06825v2","updated":"2023-07-18T06:50:43Z","published":"2023-03-13T02:50:59Z","title":"Best-of-three-worlds Analysis for Linear Bandits with\n Follow-the-regularized-leader Algorithm","summary":" The linear bandit problem has been studied for many years in both stochastic\nand adversarial settings. Designing an algorithm that can optimize the\nenvironment without knowing the loss type attracts lots of interest.\n\\citet{LeeLWZ021} propose an algorithm that actively detects the loss type and\nthen switches between different algorithms specially designed for specific\nsettings. However, such an approach requires meticulous designs to perform well\nin all environments. Follow-the-regularized-leader (FTRL) is another type of\npopular algorithm that can adapt to different environments. This algorithm is\nof simple design and the regret bounds are shown to be optimal in traditional\nmulti-armed bandit problems compared with the detect-switch type. 
Designing an\nFTRL-type algorithm for linear bandits is an important question that has been\nopen for a long time. In this paper, we prove that the FTRL algorithm with a\nnegative entropy regularizer can achieve the best-of-three-world results for\nthe linear bandit problem. Our regret bounds achieve the same or nearly the\nsame order as the previous detect-switch type algorithm but with a much simpler\nalgorithmic design.\n","authors":["Fang Kong","Canzhe Zhao","Shuai Li"],"pdf_url":"https://arxiv.org/pdf/2303.06825v2.pdf","comment":"Accepted in COLT 2023"},{"id":"http://arxiv.org/abs/2307.09006v1","updated":"2023-07-18T06:48:39Z","published":"2023-07-18T06:48:39Z","title":"OxfordVGG Submission to the EGO4D AV Transcription Challenge","summary":" This report presents the technical details of our submission on the EGO4D\nAudio-Visual (AV) Automatic Speech Recognition Challenge 2023 from the\nOxfordVGG team. We present WhisperX, a system for efficient speech\ntranscription of long-form audio with word-level time alignment, along with two\ntext normalisers which are publicly available. Our final submission obtained\n56.0% of the Word Error Rate (WER) on the challenge test set, ranked 1st on the\nleaderboard. All baseline codes and models are available on\nhttps://github.com/m-bain/whisperX.\n","authors":["Jaesung Huh","Max Bain","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2307.09006v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.08999v1","updated":"2023-07-18T06:34:32Z","published":"2023-07-18T06:34:32Z","title":"Oracle Efficient Online Multicalibration and Omniprediction","summary":" A recent line of work has shown a surprising connection between\nmulticalibration, a multi-group fairness notion, and omniprediction, a learning\nparadigm that provides simultaneous loss minimization guarantees for a large\nfamily of loss functions. Prior work studies omniprediction in the batch\nsetting. We initiate the study of omniprediction in the online adversarial\nsetting. Although there exist algorithms for obtaining notions of\nmulticalibration in the online adversarial setting, unlike batch algorithms,\nthey work only for small finite classes of benchmark functions $F$, because\nthey require enumerating every function $f \\in F$ at every round. In contrast,\nomniprediction is most interesting for learning theoretic hypothesis classes\n$F$, which are generally continuously large.\n We develop a new online multicalibration algorithm that is well defined for\ninfinite benchmark classes $F$, and is oracle efficient (i.e. for any class\n$F$, the algorithm has the form of an efficient reduction to a no-regret\nlearning algorithm for $F$). The result is the first efficient online\nomnipredictor -- an oracle efficient prediction algorithm that can be used to\nsimultaneously obtain no regret guarantees to all Lipschitz convex loss\nfunctions. For the class $F$ of linear functions, we show how to make our\nalgorithm efficient in the worst case. Also, we show upper and lower bounds on\nthe extent to which our rates can be improved: our oracle efficient algorithm\nactually promises a stronger guarantee called swap-omniprediction, and we prove\na lower bound showing that obtaining $O(\\sqrt{T})$ bounds for\nswap-omniprediction is impossible in the online setting. 
On the other hand, we\ngive a (non-oracle efficient) algorithm which can obtain the optimal\n$O(\\sqrt{T})$ omniprediction bounds without going through multicalibration,\ngiving an information theoretic separation between these two solution concepts.\n","authors":["Sumegha Garg","Christopher Jung","Omer Reingold","Aaron Roth"],"pdf_url":"https://arxiv.org/pdf/2307.08999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08989v1","updated":"2023-07-18T06:01:37Z","published":"2023-07-18T06:01:37Z","title":"GraphCL-DTA: a graph contrastive learning with molecular semantics for\n drug-target binding affinity prediction","summary":" Drug-target binding affinity prediction plays an important role in the early\nstages of drug discovery, which can infer the strength of interactions between\nnew drugs and new targets. However, the performance of previous computational\nmodels is limited by the following drawbacks. The learning of drug\nrepresentation relies only on supervised data, without taking into account the\ninformation contained in the molecular graph itself. Moreover, most previous\nstudies tended to design complicated representation learning module, while\nuniformity, which is used to measure representation quality, is ignored. In\nthis study, we propose GraphCL-DTA, a graph contrastive learning with molecular\nsemantics for drug-target binding affinity prediction. In GraphCL-DTA, we\ndesign a graph contrastive learning framework for molecular graphs to learn\ndrug representations, so that the semantics of molecular graphs are preserved.\nThrough this graph contrastive framework, a more essential and effective drug\nrepresentation can be learned without additional supervised data. Next, we\ndesign a new loss function that can be directly used to smoothly adjust the\nuniformity of drug and target representations. By directly optimizing the\nuniformity of representations, the representation quality of drugs and targets\ncan be improved. The effectiveness of the above innovative elements is verified\non two real datasets, KIBA and Davis. The excellent performance of GraphCL-DTA\non the above datasets suggests its superiority to the state-of-the-art model.\n","authors":["Xinxing Yang","Genke Yang","Jian Chu"],"pdf_url":"https://arxiv.org/pdf/2307.08989v1.pdf","comment":"13 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.08982v1","updated":"2023-07-18T05:39:32Z","published":"2023-07-18T05:39:32Z","title":"Neural Network Pruning as Spectrum Preserving Process","summary":" Neural networks have achieved remarkable performance in various application\ndomains. Nevertheless, a large number of weights in pre-trained deep neural\nnetworks prohibit them from being deployed on smartphones and embedded systems.\nIt is highly desirable to obtain lightweight versions of neural networks for\ninference in edge devices. Many cost-effective approaches were proposed to\nprune dense and convolutional layers that are common in deep neural networks\nand dominant in the parameter space. However, a unified theoretical foundation\nfor the problem mostly is missing. In this paper, we identify the close\nconnection between matrix spectrum learning and neural network training for\ndense and convolutional layers and argue that weight pruning is essentially a\nmatrix sparsification process to preserve the spectrum. Based on the analysis,\nwe also propose a matrix sparsification algorithm tailored for neural network\npruning that yields better pruning result. 
We carefully design and conduct\nexperiments to support our arguments. Hence we provide a consolidated viewpoint\nfor neural network pruning and enhance the interpretability of deep neural\nnetworks by identifying and preserving the critical neural weights.\n","authors":["Shibo Yao","Dantong Yu","Ioannis Koutis"],"pdf_url":"https://arxiv.org/pdf/2307.08982v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2304.03452"},{"id":"http://arxiv.org/abs/2307.08970v1","updated":"2023-07-18T05:04:11Z","published":"2023-07-18T05:04:11Z","title":"A Unifying Framework for Differentially Private Sums under Continual\n Observation","summary":" We study the problem of maintaining a differentially private decaying sum\nunder continual observation. We give a unifying framework and an efficient\nalgorithm for this problem for \\emph{any sufficiently smooth} function. Our\nalgorithm is the first differentially private algorithm that does not have a\nmultiplicative error for polynomially-decaying weights. Our algorithm improves\non all prior works on differentially private decaying sums under continual\nobservation and recovers exactly the additive error for the special case of\ncontinual counting from Henzinger et al. (SODA 2023) as a corollary.\n Our algorithm is a variant of the factorization mechanism whose error depends\non the $\\gamma_2$ and $\\gamma_F$ norm of the underlying matrix. We give a\nconstructive proof for an almost exact upper bound on the $\\gamma_2$ and\n$\\gamma_F$ norm and an almost tight lower bound on the $\\gamma_2$ norm for a\nlarge class of lower-triangular matrices. This is the first non-trivial lower\nbound for lower-triangular matrices whose non-zero entries are not all the\nsame. It includes matrices for all continual decaying sums problems, resulting\nin an upper bound on the additive error of any differentially private decaying\nsums algorithm under continual observation.\n We also explore some implications of our result in discrepancy theory and\noperator algebra. Given the importance of the $\\gamma_2$ norm in computer\nscience and the extensive work in mathematics, we believe our result will have\nfurther applications.\n","authors":["Monika Henzinger","Jalaj Upadhyay","Sarvagya Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2307.08970v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2306.16844v2","updated":"2023-07-18T04:35:03Z","published":"2023-06-29T10:34:23Z","title":"Macro Placement by Wire-Mask-Guided Black-Box Optimization","summary":" The development of very large-scale integration (VLSI) technology has posed\nnew challenges for electronic design automation (EDA) techniques in chip\nfloorplanning. During this process, macro placement is an important subproblem,\nwhich tries to determine the positions of all macros with the aim of minimizing\nhalf-perimeter wirelength (HPWL) and avoiding overlapping. Previous methods\ninclude packing-based, analytical and reinforcement learning methods. In this\npaper, we propose a new black-box optimization (BBO) framework (called\nWireMask-BBO) for macro placement, by using a wire-mask-guided greedy procedure\nfor objective evaluation. Equipped with different BBO algorithms, WireMask-BBO\nempirically achieves significant improvements over previous methods, i.e.,\nachieves significantly shorter HPWL by using much less time. Furthermore, it\ncan fine-tune existing placements by treating them as initial solutions, which\ncan bring up to 50% improvement in HPWL. 
WireMask-BBO has the potential to\nsignificantly improve the quality and efficiency of chip floorplanning, which\nmakes it appealing to researchers and practitioners in EDA and will also\npromote the application of BBO.\n","authors":["Yunqi Shi","Ke Xue","Lei Song","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2306.16844v2.pdf","comment":"Update Table1 number dislocation"},{"id":"http://arxiv.org/abs/2307.08964v1","updated":"2023-07-18T04:29:16Z","published":"2023-07-18T04:29:16Z","title":"Landscape Surrogate: Learning Decision Losses for Mathematical\n Optimization Under Partial Information","summary":" Recent works in learning-integrated optimization have shown promise in\nsettings where the optimization problem is only partially observed or where\ngeneral-purpose optimizers perform poorly without expert tuning. By learning an\noptimizer $\\mathbf{g}$ to tackle these challenging problems with $f$ as the\nobjective, the optimization process can be substantially accelerated by\nleveraging past experience. The optimizer can be trained with supervision from\nknown optimal solutions or implicitly by optimizing the compound function\n$f\\circ \\mathbf{g}$. The implicit approach may not require optimal solutions as\nlabels and is capable of handling problem uncertainty; however, it is slow to\ntrain and deploy due to frequent calls to optimizer $\\mathbf{g}$ during both\ntraining and testing. The training is further challenged by sparse gradients of\n$\\mathbf{g}$, especially for combinatorial solvers. To address these\nchallenges, we propose using a smooth and learnable Landscape Surrogate $M$ as\na replacement for $f\\circ \\mathbf{g}$. This surrogate, learnable by neural\nnetworks, can be computed faster than the solver $\\mathbf{g}$, provides dense\nand smooth gradients during training, can generalize to unseen optimization\nproblems, and is efficiently learned via alternating optimization. We test our\napproach on both synthetic problems, including shortest path and\nmultidimensional knapsack, and real-world problems such as portfolio\noptimization, achieving comparable or superior objective values compared to\nstate-of-the-art baselines while reducing the number of calls to $\\mathbf{g}$.\nNotably, our approach outperforms existing methods for computationally\nexpensive high-dimensional problems.\n","authors":["Arman Zharmagambetov","Brandon Amos","Aaron Ferber","Taoan Huang","Bistra Dilkina","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2307.08964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08962v1","updated":"2023-07-18T04:26:33Z","published":"2023-07-18T04:26:33Z","title":"REX: Rapid Exploration and eXploitation for AI Agents","summary":" In this paper, we propose an enhanced approach for Rapid Exploration and\neXploitation for AI Agents called REX. Existing AutoGPT-style techniques have\ninherent limitations, such as a heavy reliance on precise descriptions for\ndecision-making, and the lack of a systematic approach to leverage try-and-fail\nprocedures akin to traditional Reinforcement Learning (RL). REX introduces an\nadditional layer of rewards and integrates concepts similar to Upper Confidence\nBound (UCB) scores, leading to more robust and efficient AI agent performance.\nThis approach has the advantage of enabling the utilization of offline\nbehaviors from logs and allowing seamless integration with existing foundation\nmodels while it does not require any model fine-tuning. 
Through comparative\nanalysis with existing methods such as Chain-of-Thoughts(CoT) and Reasoning viA\nPlanning(RAP), REX-based methods demonstrate comparable performance and, in\ncertain cases, even surpass the results achieved by these existing techniques.\nNotably, REX-based methods exhibit remarkable reductions in execution time,\nenhancing their practical applicability across a diverse set of scenarios.\n","authors":["Rithesh Murthy","Shelby Heinecke","Juan Carlos Niebles","Zhiwei Liu","Le Xue","Weiran Yao","Yihao Feng","Zeyuan Chen","Akash Gokul","Devansh Arpit","Ran Xu","Phil Mui","Huan Wang","Caiming Xiong","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2307.08962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08955v1","updated":"2023-07-18T03:48:27Z","published":"2023-07-18T03:48:27Z","title":"Discretization-based ensemble model for robust learning in IoT","summary":" IoT device identification is the process of recognizing and verifying\nconnected IoT devices to the network. This is an essential process for ensuring\nthat only authorized devices can access the network, and it is necessary for\nnetwork management and maintenance. In recent years, machine learning models\nhave been used widely for automating the process of identifying devices in the\nnetwork. However, these models are vulnerable to adversarial attacks that can\ncompromise their accuracy and effectiveness. To better secure device\nidentification models, discretization techniques enable reduction in the\nsensitivity of machine learning models to adversarial attacks contributing to\nthe stability and reliability of the model. On the other hand, Ensemble methods\ncombine multiple heterogeneous models to reduce the impact of remaining noise\nor errors in the model. Therefore, in this paper, we integrate discretization\ntechniques and ensemble methods and examine it on model robustness against\nadversarial attacks. In other words, we propose a discretization-based ensemble\nstacking technique to improve the security of our ML models. We evaluate the\nperformance of different ML-based IoT device identification models against\nwhite box and black box attacks using a real-world dataset comprised of network\ntraffic from 28 IoT devices. We demonstrate that the proposed method enables\nrobustness to the models for IoT device identification.\n","authors":["Anahita Namvar","Chandra Thapa","Salil S. Kanhere"],"pdf_url":"https://arxiv.org/pdf/2307.08955v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.08951v1","updated":"2023-07-18T03:39:03Z","published":"2023-07-18T03:39:03Z","title":"Knowledge-infused Deep Learning Enables Interpretable Landslide\n Forecasting","summary":" Forecasting how landslides will evolve over time or whether they will fail is\na challenging task due to a variety of factors, both internal and external.\nDespite their considerable potential to address these challenges, deep learning\ntechniques lack interpretability, undermining the credibility of the forecasts\nthey produce. The recent development of transformer-based deep learning offers\nuntapped possibilities for forecasting landslides with unprecedented\ninterpretability and nonlinear feature learning capabilities. 
Here, we present\na deep learning pipeline that is capable of predicting landslide behavior\nholistically, which employs a transformer-based network called LFIT to learn\ncomplex nonlinear relationships from prior knowledge and multiple source data,\nidentifying the most relevant variables, and demonstrating a comprehensive\nunderstanding of landslide evolution and temporal patterns. By integrating\nprior knowledge, we provide improvement in holistic landslide forecasting,\nenabling us to capture diverse responses to various influencing factors in\ndifferent local landslide areas. Using deformation observations as proxies for\nmeasuring the kinetics of landslides, we validate our approach by training\nmodels to forecast reservoir landslides in the Three Gorges Reservoir and\ncreeping landslides on the Tibetan Plateau. When prior knowledge is\nincorporated, we show that interpretable landslide forecasting effectively\nidentifies influential factors across various landslides. It further elucidates\nhow local areas respond to these factors, making landslide behavior and trends\nmore interpretable and predictable. The findings from this study will\ncontribute to understanding landslide behavior in a new way and make the\nproposed approach applicable to other complex disasters influenced by internal\nand external factors in the future.\n","authors":["Zhengjing Ma","Gang Mei"],"pdf_url":"https://arxiv.org/pdf/2307.08951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08949v1","updated":"2023-07-18T03:34:33Z","published":"2023-07-18T03:34:33Z","title":"Alioth: A Machine Learning Based Interference-Aware Performance Monitor\n for Multi-Tenancy Applications in Public Cloud","summary":" Multi-tenancy in public clouds may lead to co-location interference on shared\nresources, which possibly results in performance degradation of cloud\napplications. Cloud providers want to know when such events happen and how\nserious the degradation is, to perform interference-aware migrations and\nalleviate the problem. However, virtual machines (VM) in\nInfrastructure-as-a-Service public clouds are black-boxes to providers, where\napplication-level performance information cannot be acquired. This makes\nperformance monitoring intensely challenging as cloud providers can only rely\non low-level metrics such as CPU usage and hardware counters.\n We propose a novel machine learning framework, Alioth, to monitor the\nperformance degradation of cloud applications. To feed the data-hungry models,\nwe first elaborate interference generators and conduct comprehensive\nco-location experiments on a testbed to build Alioth-dataset which reflects the\ncomplexity and dynamicity in real-world scenarios. Then we construct Alioth by\n(1) augmenting features via recovering low-level metrics under no interference\nusing denoising auto-encoders, (2) devising a transfer learning model based on\ndomain adaptation neural network to make models generalize on test cases unseen\nin offline training, and (3) developing a SHAP explainer to automate feature\nselection and enhance model interpretability. Experiments show that Alioth\nachieves an average mean absolute error of 5.29% offline and 10.8% when testing\non applications unseen in the training stage, outperforming the baseline\nmethods. Alioth is also robust in signaling quality-of-service violation under\ndynamicity. Finally, we demonstrate a possible application of Alioth's\ninterpretability, providing insights to benefit the decision-making of cloud\noperators. 
The dataset and code of Alioth have been released on GitHub.\n","authors":["Tianyao Shi","Yingxuan Yang","Yunlong Cheng","Xiaofeng Gao","Zhen Fang","Yongqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08949v1.pdf","comment":"Accepted by 2023 IEEE International Parallel & Distributed Processing\n Symposium (IPDPS)"},{"id":"http://arxiv.org/abs/2307.08945v1","updated":"2023-07-18T03:28:03Z","published":"2023-07-18T03:28:03Z","title":"Mitigating Label Bias via Decoupled Confident Learning","summary":" Growing concerns regarding algorithmic fairness have led to a surge in\nmethodologies to mitigate algorithmic bias. However, such methodologies largely\nassume that observed labels in training data are correct. This is problematic\nbecause bias in labels is pervasive across important domains, including\nhealthcare, hiring, and content moderation. In particular, human-generated\nlabels are prone to encoding societal biases. While the presence of labeling\nbias has been discussed conceptually, there is a lack of methodologies to\naddress this problem. We propose a pruning method -- Decoupled Confident\nLearning (DeCoLe) -- specifically designed to mitigate label bias. After\nillustrating its performance on a synthetic dataset, we apply DeCoLe in the\ncontext of hate speech detection, where label bias has been recognized as an\nimportant challenge, and show that it successfully identifies biased labels and\noutperforms competing approaches.\n","authors":["Yunyi Li","Maria De-Arteaga","Maytal Saar-Tsechansky"],"pdf_url":"https://arxiv.org/pdf/2307.08945v1.pdf","comment":"AI & HCI Workshop at the 40th International Conference on Machine\n Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2307.08944v1","updated":"2023-07-18T03:23:34Z","published":"2023-07-18T03:23:34Z","title":"Siamese Networks for Weakly Supervised Human Activity Recognition","summary":" Deep learning has been successfully applied to human activity recognition.\nHowever, training deep neural networks requires explicitly labeled data which\nis difficult to acquire. In this paper, we present a model with multiple\nsiamese networks that are trained by using only the information about the\nsimilarity between pairs of data samples without knowing the explicit labels.\nThe trained model maps the activity data samples into fixed size representation\nvectors such that the distance between the vectors in the representation space\napproximates the similarity of the data samples in the input space. Thus, the\ntrained model can work as a metric for a wide range of different clustering\nalgorithms. The training process minimizes a similarity loss function that\nforces the distance metric to be small for pairs of samples from the same kind\nof activity, and large for pairs of samples from different kinds of activities.\nWe evaluate the model on three datasets to verify its effectiveness in\nsegmentation and recognition of continuous human activity sequences.\n","authors":["Taoran Sheng","Manfred Huber"],"pdf_url":"https://arxiv.org/pdf/2307.08944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08941v1","updated":"2023-07-18T03:12:51Z","published":"2023-07-18T03:12:51Z","title":"NTK-approximating MLP Fusion for Efficient Language Model Fine-tuning","summary":" Fine-tuning a pre-trained language model (PLM) emerges as the predominant\nstrategy in many natural language processing applications. 
However, even\nfine-tuning the PLMs and doing inference are expensive, especially on edge\ndevices with low computing power. Some general approaches (e.g. quantization\nand distillation) have been widely studied to reduce the compute/memory of PLM\nfine-tuning, while very few one-shot compression techniques are explored. In\nthis paper, we investigate the neural tangent kernel (NTK)--which reveals the\ngradient descent dynamics of neural networks--of the multilayer perceptrons\n(MLP) modules in a PLM and propose to coin a lightweight PLM through\nNTK-approximating MLP fusion. To achieve this, we reconsider the MLP as a\nbundle of sub-MLPs, and cluster them into a given number of centroids, which\ncan then be restored as a compressed MLP and surprisingly shown to well\napproximate the NTK of the original PLM. Extensive experiments of PLM\nfine-tuning on both natural language understanding (NLU) and generation (NLG)\ntasks are provided to verify the effectiveness of the proposed method MLP\nfusion. Our code is available at https://github.com/weitianxin/MLP_Fusion.\n","authors":["Tianxin Wei","Zeming Guo","Yifan Chen","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2307.08941v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.08939v1","updated":"2023-07-18T03:12:03Z","published":"2023-07-18T03:12:03Z","title":"Experimental Security Analysis of DNN-based Adaptive Cruise Control\n under Context-Aware Perception Attacks","summary":" Adaptive Cruise Control (ACC) is a widely used driver assistance feature for\nmaintaining desired speed and safe distance to the leading vehicles. This paper\nevaluates the security of the deep neural network (DNN) based ACC systems under\nstealthy perception attacks that strategically inject perturbations into camera\ndata to cause forward collisions. We present a combined\nknowledge-and-data-driven approach to design a context-aware strategy for the\nselection of the most critical times for triggering the attacks and a novel\noptimization-based method for the adaptive generation of image perturbations at\nrun-time. We evaluate the effectiveness of the proposed attack using an actual\ndriving dataset and a realistic simulation platform with the control software\nfrom a production ACC system and a physical-world driving simulator while\nconsidering interventions by the driver and safety features such as Automatic\nEmergency Braking (AEB) and Forward Collision Warning (FCW). Experimental\nresults show that the proposed attack achieves 142.9x higher success rate in\ncausing accidents than random attacks and is mitigated 89.6% less by the safety\nfeatures while being stealthy and robust to real-world factors and dynamic\nchanges in the environment. This study provides insights into the role of human\noperators and basic safety interventions in preventing attacks.\n","authors":["Xugui Zhou","Anqi Chen","Maxfield Kouzel","Haotian Ren","Morgan McCarty","Cristina Nita-Rotaru","Homa Alemzadeh"],"pdf_url":"https://arxiv.org/pdf/2307.08939v1.pdf","comment":"18 pages, 14 figures, 8 tables"},{"id":"http://arxiv.org/abs/2303.13024v3","updated":"2023-07-18T03:06:42Z","published":"2023-03-23T04:16:00Z","title":"Identifying TBI Physiological States by Clustering Multivariate Clinical\n Time-Series Data","summary":" Determining clinically relevant physiological states from multivariate time\nseries data with missing values is essential for providing appropriate\ntreatment for acute conditions such as Traumatic Brain Injury (TBI),\nrespiratory failure, and heart failure. 
Utilizing non-temporal clustering or\ndata imputation and aggregation techniques may lead to loss of valuable\ninformation and biased analyses. In our study, we apply the SLAC-Time\nalgorithm, an innovative self-supervision-based approach that maintains data\nintegrity by avoiding imputation or aggregation, offering a more useful\nrepresentation of acute patient states. By using SLAC-Time to cluster data in a\nlarge research dataset, we identified three distinct TBI physiological states\nand their specific feature profiles. We employed various clustering evaluation\nmetrics and incorporated input from a clinical domain expert to validate and\ninterpret the identified physiological states. Further, we discovered how\nspecific clinical events and interventions can influence patient states and\nstate transitions.\n","authors":["Hamid Ghaderi","Brandon Foreman","Amin Nayebi","Sindhu Tipirneni","Chandan K. Reddy","Vignesh Subbian"],"pdf_url":"https://arxiv.org/pdf/2303.13024v3.pdf","comment":"10 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.08934v1","updated":"2023-07-18T02:47:32Z","published":"2023-07-18T02:47:32Z","title":"Multi-stage Neural Networks: Function Approximator of Machine Precision","summary":" Deep learning techniques are increasingly applied to scientific problems,\nwhere the precision of networks is crucial. Despite being deemed universal\nfunction approximators, neural networks, in practice, struggle to reduce the\nprediction errors below $O(10^{-5})$ even with large network size and extended\ntraining iterations. To address this issue, we developed multi-stage neural\nnetworks that divide the training process into different stages, with each\nstage using a new network that is optimized to fit the residue from the\nprevious stage. Across successive stages, the residue magnitudes decrease\nsubstantially and follow an inverse power-law relationship with the residue\nfrequencies. The multi-stage neural networks effectively mitigate the spectral\nbiases associated with regular neural networks, enabling them to capture the\nhigh-frequency features of target functions. We demonstrate that the prediction\nerror from the multi-stage training for both regression problems and\nphysics-informed neural networks can nearly reach the machine precision\n$O(10^{-16})$ of double-precision floating point within a finite number of iterations.\nSuch levels of accuracy are rarely attainable using single neural networks\nalone.\n","authors":["Yongji Wang","Ching-Yao Lai"],"pdf_url":"https://arxiv.org/pdf/2307.08934v1.pdf","comment":"38 pages, 17 pages"},{"id":"http://arxiv.org/abs/2307.08933v1","updated":"2023-07-18T02:43:19Z","published":"2023-07-18T02:43:19Z","title":"IxDRL: A Novel Explainable Deep Reinforcement Learning Toolkit based on\n Analyses of Interestingness","summary":" In recent years, advances in deep learning have resulted in a plethora of\nsuccesses in the use of reinforcement learning (RL) to solve complex sequential\ndecision tasks with high-dimensional inputs. However, existing systems lack the\nnecessary mechanisms to provide humans with a holistic view of their\ncompetence, presenting an impediment to their adoption, particularly in\ncritical applications where the decisions an agent makes can have significant\nconsequences. Yet, existing RL-based systems are essentially competency-unaware\nin that they lack the necessary interpretation mechanisms to allow human\noperators to have an insightful, holistic view of their competency. 
Towards\nmore explainable Deep RL (xDRL), we propose a new framework based on analyses\nof interestingness. Our tool provides various measures of RL agent competence\nstemming from interestingness analysis and is applicable to a wide range of RL\nalgorithms, natively supporting the popular RLLib toolkit. We showcase the use\nof our framework by applying the proposed pipeline in a set of scenarios of\nvarying complexity. We empirically assess the capability of the approach in\nidentifying agent behavior patterns and competency-controlling conditions, and\nthe task elements mostly responsible for an agent's competence, based on global\nand local analyses of interestingness. Overall, we show that our framework can\nprovide agent designers with insights about RL agent competence, both their\ncapabilities and limitations, enabling more informed decisions about\ninterventions, additional training, and other interactions in collaborative\nhuman-machine settings.\n","authors":["Pedro Sequeira","Melinda Gervasio"],"pdf_url":"https://arxiv.org/pdf/2307.08933v1.pdf","comment":"To be published in the Proceedings of the 1st World Conference on\n eXplainable Artificial Intelligence (xAI 2023). arXiv admin note: substantial\n text overlap with arXiv:2211.06376"},{"id":"http://arxiv.org/abs/2307.08929v1","updated":"2023-07-18T02:28:48Z","published":"2023-07-18T02:28:48Z","title":"On-the-fly machine learning for parametrization of the effective\n Hamiltonian","summary":" The first-principles-based effective Hamiltonian is widely used to predict\nand simulate the properties of ferroelectrics and relaxor ferroelectrics.\nHowever, the parametrization method of the effective Hamiltonian is complicated\nand can hardly resolve systems with complex interactions and/or complex\ncomponents. Here, we developed an on-the-fly machine learning approach to\nparametrize the effective Hamiltonian based on Bayesian linear regression. The\nparametrization is completed in molecular dynamics simulations, with the\nenergy, forces and stress predicted at each step along with their\nuncertainties. First-principles calculations are executed when the\nuncertainties are large to retrain the parameters. This approach provides a\nuniversal and automatic way to compute the effective Hamiltonian parameters for\nany considered system, including complex systems which previous methods cannot\nhandle. BaTiO3 and Pb(Sc,Ta)O3 are taken as examples to show the accuracy\nof this approach compared with the conventional first-principles parametrization\nmethod.\n","authors":["Xingyue Ma","L. Bellaiche","Di Wu","Yurong Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08929v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.07916v2","updated":"2023-07-18T02:26:30Z","published":"2023-07-16T01:45:00Z","title":"On the Robustness of Split Learning against Adversarial Attacks","summary":" Split learning enables collaborative deep learning model training while\npreserving data privacy and model security by avoiding direct sharing of raw\ndata and model details (i.e., server and clients only hold partial sub-networks\nand exchange intermediate computations). However, existing research has mainly\nfocused on examining its reliability for privacy protection, with little\ninvestigation into model security. 
Specifically, by exploring full models,\nattackers can launch adversarial attacks, and split learning can mitigate this\nsevere threat by disclosing only part of the model to untrusted servers. This paper\naims to evaluate the robustness of split learning against adversarial attacks,\nparticularly in the most challenging setting where untrusted servers only have\naccess to the intermediate layers of the model. Existing adversarial attacks\nmostly focus on the centralized setting instead of the collaborative setting;\nthus, to better evaluate the robustness of split learning, we develop a\ntailored attack called SPADV, which comprises two stages: 1) shadow model\ntraining that addresses the issue of lacking part of the model and 2) local\nadversarial attack that produces adversarial examples to evaluate. The first\nstage only requires a few unlabeled non-IID data, and, in the second stage,\nSPADV perturbs the intermediate output of natural samples to craft the\nadversarial ones. The overall cost of the proposed attack process is relatively\nlow, yet the empirical attack effectiveness is significantly high,\ndemonstrating the surprising vulnerability of split learning to adversarial\nattacks.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Wenmeng Zhou","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.07916v2.pdf","comment":"accepted by ECAI 2023, camera-ready version"},{"id":"http://arxiv.org/abs/2306.07528v2","updated":"2023-07-18T02:18:17Z","published":"2023-06-13T03:46:22Z","title":"Unified Off-Policy Learning to Rank: a Reinforcement Learning\n Perspective","summary":" Off-policy Learning to Rank (LTR) aims to optimize a ranker from data\ncollected by a deployed logging policy. However, existing off-policy learning\nto rank methods often make strong assumptions about how users generate the\nclick data, i.e., the click model, and hence need to tailor their methods\nspecifically under different click models. In this paper, we unify the\nranking process under general stochastic click models as a Markov Decision\nProcess (MDP), so that the optimal ranking can be learned directly with offline\nreinforcement learning (RL). Building upon this, we leverage offline\nRL techniques for off-policy LTR and propose the Click Model-Agnostic Unified\nOff-policy Learning to Rank (CUOLR) method, which can be easily applied to a\nwide range of click models. Through a dedicated formulation of the MDP, we show\nthat offline RL algorithms can adapt to various click models without complex\ndebiasing techniques and prior knowledge of the model. Results on various\nlarge-scale datasets demonstrate that CUOLR consistently outperforms the\nstate-of-the-art off-policy learning to rank algorithms while maintaining\nconsistency and robustness under different click models.\n","authors":["Zeyu Zhang","Yi Su","Hui Yuan","Yiran Wu","Rishab Balasubramanian","Qingyun Wu","Huazheng Wang","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.07528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08925v1","updated":"2023-07-18T02:09:14Z","published":"2023-07-18T02:09:14Z","title":"Federated Large Language Model: A Position Paper","summary":" Large-scale language models (LLMs) have received significant attention and\nfound diverse applications across various domains, but their development\nencounters challenges in real-world scenarios. These challenges arise due to\nthe scarcity of public domain data and the need to maintain\nprivacy with respect to private domain data. 
To address these issues, federated\nlearning (FL) has emerged as a promising technology that enables collaborative\ntraining of shared models while preserving decentralized data. We propose the\nconcept of federated LLM, which comprises three key components, i.e., federated\nLLM pre-training, federated LLM fine-tuning, and federated LLM prompt\nengineering. For each component, we discuss its advantage over traditional LLM\ntraining methods and propose specific engineering strategies for\nimplementation. Furthermore, we explore the novel challenges introduced by the\nintegration of FL and LLM. We analyze existing solutions and identify potential\nobstacles faced by these solutions within the context of federated LLM.\n","authors":["Chaochao Chen","Xiaohua Feng","Jun Zhou","Jianwei Yin","Xiaolin Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08925v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.08924v1","updated":"2023-07-18T01:53:18Z","published":"2023-07-18T01:53:18Z","title":"Learning to Sample Tasks for Meta Learning","summary":" Through experiments on various meta-learning methods, task samplers, and\nfew-shot learning tasks, this paper arrives at three conclusions. Firstly,\nthere are no universal task sampling strategies to guarantee the performance of\nmeta-learning models. Secondly, task diversity can cause the models to either\nunderfit or overfit during training. Lastly, the generalization performance of\nthe models are influenced by task divergence, task entropy, and task\ndifficulty. In response to these findings, we propose a novel task sampler\ncalled Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes\ntask divergence, task entropy, and task difficulty to sample tasks. To optimize\nASr, we rethink and propose a simple and general meta-learning algorithm.\nFinally, a large number of empirical experiments demonstrate the effectiveness\nof the proposed ASr.\n","authors":["Jingyao Wang","Zeen Song","Xingzhe Su","Lingyu Si","Hongwei Dong","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08924v1.pdf","comment":"10 pages, 7 tables, 3 figures"},{"id":"http://arxiv.org/abs/2307.08921v1","updated":"2023-07-18T01:37:57Z","published":"2023-07-18T01:37:57Z","title":"Optimistic Estimate Uncovers the Potential of Nonlinear Models","summary":" We propose an optimistic estimate to evaluate the best possible fitting\nperformance of nonlinear models. It yields an optimistic sample size that\nquantifies the smallest possible sample size to fit/recover a target function\nusing a nonlinear model. We estimate the optimistic sample sizes for matrix\nfactorization models, deep models, and deep neural networks (DNNs) with\nfully-connected or convolutional architecture. For each nonlinear model, our\nestimates predict a specific subset of targets that can be fitted at\noverparameterization, which are confirmed by our experiments. Our optimistic\nestimate reveals two special properties of the DNN models -- free\nexpressiveness in width and costly expressiveness in connection. These\nproperties suggest the following architecture design principles of DNNs: (i)\nfeel free to add neurons/kernels; (ii) restrain from connecting neurons.\nOverall, our optimistic estimate theoretically unveils the vast potential of\nnonlinear models in fitting at overparameterization. 
Based on this framework,\nwe anticipate gaining a deeper understanding of how and why numerous nonlinear\nmodels such as DNNs can effectively realize their potential in practice in the\nnear future.\n","authors":["Yaoyu Zhang","Zhongwang Zhang","Leyang Zhang","Zhiwei Bai","Tao Luo","Zhi-Qin John Xu"],"pdf_url":"https://arxiv.org/pdf/2307.08921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08920v1","updated":"2023-07-18T01:36:43Z","published":"2023-07-18T01:36:43Z","title":"Continuous-Time Reinforcement Learning: New Design Algorithms with\n Theoretical Insights and Performance Guarantees","summary":" Continuous-time nonlinear optimal control problems hold great promise in\nreal-world applications. After decades of development, reinforcement learning\n(RL) has achieved some of the greatest successes as a general nonlinear control\ndesign method. However, a recent comprehensive analysis of state-of-the-art\ncontinuous-time RL (CT-RL) methods, namely, adaptive dynamic programming\n(ADP)-based CT-RL algorithms, reveals they face significant design challenges\ndue to their complexity, numerical conditioning, and dimensional scaling\nissues. Despite advanced theoretical results, existing ADP CT-RL synthesis\nmethods are inadequate in solving even small, academic problems. The goal of\nthis work is thus to introduce a suite of new CT-RL algorithms for control of\naffine nonlinear systems. Our design approach relies on two important factors.\nFirst, our methods are applicable to physical systems that can be partitioned\ninto smaller subproblems. This constructive consideration results in reduced\ndimensionality and greatly improved intuitiveness of design. Second, we\nintroduce a new excitation framework to improve persistence of excitation (PE)\nand numerical conditioning performance via classical input/output insights.\nSuch a design-centric approach is the first of its kind in the ADP CT-RL\ncommunity. In this paper, we progressively introduce a suite of (decentralized)\nexcitable integral reinforcement learning (EIRL) algorithms. We provide\nconvergence and closed-loop stability guarantees, and we demonstrate these\nguarantees on a significant application problem of controlling an unstable,\nnonminimum phase hypersonic vehicle (HSV).\n","authors":["Brent A. Wallace","Jennie Si"],"pdf_url":"https://arxiv.org/pdf/2307.08920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02877v2","updated":"2023-07-18T01:34:22Z","published":"2023-01-07T15:39:48Z","title":"Deep Learning for Mean Field Games with non-separable Hamiltonians","summary":" This paper introduces a new method based on Deep Galerkin Methods (DGMs) for\nsolving high-dimensional stochastic Mean Field Games (MFGs). We achieve this by\nusing two neural networks to approximate the unknown solutions of the MFG\nsystem and forward-backward conditions. Our method is efficient, even with a\nsmall number of iterations, and is capable of handling up to 300 dimensions\nwith a single layer, which makes it faster than other approaches. In contrast,\nmethods based on Generative Adversarial Networks (GANs) cannot solve MFGs with\nnon-separable Hamiltonians. We demonstrate the effectiveness of our approach by\napplying it to a traffic flow problem, which was previously solved using the\nNewton iteration method only in the deterministic case. We compare the results\nof our method to analytical solutions and previous approaches, showing its\nefficiency. 
We also prove the convergence of our neural network approximation\nwith a single hidden layer using the universal approximation theorem.\n","authors":["Mouhcine Assouli","Badr Missaoui"],"pdf_url":"https://arxiv.org/pdf/2301.02877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08919v1","updated":"2023-07-18T01:31:47Z","published":"2023-07-18T01:31:47Z","title":"Accuracy versus time frontiers of semi-supervised and self-supervised\n learning on medical images","summary":" For many applications of classifiers to medical images, a trustworthy label\nfor each image can be difficult or expensive to obtain. In contrast, images\nwithout labels are more readily available. Two major research directions both\npromise that additional unlabeled data can improve classifier performance:\nself-supervised learning pretrains useful representations on unlabeled data\nonly, then fine-tunes a classifier on these representations via the labeled\nset; semi-supervised learning directly trains a classifier on labeled and\nunlabeled data simultaneously. Recent methods from both directions have claimed\nsignificant gains on non-medical tasks, but do not systematically assess\nmedical images and mostly compare only to methods in the same direction. This\nstudy contributes a carefully-designed benchmark to help answer a\npractitioner's key question: given a small labeled dataset and a limited budget\nof hours to spend on training, what gains from additional unlabeled images are\npossible and which methods best achieve them? Unlike previous benchmarks, ours\nuses realistic-sized validation sets to select hyperparameters, assesses\nruntime-performance tradeoffs, and bridges two research fields. By comparing 6\nsemi-supervised methods and 5 self-supervised methods to strong labeled-only\nbaselines on 3 medical datasets with 30-1000 labels per class, we offer\ninsights to resource-constrained, results-focused practitioners: MixMatch,\nSimCLR, and BYOL represent strong choices that were not surpassed by more\nrecent methods. After much effort selecting hyperparameters on one dataset, we\npublish settings that enable strong methods to perform well on new medical\ntasks within a few hours, with further search over dozens of hours delivering\nmodest additional gains.\n","authors":["Zhe Huang","Ruijie Jiang","Shuchin Aeron","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2307.08919v1.pdf","comment":"Semi-supervised Learning; Self-supervised Learning; Medical Imaging"},{"id":"http://arxiv.org/abs/2307.08686v2","updated":"2023-07-18T01:20:01Z","published":"2023-07-17T17:47:50Z","title":"An R package for parametric estimation of causal effects","summary":" This article explains the usage of R package CausalModels, which is publicly\navailable on the Comprehensive R Archive Network. While packages are available\nfor sufficiently estimating causal effects, there lacks a package that provides\na collection of structural models using the conventional statistical approach\ndeveloped by Hernan and Robins (2020). CausalModels addresses this deficiency\nof software in R concerning causal inference by offering tools for methods that\naccount for biases in observational data without requiring extensive\nstatistical knowledge. These methods should not be ignored and may be more\nappropriate or efficient in solving particular problems. 
While implementations\nof these statistical models are distributed among a number of causal packages,\nCausalModels introduces a simple and accessible framework for a consistent\nmodeling pipeline among a variety of statistical methods for estimating causal\neffects in a single R package. It consists of common methods including\nstandardization, IP weighting, G-estimation, outcome regression, instrumental\nvariables and propensity matching.\n","authors":["Joshua Wolff Anderson","Cyril Rakovski"],"pdf_url":"https://arxiv.org/pdf/2307.08686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08913v1","updated":"2023-07-18T01:16:23Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approaches. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v1.pdf","comment":"9 pages,3 figures"},{"id":"http://arxiv.org/abs/2307.07674v2","updated":"2023-07-18T01:11:01Z","published":"2023-07-15T01:17:14Z","title":"An Empirical Study of the Effectiveness of Using a Replay Buffer on Mode\n Discovery in GFlowNets","summary":" Reinforcement Learning (RL) algorithms aim to learn an optimal policy by\niteratively sampling actions to learn how to maximize the total expected\nreturn, $R(x)$. GFlowNets are a special class of algorithms designed to\ngenerate diverse candidates, $x$, from a discrete set, by learning a policy\nthat approximates the proportional sampling of $R(x)$. GFlowNets exhibit\nimproved mode discovery compared to conventional RL algorithms, which is very\nuseful for applications such as drug discovery and combinatorial search.\nHowever, since GFlowNets are a relatively recent class of algorithms, many\ntechniques which are useful in RL have not yet been associated with them. In\nthis paper, we study the utilization of a replay buffer for GFlowNets. 
We\nexplore empirically various replay buffer sampling techniques and assess the\nimpact on the speed of mode discovery and the quality of the modes discovered.\nOur experimental results in the Hypergrid toy domain and a molecule synthesis\nenvironment demonstrate significant improvements in mode discovery when\ntraining with a replay buffer, compared to training only with trajectories\ngenerated on-policy.\n","authors":["Nikhil Vemgal","Elaine Lau","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2307.07674v2.pdf","comment":"Accepted to ICML 2023 workshop on Structured Probabilistic Inference\n & Generative Modeling"},{"id":"http://arxiv.org/abs/2307.08910v1","updated":"2023-07-18T01:02:20Z","published":"2023-07-18T01:02:20Z","title":"Sharpness-Aware Graph Collaborative Filtering","summary":" Graph Neural Networks (GNNs) have achieved impressive performance in\ncollaborative filtering. However, GNNs tend to yield inferior performance when\nthe distributions of training and test data are not aligned well. Also,\ntraining GNNs requires optimizing non-convex neural networks with an abundance\nof local and global minima, which may differ widely in their performance at\ntest time. Thus, it is essential to choose the minima carefully. Here we\npropose an effective training schema, called {gSAM}, under the principle that\nthe \\textit{flatter} minima has a better generalization ability than the\n\\textit{sharper} ones. To achieve this goal, gSAM regularizes the flatness of\nthe weight loss landscape by forming a bi-level optimization: the outer problem\nconducts the standard model training while the inner problem helps the model\njump out of the sharp minima. Experimental results show the superiority of our\ngSAM.\n","authors":["Huiyuan Chen","Chin-Chia Michael Yeh","Yujie Fan","Yan Zheng","Junpeng Wang","Vivian Lai","Mahashweta Das","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09920v2","updated":"2023-07-18T00:39:06Z","published":"2022-07-19T01:25:31Z","title":"DESCN: Deep Entire Space Cross Networks for Individual Treatment Effect\n Estimation","summary":" Causal Inference has wide applications in various areas such as E-commerce\nand precision medicine, and its performance heavily relies on the accurate\nestimation of the Individual Treatment Effect (ITE). Conventionally, ITE is\npredicted by modeling the treated and control response functions separately in\ntheir individual sample spaces. However, such an approach usually encounters\ntwo issues in practice, i.e. divergent distribution between treated and control\ngroups due to treatment bias, and significant sample imbalance of their\npopulation sizes. This paper proposes Deep Entire Space Cross Networks (DESCN)\nto model treatment effects from an end-to-end perspective. DESCN captures the\nintegrated information of the treatment propensity, the response, and the\nhidden treatment effect through a cross network in a multi-task learning\nmanner. Our method jointly learns the treatment and response functions in the\nentire sample space to avoid treatment bias and employs an intermediate pseudo\ntreatment effect prediction network to relieve sample imbalance. Extensive\nexperiments are conducted on a synthetic dataset and a large-scaled production\ndataset from the E-commerce voucher distribution business. The results indicate\nthat DESCN can successfully enhance the accuracy of ITE estimation and improve\nthe uplift ranking performance. 
A sample of the production dataset and the\nsource code are released to facilitate future research in the community, which\nis, to the best of our knowledge, the first large-scale public biased treatment\ndataset for causal inference.\n","authors":["Kailiang Zhong","Fengtong Xiao","Yan Ren","Yaorong Liang","Wenqing Yao","Xiaofeng Yang","Ling Cen"],"pdf_url":"https://arxiv.org/pdf/2207.09920v2.pdf","comment":"Accepted by SIGKDD 2022 Applied Data Science Track"},{"id":"http://arxiv.org/abs/2210.16299v2","updated":"2023-07-18T00:26:30Z","published":"2022-10-28T17:52:18Z","title":"Nonuniqueness and Convergence to Equivalent Solutions in Observer-based\n Inverse Reinforcement Learning","summary":" A key challenge in solving the deterministic inverse reinforcement learning\n(IRL) problem online and in real-time is the existence of multiple solutions.\nNonuniqueness necessitates the study of the notion of equivalent solutions,\ni.e., solutions that result in a different cost functional but same feedback\nmatrix, and convergence to such solutions. While offline algorithms that result\nin convergence to equivalent solutions have been developed in the literature,\nonline, real-time techniques that address nonuniqueness are not available. In\nthis paper, a regularized history stack observer that converges to\napproximately equivalent solutions of the IRL problem is developed. Novel\ndata-richness conditions are developed to facilitate the analysis and\nsimulation results are provided to demonstrate the effectiveness of the\ndeveloped technique.\n","authors":["Jared Town","Zachary Morrison","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2210.16299v2.pdf","comment":"16 pages, 7 figures, submitted to American Controls Conference 2023"},{"id":"http://arxiv.org/abs/2306.08617v2","updated":"2023-07-18T23:59:39Z","published":"2023-06-14T16:23:42Z","title":"Multi-class Graph Clustering via Approximated Effective $p$-Resistance","summary":" This paper develops an approximation to the (effective) $p$-resistance and\napplies it to multi-class clustering. Spectral methods based on the graph\nLaplacian and its generalization to the graph $p$-Laplacian have been a\nbackbone of non-euclidean clustering techniques. The advantage of the\n$p$-Laplacian is that the parameter $p$ induces a controllable bias on cluster\nstructure. The drawback of $p$-Laplacian eigenvector based methods is that the\nthird and higher eigenvectors are difficult to compute. Thus, instead, we are\nmotivated to use the $p$-resistance induced by the $p$-Laplacian for\nclustering. For $p$-resistance, small $p$ biases towards clusters with high\ninternal connectivity while large $p$ biases towards clusters of small\n\"extent,\" that is a preference for smaller shortest-path distances between\nvertices in the cluster. However, the $p$-resistance is expensive to compute.\nWe overcome this by developing an approximation to the $p$-resistance. We prove\nupper and lower bounds on this approximation and observe that it is exact when\nthe graph is a tree. We also provide theoretical justification for the use of\n$p$-resistance for clustering. 
Finally, we provide experiments comparing our\napproximated $p$-resistance clustering to other $p$-Laplacian based methods.\n","authors":["Shota Saito","Mark Herbster"],"pdf_url":"https://arxiv.org/pdf/2306.08617v2.pdf","comment":"Accepted to ICML2023"},{"id":"http://arxiv.org/abs/1912.13122v5","updated":"2023-07-18T23:53:12Z","published":"2019-12-31T00:10:50Z","title":"Declarative Mechanism Design","summary":" Regulation of Multi-Agent Systems (MAS) and Declarative Electronic\nInstitutions (DEIs) was a multidisciplinary research topic of the past decade\ninvolving (Physical and Software) Agents and Law since the beginning, but\nrecently evolved towards News-claimed Robot Lawyer since 2016. One of the\nfirst proposals for restricting the behaviour of Software Agents was Electronic\nInstitutions. However, with the recent reformulation of Artificial Neural\nNetworks (ANNs) as Deep Learning (DL), Security, Privacy, Ethical and Legal\nissues regarding the use of DL have raised concerns in the Artificial\nIntelligence (AI) Community. Now that the Regulation of MAS is almost correctly\naddressed, we propose the Regulation of Artificial Neural Networks as\nAgent-based Training of a special type of regulated Artificial Neural Network\nthat we call Institutional Neural Network (INN). The main purpose of this paper\nis to bring attention to Artificial Teaching (AT) and to give a tentative\nanswer showing a proof-of-concept implementation of Regulated Deep Learning\n(RDL). This paper introduces the former concept and provides sI, a language\npreviously used to declaratively model and extend Electronic Institutions, as a\nmeans to regulate the execution of Artificial Neural Networks and their\ninteractions with Artificial Teachers (ATs).\n","authors":["Andrés García-Camino"],"pdf_url":"https://arxiv.org/pdf/1912.13122v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02479v2","updated":"2023-07-18T23:01:46Z","published":"2023-06-04T20:55:13Z","title":"Contagion Effect Estimation Using Proximal Embeddings","summary":" Contagion effect refers to the causal effect of peers' behavior on the\noutcome of an individual in social networks. While prominent methods for\nestimating contagion effects in observational studies often assume that there\nare no unmeasured confounders, contagion can be confounded due to latent\nhomophily: nodes in a homophilous network tend to have ties to peers with\nsimilar attributes and can behave similarly without influencing one another.\nOne way to account for latent homophily is by considering proxies for the\nunobserved confounders. However, in the presence of high-dimensional proxies,\nproxy-based methods can lead to substantially biased estimation of contagion\neffects, as we demonstrate in this paper. To tackle this issue, we introduce\nthe novel Proximal Embeddings (ProEmb), a framework which integrates\nVariational Autoencoders (VAEs) and adversarial networks to generate balanced\nlow-dimensional representations of high-dimensional proxies for different\ntreatment groups and identifies contagion effects in the presence of unobserved\nnetwork confounders. 
We empirically show that our method significantly\nincreases the accuracy of contagion effect estimation in observational network\ndata compared to state-of-the-art methods.\n","authors":["Zahra Fatemi","Elena Zheleva"],"pdf_url":"https://arxiv.org/pdf/2306.02479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09672v1","updated":"2023-07-18T22:54:51Z","published":"2023-07-18T22:54:51Z","title":"Convex Geometry of ReLU-layers, Injectivity on the Ball and Local\n Reconstruction","summary":" The paper uses a frame-theoretic setting to study the injectivity of a\nReLU-layer on the closed ball of $\\mathbb{R}^n$ and its non-negative part. In\nparticular, the interplay between the radius of the ball and the bias vector is\nemphasized. Together with a perspective from convex geometry, this leads to a\ncomputationally feasible method of verifying the injectivity of a ReLU-layer\nunder reasonable restrictions in terms of an upper bound of the bias vector.\nExplicit reconstruction formulas are provided, inspired by the duality concept\nfrom frame theory. All this gives rise to the possibility of quantifying the\ninvertibility of a ReLU-layer and a concrete reconstruction algorithm for any\ninput vector on the ball.\n","authors":["Daniel Haider","Martin Ehler","Peter Balazs"],"pdf_url":"https://arxiv.org/pdf/2307.09672v1.pdf","comment":"10 pages main paper + 2 pages appendix, 4 figures, 2 algorithms,\n conference"},{"id":"http://arxiv.org/abs/2307.09670v1","updated":"2023-07-18T22:48:54Z","published":"2023-07-18T22:48:54Z","title":"JAZZVAR: A Dataset of Variations found within Solo Piano Performances of\n Jazz Standards for Music Overpainting","summary":" Jazz pianists often uniquely interpret jazz standards. Passages from these\ninterpretations can be viewed as sections of variation. We manually extracted\nsuch variations from solo jazz piano performances. The JAZZVAR dataset is a\ncollection of 502 pairs of Variation and Original MIDI segments. Each Variation\nin the dataset is accompanied by a corresponding Original segment containing\nthe melody and chords from the original jazz standard. Our approach differs\nfrom many existing jazz datasets in the music information retrieval (MIR)\ncommunity, which often focus on improvisation sections within jazz\nperformances. In this paper, we outline the curation process for obtaining and\nsorting the repertoire, the pipeline for creating the Original and Variation\npairs, and our analysis of the dataset. We also introduce a new generative\nmusic task, Music Overpainting, and present a baseline Transformer model\ntrained on the JAZZVAR dataset for this task. Other potential applications of\nour dataset include expressive performance analysis and performer\nidentification.\n","authors":["Eleanor Row","Jingjing Tang","George Fazekas"],"pdf_url":"https://arxiv.org/pdf/2307.09670v1.pdf","comment":"Pre-print accepted for publication at CMMR2023, 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2302.14101v2","updated":"2023-07-18T22:38:04Z","published":"2023-02-27T19:26:13Z","title":"Robust Field-level Likelihood-free Inference with Galaxies","summary":" We train graph neural networks to perform field-level likelihood-free\ninference using galaxy catalogs from state-of-the-art hydrodynamic simulations\nof the CAMELS project. Our models are rotational, translational, and\npermutation invariant and do not impose any cut on scale. 
From galaxy catalogs\nthat only contain $3$D positions and radial velocities of $\\sim 1, 000$\ngalaxies in tiny $(25~h^{-1}{\\rm Mpc})^3$ volumes our models can infer the\nvalue of $\\Omega_{\\rm m}$ with approximately $12$ % precision. More\nimportantly, by testing the models on galaxy catalogs from thousands of\nhydrodynamic simulations, each having a different efficiency of supernova and\nAGN feedback, run with five different codes and subgrid models - IllustrisTNG,\nSIMBA, Astrid, Magneticum, SWIFT-EAGLE -, we find that our models are robust to\nchanges in astrophysics, subgrid physics, and subhalo/galaxy finder.\nFurthermore, we test our models on $1,024$ simulations that cover a vast region\nin parameter space - variations in $5$ cosmological and $23$ astrophysical\nparameters - finding that the model extrapolates really well. Our results\nindicate that the key to building a robust model is the use of both galaxy\npositions and velocities, suggesting that the network have likely learned an\nunderlying physical relation that does not depend on galaxy formation and is\nvalid on scales larger than $\\sim10~h^{-1}{\\rm kpc}$.\n","authors":["Natalí S. M. de Santi","Helen Shao","Francisco Villaescusa-Navarro","L. Raul Abramo","Romain Teyssier","Pablo Villanueva-Domingo","Yueying Ni","Daniel Anglés-Alcázar","Shy Genel","Elena Hernandez-Martinez","Ulrich P. Steinwandel","Christopher C. Lovell","Klaus Dolag","Tiago Castro","Mark Vogelsberger"],"pdf_url":"https://arxiv.org/pdf/2302.14101v2.pdf","comment":"34 pages, 12 figures. For a video summarizing the results, see\n https://youtu.be/b59ep7cyPOs"},{"id":"http://arxiv.org/abs/2307.09668v1","updated":"2023-07-18T22:37:30Z","published":"2023-07-18T22:37:30Z","title":"Towards A Unified Agent with Foundation Models","summary":" Language Models and Vision Language Models have recently demonstrated\nunprecedented capabilities in terms of understanding human intentions,\nreasoning, scene understanding, and planning-like behaviour, in text form,\namong many others. In this work, we investigate how to embed and leverage such\nabilities in Reinforcement Learning (RL) agents. We design a framework that\nuses language as the core reasoning tool, exploring how this enables an agent\nto tackle a series of fundamental RL challenges, such as efficient exploration,\nreusing experience data, scheduling skills, and learning from observations,\nwhich traditionally require separate, vertically designed algorithms. We test\nour method on a sparse-reward simulated robotic manipulation environment, where\na robot needs to stack a set of objects. 
We demonstrate substantial performance\nimprovements over baselines in exploration efficiency and ability to reuse data\nfrom offline datasets, and illustrate how to reuse learned skills to solve\nnovel tasks or imitate videos of human experts.\n","authors":["Norman Di Palo","Arunkumar Byravan","Leonard Hasenclever","Markus Wulfmeier","Nicolas Heess","Martin Riedmiller"],"pdf_url":"https://arxiv.org/pdf/2307.09668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09665v1","updated":"2023-07-18T22:17:07Z","published":"2023-07-18T22:17:07Z","title":"Anticipating Technical Expertise and Capability Evolution in Research\n Communities using Dynamic Graph Transformers","summary":" The ability to anticipate technical expertise and capability evolution trends\nglobally is essential for national and global security, especially in\nsafety-critical domains like nuclear nonproliferation (NN) and rapidly emerging\nfields like artificial intelligence (AI). In this work, we extend traditional\nstatistical relational learning approaches (e.g., link prediction in\ncollaboration networks) and formulate a problem of anticipating technical\nexpertise and capability evolution using dynamic heterogeneous graph\nrepresentations. We develop novel capabilities to forecast collaboration\npatterns, authorship behavior, and technical capability evolution at different\ngranularities (e.g., scientist and institution levels) in two distinct research\nfields. We implement a dynamic graph transformer (DGT) neural architecture,\nwhich pushes the state-of-the-art graph neural network models by (a)\nforecasting heterogeneous (rather than homogeneous) nodes and edges, and (b)\nrelying on both discrete -- and continuous -- time inputs. We demonstrate that\nour DGT models predict collaboration, partnership, and expertise patterns with\n0.26, 0.73, and 0.53 mean reciprocal rank values for AI and 0.48, 0.93, and\n0.22 for NN domains. DGT model performance exceeds the best-performing static\ngraph baseline models by 30-80% across AI and NN domains. Our findings\ndemonstrate that DGT models boost inductive task performance, when previously\nunseen nodes appear in the test data, for the domains with emerging\ncollaboration patterns (e.g., AI). Specifically, models accurately predict\nwhich established scientists will collaborate with early career scientists and\nvice-versa in the AI domain.\n","authors":["Sameera Horawalavithana","Ellyn Ayton","Anastasiya Usenko","Robin Cosbey","Svitlana Volkova"],"pdf_url":"https://arxiv.org/pdf/2307.09665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09661v1","updated":"2023-07-18T22:03:43Z","published":"2023-07-18T22:03:43Z","title":"Physics-based Reduced Order Modeling for Uncertainty Quantification of\n Guided Wave Propagation using Bayesian Optimization","summary":" In the context of digital twins, structural health monitoring (SHM)\nconstitutes the backbone of condition-based maintenance, facilitating the\ninterconnection between virtual and physical assets. Guided wave propagation\n(GWP) is commonly employed for the inspection of structures in SHM. However,\nGWP is sensitive to variations in the material properties of the structure,\nleading to false alarms. In this direction, uncertainty quantification (UQ) is\nregularly applied to improve the reliability of predictions. Computational\nmechanics is a useful tool for the simulation of GWP, and is often applied for\nUQ. 
Even so, the application of UQ methods requires numerous simulations, while\nlarge-scale, transient numerical GWP solutions increase the computational cost.\nReduced order models (ROMs) are commonly employed to provide numerical results\nin a limited amount of time. In this paper, we propose a machine learning\n(ML)-based ROM, referred to as BO-ML-ROM, to decrease the computational time\nrelated to the simulation of the GWP. The ROM is integrated with a Bayesian\noptimization (BO) framework, to adaptively sample the parameters for the ROM\ntraining. The finite element method is used for the simulation of the\nhigh-fidelity models. The formulated ROM is used for forward UQ of the GWP in\nan aluminum plate with varying material properties. To determine the influence\nof each parameter perturbation, a global, variance-based sensitivity analysis\nis implemented based on Sobol' indices. It is shown that Bayesian optimization\noutperforms one-shot sampling methods, both in terms of accuracy and speed-up.\nThe predicted results reveal the efficiency of BO-ML-ROM for GWP and\ndemonstrate its value for UQ.\n","authors":["G. I. Drakoulas","T. V. Gortsas","D. Polyzos"],"pdf_url":"https://arxiv.org/pdf/2307.09661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09660v1","updated":"2023-07-18T22:01:08Z","published":"2023-07-18T22:01:08Z","title":"Neural Priority Queues for Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have shown considerable success in neural\nalgorithmic reasoning. Many traditional algorithms make use of an explicit\nmemory in the form of a data structure. However, there has been limited\nexploration on augmenting GNNs with external memory. In this paper, we present\nNeural Priority Queues, a differentiable analogue to algorithmic priority\nqueues, for GNNs. We propose and motivate a set of desiderata for memory modules,\nshow that Neural PQs exhibit the desiderata, and reason about their use with\nalgorithmic reasoning. This is further demonstrated by empirical results on the\nCLRS-30 dataset. Furthermore, we find the Neural PQs useful in capturing\nlong-range interactions, as empirically shown on a dataset from the Long-Range\nGraph Benchmark.\n","authors":["Rishabh Jain","Petar Veličković","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2307.09660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09653v1","updated":"2023-07-18T21:53:40Z","published":"2023-07-18T21:53:40Z","title":"HAT-CL: A Hard-Attention-to-the-Task PyTorch Library for Continual\n Learning","summary":" Catastrophic forgetting, the phenomenon in which a neural network loses\npreviously obtained knowledge during the learning of new tasks, poses a\nsignificant challenge in continual learning. The Hard-Attention-to-the-Task\n(HAT) mechanism has shown potential in mitigating this problem, but its\npractical implementation has been complicated by issues of usability and\ncompatibility, and a lack of support for existing network reuse. In this paper,\nwe introduce HAT-CL, a user-friendly, PyTorch-compatible redesign of the HAT\nmechanism. HAT-CL not only automates gradient manipulation but also streamlines\nthe transformation of PyTorch modules into HAT modules. It achieves this by\nproviding a comprehensive suite of modules that can be seamlessly integrated\ninto existing architectures. Additionally, HAT-CL offers ready-to-use HAT\nnetworks that are smoothly integrated with the TIMM library. 
Beyond the\nredesign and reimplementation of HAT, we also introduce novel mask manipulation\ntechniques for HAT, which have consistently shown improvements across various\nexperiments. Our work paves the way for a broader application of the HAT\nmechanism, opening up new possibilities in continual learning across diverse\nmodels and applications.\n","authors":["Xiaotian Duan"],"pdf_url":"https://arxiv.org/pdf/2307.09653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09649v1","updated":"2023-07-18T21:39:39Z","published":"2023-07-18T21:39:39Z","title":"Application of BadNets in Spam Filters","summary":" Spam filters are a crucial component of modern email systems, as they help to\nprotect users from unwanted and potentially harmful emails. However, the\neffectiveness of these filters is dependent on the quality of the machine\nlearning models that power them. In this paper, we design backdoor attacks in\nthe domain of spam filtering. By demonstrating the potential vulnerabilities in\nthe machine learning model supply chain, we highlight the need for careful\nconsideration and evaluation of the models used in spam filters. Our results\nshow that the backdoor attacks can be effectively used to identify\nvulnerabilities in spam filters and suggest the need for ongoing monitoring and\nimprovement in this area.\n","authors":["Swagnik Roychoudhury","Akshaj Kumar Veldanda"],"pdf_url":"https://arxiv.org/pdf/2307.09649v1.pdf","comment":"5 pages, 4 figures, submitted to ICDE23 ASTRIDE,\n https://astride-2023.github.io/assets/papers/CameraReady14.pdf"},{"id":"http://arxiv.org/abs/2307.06975v2","updated":"2023-07-18T21:27:25Z","published":"2023-07-13T13:52:41Z","title":"Neuro-symbolic Empowered Denoising Diffusion Probabilistic Models for\n Real-time Anomaly Detection in Industry 4.0","summary":" Industry 4.0 involves the integration of digital technologies, such as IoT,\nBig Data, and AI, into manufacturing and industrial processes to increase\nefficiency and productivity. As these technologies become more interconnected\nand interdependent, Industry 4.0 systems become more complex, which brings the\ndifficulty of identifying and stopping anomalies that may cause disturbances in\nthe manufacturing process. This paper aims to propose a diffusion-based model\nfor real-time anomaly prediction in Industry 4.0 processes. Using a\nneuro-symbolic approach, we integrate industrial ontologies in the model,\nthereby adding formal knowledge on smart manufacturing. Finally, we propose a\nsimple yet effective way of distilling diffusion models through Random Fourier\nFeatures for deployment on an embedded system for direct integration into the\nmanufacturing process. To the best of our knowledge, this approach has never\nbeen explored before.\n","authors":["Luigi Capogrosso","Alessio Mascolini","Federico Girella","Geri Skenderi","Sebastiano Gaiardelli","Nicola Dall'Ora","Francesco Ponzio","Enrico Fraccaroli","Santa Di Cataldo","Sara Vinco","Enrico Macii","Franco Fummi","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2307.06975v2.pdf","comment":"Accepted at the 26th Forum on specification and Design Languages (FDL\n 2023)"},{"id":"http://arxiv.org/abs/2103.15965v3","updated":"2023-07-18T20:59:55Z","published":"2021-03-29T21:40:58Z","title":"Strong Optimal Classification Trees","summary":" Decision trees are among the most popular machine learning models and are\nused routinely in applications ranging from revenue management and medicine to\nbioinformatics. 
In this paper, we consider the problem of learning optimal\nbinary classification trees with univariate splits. Literature on the topic has\nburgeoned in recent years, motivated both by the empirical suboptimality of\nheuristic approaches and the tremendous improvements in mixed-integer\noptimization (MIO) technology. Yet, existing MIO-based approaches from the\nliterature do not leverage the power of MIO to its full extent: they rely on\nweak formulations, resulting in slow convergence and large optimality gaps. To\nfill this gap in the literature, we propose an intuitive flow-based MIO\nformulation for learning optimal binary classification trees. Our formulation\ncan accommodate side constraints to enable the design of interpretable and fair\ndecision trees. Moreover, we show that our formulation has a stronger linear\noptimization relaxation than existing methods in the case of binary data. We\nexploit the decomposable structure of our formulation and max-flow/min-cut\nduality to derive a Benders' decomposition method to speed-up computation. We\npropose a tailored procedure for solving each decomposed subproblem that\nprovably generates facets of the feasible set of the MIO as constraints to add\nto the main problem. We conduct extensive computational experiments on standard\nbenchmark datasets on which we show that our proposed approaches are 29 times\nfaster than state-of-the-art MIO-based techniques and improve out-of-sample\nperformance by up to 8%.\n","authors":["Sina Aghaei","Andrés Gómez","Phebe Vayanos"],"pdf_url":"https://arxiv.org/pdf/2103.15965v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09638v1","updated":"2023-07-18T20:59:52Z","published":"2023-07-18T20:59:52Z","title":"Promoting Exploration in Memory-Augmented Adam using Critical Momenta","summary":" Adaptive gradient-based optimizers, particularly Adam, have left their mark\nin training large-scale deep learning models. The strength of such optimizers\nis that they exhibit fast convergence while being more robust to hyperparameter\nchoice. However, they often generalize worse than non-adaptive methods. Recent\nstudies have tied this performance gap to flat minima selection: adaptive\nmethods tend to find solutions in sharper basins of the loss landscape, which\nin turn hurts generalization. To overcome this issue, we propose a new\nmemory-augmented version of Adam that promotes exploration towards flatter\nminima by using a buffer of critical momentum terms during training.\nIntuitively, the use of the buffer makes the optimizer overshoot outside the\nbasin of attraction if it is not wide enough. We empirically show that our\nmethod improves the performance of several variants of Adam on standard\nsupervised language modelling and image classification tasks.\n","authors":["Pranshu Malviya","Gonçalo Mordido","Aristide Baratin","Reza Babanezhad Harikandeh","Jerry Huang","Simon Lacoste-Julien","Razvan Pascanu","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2307.09638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.06960v2","updated":"2023-07-18T20:41:53Z","published":"2021-07-14T19:45:49Z","title":"MAFAT: Memory-Aware Fusing and Tiling of Neural Networks for Accelerated\n Edge Inference","summary":" A rising research challenge is running costly machine learning (ML) networks\nlocally on resource-constrained edge devices. ML networks with large\nconvolutional layers can easily exceed available memory, increasing latency due\nto excessive OS swapping. 
Previous memory reduction techniques such as pruning\nand quantization reduce model accuracy and often require retraining.\nAlternatively, distributed methods partition the convolutions into equivalent\nsmaller sub-computations, but the implementations introduce communication costs\nand require a network of devices. Distributed partitioning approaches can,\nhowever, also be used to run in a reduced memory footprint on a single device\nby subdividing the network into smaller operations. In this paper, we extend\nprior work on distributed partitioning into a memory-aware execution on a\nsingle device. Our approach extends prior fusing strategies to allow for\nmultiple groups of convolutional layers that are fused and tiled independently.\nThis enables trading off overhead versus data reuse in order to specifically\nreduce the memory footprint. We propose a memory usage predictor coupled with a\nsearch algorithm to provide optimized fusing and tiling configurations for an\narbitrary set of convolutional layers. When applied to the YOLOv2 object\ndetection network, results show that our approach can run in less than half the\nmemory, and with a speedup of up to 2.78 under severe memory constraints.\nAdditionally, our algorithm will return a configuration with a latency that is\nwithin 6% of the best latency measured in a manual search.\n","authors":["Jackson Farley","Andreas Gerstlauer"],"pdf_url":"https://arxiv.org/pdf/2107.06960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09618v2","updated":"2023-07-18T20:39:43Z","published":"2023-06-16T04:18:04Z","title":"Emergent Asymmetry of Precision and Recall for Measuring Fidelity and\n Diversity of Generative Models in High Dimensions","summary":" Precision and Recall are two prominent metrics of generative performance,\nwhich were proposed to separately measure the fidelity and diversity of\ngenerative models. Given their central role in comparing and improving\ngenerative models, understanding their limitations is crucially important. To\nthat end, in this work, we identify a critical flaw in the common approximation\nof these metrics using k-nearest-neighbors, namely, that the very\ninterpretations of fidelity and diversity that are assigned to Precision and\nRecall can fail in high dimensions, resulting in very misleading conclusions.\nSpecifically, we empirically and theoretically show that as the number of\ndimensions grows, two model distributions with supports at equal point-wise\ndistance from the support of the real distribution, can have vastly different\nPrecision and Recall regardless of their respective distributions, hence an\nemergent asymmetry in high dimensions. Based on our theoretical insights, we\nthen provide simple yet effective modifications to these metrics to construct\nsymmetric metrics regardless of the number of dimensions. Finally, we provide\nexperiments on real-world datasets to illustrate that the identified flaw is\nnot merely a pathological case, and that our proposed metrics are effective in\nalleviating its impact.\n","authors":["Mahyar Khayatkhoei","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2306.09618v2.pdf","comment":"To appear in ICML 2023. 
Updated proof in Appendix B"},{"id":"http://arxiv.org/abs/2210.04318v5","updated":"2023-07-18T20:31:51Z","published":"2022-10-09T18:58:24Z","title":"Prediction intervals for neural network models using weighted asymmetric\n loss functions","summary":" We propose a simple and efficient approach to generate a prediction interval\n(PI) for approximated and forecasted trends. Our method leverages a weighted\nasymmetric loss function to estimate the lower and upper bounds of the PI, with\nthe weights determined by its coverage probability. We provide a concise\nmathematical proof of the method, show how it can be extended to derive PIs for\nparametrised functions and discuss its effectiveness when training deep neural\nnetworks. The presented tests of the method on a real-world forecasting task\nusing a neural network-based model show that it can produce reliable PIs in\ncomplex machine learning scenarios.\n","authors":["Milo Grillo","Yunpeng Han","Agnieszka Werpachowska"],"pdf_url":"https://arxiv.org/pdf/2210.04318v5.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.09619v1","updated":"2023-07-18T20:27:45Z","published":"2023-07-18T20:27:45Z","title":"Towards Federated Foundation Models: Scalable Dataset Pipelines for\n Group-Structured Learning","summary":" We introduce a library, Dataset Grouper, to create large-scale\ngroup-structured (e.g., federated) datasets, enabling federated learning\nsimulation at the scale of foundation models. This library allows the creation\nof group-structured versions of existing datasets based on user-specified\npartitions, and directly leads to a variety of useful heterogeneous datasets\nthat can be plugged into existing software frameworks. Dataset Grouper offers\nthree key advantages. First, it scales to settings where even a single group's\ndataset is too large to fit in memory. Second, it provides flexibility, both in\nchoosing the base (non-partitioned) dataset and in defining partitions.\nFinally, it is framework-agnostic. We empirically demonstrate that Dataset\nGrouper allows for large-scale federated language modeling simulations on\ndatasets that are orders of magnitude larger than in previous work. Our\nexperimental results show that algorithms like FedAvg operate more as\nmeta-learning methods than as empirical risk minimization methods at this\nscale, suggesting their utility in downstream personalization and task-specific\nadaptation.\n","authors":["Zachary Charles","Nicole Mitchell","Krishna Pillutla","Michael Reneer","Zachary Garrett"],"pdf_url":"https://arxiv.org/pdf/2307.09619v1.pdf","comment":"Dataset Grouper is available at\n https://github.com/google-research/dataset_grouper"},{"id":"http://arxiv.org/abs/2206.09311v5","updated":"2023-07-18T20:19:49Z","published":"2022-06-19T02:33:14Z","title":"Primal Estimated Subgradient Solver for SVM for Imbalanced\n Classification","summary":" We aim to demonstrate in experiments that our cost sensitive PEGASOS SVM\nachieves good performance on imbalanced data sets with a Majority to Minority\nRatio ranging from 8.6:1 to 130:1 and to ascertain whether including the\nintercept (bias), regularization and parameters affects performance on our\nselection of datasets. Although many resort to SMOTE methods, we aim for a less\ncomputationally intensive method. We evaluate the performance by examining the\nlearning curves. 
These curves diagnose whether we overfit or underfit or\nwhether the random sample of data chosen during the process was not random\nenough or diverse enough in dependent variable class for the algorithm to\ngeneralize to unseen examples. We will also examine how the\nhyperparameters relate to the test and train error in validation curves. We\nbenchmark our PEGASOS Cost-Sensitive SVM's results against Ding's LINEAR SVM DECIDL\nmethod. He obtained an ROC-AUC of 0.5 on one dataset. Our work will extend the\nwork of Ding by incorporating kernels into SVM. We will use Python rather than\nMATLAB as Python has dictionaries for storing mixed data types during\nmulti-parameter cross-validation.\n","authors":["John Sun"],"pdf_url":"https://arxiv.org/pdf/2206.09311v5.pdf","comment":"10 pages, 4 tables, 3 figures"},{"id":"http://arxiv.org/abs/2305.09446v2","updated":"2023-07-18T20:01:42Z","published":"2023-05-16T14:05:30Z","title":"A Probabilistic Transformation of Distance-Based Outliers","summary":" The scores of distance-based outlier detection methods are difficult to\ninterpret, making it challenging to determine a cut-off threshold between\nnormal and outlier data points without additional context. We describe a\ngeneric transformation of distance-based outlier scores into interpretable,\nprobabilistic estimates. The transformation is ranking-stable and increases the\ncontrast between normal and outlier data points. Determining distance\nrelationships between data points is necessary to identify the nearest-neighbor\nrelationships in the data, yet, most of the computed distances are typically\ndiscarded. We show that the distances to other data points can be used to model\ndistance probability distributions and, subsequently, use the distributions to\nturn distance-based outlier scores into outlier probabilities. Our experiments\nshow that the probabilistic transformation does not impact detection\nperformance over numerous tabular and image benchmark datasets but results in\ninterpretable outlier scores with increased contrast between normal and outlier\nsamples. Our work generalizes to a wide range of distance-based outlier\ndetection methods, and because existing distance computations are used, it adds\nno significant computational overhead.\n","authors":["David Muhr","Michael Affenzeller","Josef Küng"],"pdf_url":"https://arxiv.org/pdf/2305.09446v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12549v2","updated":"2023-07-18T20:01:10Z","published":"2022-05-25T07:53:51Z","title":"Learning from time-dependent streaming data with online stochastic\n algorithms","summary":" This paper addresses stochastic optimization in a streaming setting with\ntime-dependent and biased gradient estimates. We analyze several first-order\nmethods, including Stochastic Gradient Descent (SGD), mini-batch SGD, and\ntime-varying mini-batch SGD, along with their Polyak-Ruppert averages. Our\nnon-asymptotic analysis establishes novel heuristics that link dependence,\nbiases, and convexity levels, enabling accelerated convergence. Specifically,\nour findings demonstrate that (i) time-varying mini-batch SGD methods have the\ncapability to break long- and short-range dependence structures, (ii) biased\nSGD methods can achieve comparable performance to their unbiased counterparts,\nand (iii) incorporating Polyak-Ruppert averaging can accelerate the convergence\nof the stochastic optimization algorithms. 
To validate our theoretical\nfindings, we conduct a series of experiments using both simulated and real-life\ntime-dependent data.\n","authors":["Antoine Godichon-Baggioni","Nicklas Werge","Olivier Wintenberger"],"pdf_url":"https://arxiv.org/pdf/2205.12549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09591v1","updated":"2023-07-18T19:56:20Z","published":"2023-07-18T19:56:20Z","title":"Gradient strikes back: How filtering out high frequencies improves\n explanations","summary":" Recent years have witnessed an explosion in the development of novel\nprediction-based attribution methods, which have slowly been supplanting older\ngradient-based methods to explain the decisions of deep neural networks.\nHowever, it is still not clear why prediction-based methods outperform\ngradient-based ones. Here, we start with an empirical observation: these two\napproaches yield attribution maps with very different power spectra, with\ngradient-based methods revealing more high-frequency content than\nprediction-based methods. This observation raises multiple questions: What is\nthe source of this high-frequency information, and does it truly reflect\ndecisions made by the system? Lastly, why would the absence of high-frequency\ninformation in prediction-based methods yield better explainability scores\nalong multiple metrics? We analyze the gradient of three representative visual\nclassification models and observe that it contains noisy information emanating\nfrom high-frequencies. Furthermore, our analysis reveals that the operations\nused in Convolutional Neural Networks (CNNs) for downsampling appear to be a\nsignificant source of this high-frequency content -- suggesting aliasing as a\npossible underlying basis. We then apply an optimal low-pass filter for\nattribution maps and demonstrate that it improves gradient-based attribution\nmethods. We show that (i) removing high-frequency noise yields significant\nimprovements in the explainability scores obtained with gradient-based methods\nacross multiple models -- leading to (ii) a novel ranking of state-of-the-art\nmethods with gradient-based methods at the top. We believe that our results\nwill spur renewed interest in simpler and computationally more efficient\ngradient-based methods for explainability.\n","authors":["Sabine Muzellec","Leo Andeol","Thomas Fel","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09552v1","updated":"2023-07-18T18:59:42Z","published":"2023-07-18T18:59:42Z","title":"Self-Compatibility: Evaluating Causal Discovery without Ground Truth","summary":" As causal ground truth is incredibly rare, causal discovery algorithms are\ncommonly only evaluated on simulated data. This is concerning, given that\nsimulations reflect common preconceptions about generating processes regarding\nnoise distributions, model classes, and more. In this work, we propose a novel\nmethod for falsifying the output of a causal discovery algorithm in the absence\nof ground truth. Our key insight is that while statistical learning seeks\nstability across subsets of data points, causal learning should seek stability\nacross subsets of variables. Motivated by this insight, our method relies on a\nnotion of compatibility between causal graphs learned on different subsets of\nvariables. We prove that detecting incompatibilities can falsify wrongly\ninferred causal relations due to violation of assumptions or errors from finite\nsample effects. 
Although passing such compatibility tests is only a necessary\ncriterion for good performance, we argue that it provides strong evidence for\nthe causal models whenever compatibility entails strong implications for the\njoint distribution. We also demonstrate experimentally that detection of\nincompatibilities can aid in causal model selection.\n","authors":["Philipp M. Faller","Leena Chennuru Vankadara","Atalanti A. Mastakouri","Francesco Locatello","Dominik Janzing"],"pdf_url":"https://arxiv.org/pdf/2307.09552v1.pdf","comment":"28 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.09550v1","updated":"2023-07-18T18:48:54Z","published":"2023-07-18T18:48:54Z","title":"The semantic landscape paradigm for neural networks","summary":" Deep neural networks exhibit a fascinating spectrum of phenomena ranging from\npredictable scaling laws to the unpredictable emergence of new capabilities as\na function of training time, dataset size and network size. Analysis of these\nphenomena has revealed the existence of concepts and algorithms encoded within\nthe learned representations of these networks. While significant strides have\nbeen made in explaining observed phenomena separately, a unified framework for\nunderstanding, dissecting, and predicting the performance of neural networks is\nlacking. Here, we introduce the semantic landscape paradigm, a conceptual and\nmathematical framework that describes the training dynamics of neural networks\nas trajectories on a graph whose nodes correspond to emergent algorithms that\nare intrinsic to the learned representations of the networks. This abstraction\nenables us to describe a wide range of neural network phenomena in terms of\nwell-studied problems in statistical physics. Specifically, we show that\ngrokking and emergence with scale are associated with percolation phenomena,\nand neural scaling laws are explainable in terms of the statistics of random\nwalks on graphs. Finally, we discuss how the semantic landscape paradigm\ncomplements existing theoretical and practical approaches aimed at\nunderstanding and interpreting deep neural networks.\n","authors":["Shreyas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2307.09550v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.09547v1","updated":"2023-07-18T18:46:07Z","published":"2023-07-18T18:46:07Z","title":"DreaMR: Diffusion-driven Counterfactual Explanation for Functional MRI","summary":" Deep learning analyses have offered sensitivity leaps in detection of\ncognitive states from functional MRI (fMRI) measurements across the brain. Yet,\nas deep models perform hierarchical nonlinear transformations on their input,\ninterpreting the association between brain responses and cognitive states is\nchallenging. Among common explanation approaches for deep fMRI classifiers,\nattribution methods show poor specificity and perturbation methods show limited\nplausibility. While counterfactual generation promises to address these\nlimitations, previous methods use variational or adversarial priors that yield\nsuboptimal sample fidelity. Here, we introduce the first diffusion-driven\ncounterfactual method, DreaMR, to enable fMRI interpretation with high\nspecificity, plausibility and fidelity. DreaMR performs diffusion-based\nresampling of an input fMRI sample to alter the decision of a downstream\nclassifier, and then computes the minimal difference between the original and\ncounterfactual samples for explanation. 
Unlike conventional diffusion methods,\nDreaMR leverages a novel fractional multi-phase-distilled diffusion prior to\nimprove sampling efficiency without compromising fidelity, and it employs a\ntransformer architecture to account for long-range spatiotemporal context in\nfMRI scans. Comprehensive experiments on neuroimaging datasets demonstrate the\nsuperior specificity, fidelity and efficiency of DreaMR in sample generation\nover state-of-the-art counterfactual methods for fMRI interpretation.\n","authors":["Hasan Atakan Bedel","Tolga Çukur"],"pdf_url":"https://arxiv.org/pdf/2307.09547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13601v4","updated":"2023-07-18T18:37:06Z","published":"2022-10-24T20:55:21Z","title":"Active Learning for Single Neuron Models with Lipschitz Non-Linearities","summary":" We consider the problem of active learning for single neuron models, also\nsometimes called ``ridge functions'', in the agnostic setting (under\nadversarial label noise). Such models have been shown to be broadly effective\nin modeling physical phenomena, and for constructing surrogate data-driven\nmodels for partial differential equations.\n Surprisingly, we show that for a single neuron model with any Lipschitz\nnon-linearity (such as the ReLU, sigmoid, absolute value, low-degree\npolynomial, among others), strong provable approximation guarantees can be\nobtained using a well-known active learning strategy for fitting \\emph{linear\nfunctions} in the agnostic setting. % -- i.e. for the case when there is no\nnon-linearity. Namely, we can collect samples via statistical \\emph{leverage\nscore sampling}, which has been shown to be near-optimal in other active\nlearning scenarios. We support our theoretical results with empirical\nsimulations showing that our proposed active learning strategy based on\nleverage score sampling outperforms (ordinary) uniform sampling when fitting\nsingle neuron models.\n","authors":["Aarshvi Gajjar","Chinmay Hegde","Christopher Musco"],"pdf_url":"https://arxiv.org/pdf/2210.13601v4.pdf","comment":"Inadvertently submitting an incorrect writeup that does not align\n with the intended content"},{"id":"http://arxiv.org/abs/2307.09542v1","updated":"2023-07-18T18:36:29Z","published":"2023-07-18T18:36:29Z","title":"Can Neural Network Memorization Be Localized?","summary":" Recent efforts at explaining the interplay of memorization and generalization\nin deep overparametrized networks have posited that neural networks\n$\\textit{memorize}$ \"hard\" examples in the final few layers of the model.\nMemorization refers to the ability to correctly predict on $\\textit{atypical}$\nexamples of the training set. In this work, we show that rather than being\nconfined to individual layers, memorization is a phenomenon confined to a small\nset of neurons in various layers of the model. First, via three experimental\nsources of converging evidence, we find that most layers are redundant for the\nmemorization of examples and the layers that contribute to example memorization\nare, in general, not the final layers. The three sources are $\\textit{gradient\naccounting}$ (measuring the contribution to the gradient norms from memorized\nand clean examples), $\\textit{layer rewinding}$ (replacing specific model\nweights of a converged model with previous training checkpoints), and\n$\\textit{retraining}$ (training rewound layers only on clean examples). Second,\nwe ask a more generic question: can memorization be localized\n$\\textit{anywhere}$ in a model? 
We discover that memorization is often confined\nto a small number of neurons or channels (around 5) of the model. Based on\nthese insights, we propose a new form of dropout -- $\\textit{example-tied\ndropout}$ that enables us to direct the memorization of examples to an a priori\ndetermined set of neurons. By dropping out these neurons, we are able to reduce\nthe accuracy on memorized examples from $100\\%\\to3\\%$, while also reducing the\ngeneralization gap.\n","authors":["Pratyush Maini","Michael C. Mozer","Hanie Sedghi","Zachary C. Lipton","J. Zico Kolter","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09542v1.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2307.04988v2","updated":"2023-07-18T18:24:08Z","published":"2023-07-11T02:58:10Z","title":"Benchmarking Bayesian Causal Discovery Methods for Downstream Treatment\n Effect Estimation","summary":" The practical utility of causality in decision-making is widespread and\nbrought about by the intertwining of causal discovery and causal inference.\nNevertheless, a notable gap exists in the evaluation of causal discovery\nmethods, where insufficient emphasis is placed on downstream inference. To\naddress this gap, we evaluate seven established baseline causal discovery\nmethods including a newly proposed method based on GFlowNets, on the downstream\ntask of treatment effect estimation. Through the implementation of a\ndistribution-level evaluation, we offer valuable and unique insights into the\nefficacy of these causal discovery methods for treatment effect estimation,\nconsidering both synthetic and real-world scenarios, as well as low-data\nscenarios. The results of our study demonstrate that some of the algorithms\nstudied are able to effectively capture a wide range of useful and diverse ATE\nmodes, while some tend to learn many low-probability modes which impacts the\n(unrelaxed) recall and precision.\n","authors":["Chris Chinenye Emezue","Alexandre Drouin","Tristan Deleu","Stefan Bauer","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2307.04988v2.pdf","comment":"Peer-Reviewed and Accepted to ICML 2023 Workshop on Structured\n Probabilistic Inference & Generative Modeling"},{"id":"http://arxiv.org/abs/2307.01158v2","updated":"2023-07-18T18:04:08Z","published":"2023-07-03T17:07:18Z","title":"Theory of Mind as Intrinsic Motivation for Multi-Agent Reinforcement\n Learning","summary":" The ability to model the mental states of others is crucial to human social\nintelligence, and can offer similar benefits to artificial agents with respect\nto the social dynamics induced in multi-agent settings. We present a method of\ngrounding semantically meaningful, human-interpretable beliefs within policies\nmodeled by deep networks. We then consider the task of 2nd-order belief\nprediction. We propose that the ability of each agent to predict the beliefs of the\nother agents can be used as an intrinsic reward signal for multi-agent\nreinforcement learning. 
Finally, we present preliminary empirical results in a\nmixed cooperative-competitive environment.\n","authors":["Ini Oguntola","Joseph Campbell","Simon Stepputtis","Katia Sycara"],"pdf_url":"https://arxiv.org/pdf/2307.01158v2.pdf","comment":"To appear at ICML 2023 Workshop on Theory of Mind"}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.02159v4","updated":"2023-07-18T17:23:25Z","published":"2022-07-05T16:26:05Z","title":"Robustness Analysis of Video-Language Models Against Visual and Language\n Perturbations","summary":" Joint visual and language modeling on large-scale datasets has recently shown\ngood progress in multi-modal tasks when compared to single modal learning.\nHowever, robustness of these approaches against real-world perturbations has\nnot been studied. In this work, we perform the first extensive robustness study\nof video-language models against various real-world perturbations. We focus on\ntext-to-video retrieval and propose two large-scale benchmark datasets,\nMSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different\ntext perturbations. The study reveals some interesting initial findings from\nthe studied models: 1) models are generally more susceptible when only video is\nperturbed as opposed to when only text is perturbed, 2) models that are\npre-trained are more robust than those trained from scratch, 3) models attend\nmore to scene and objects rather than motion and action. We hope this study\nwill serve as a benchmark and guide future research in robust video-language\nlearning. The benchmark introduced in this study along with the code and\ndatasets is available at https://bit.ly/3CNOly4.\n","authors":["Madeline C. Schiappa","Shruti Vyas","Hamid Palangi","Yogesh S. Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2207.02159v4.pdf","comment":"NeurIPS 2022 Datasets and Benchmarks Track. This projects webpage is\n located at https://bit.ly/3CNOly4"},{"id":"http://arxiv.org/abs/2307.09312v1","updated":"2023-07-18T14:57:12Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks. In contrast to traditional text-only methods, our approach to\nlabelling a comment as hate speech centers around the holistic analysis of text\nand images. This is done by leveraging graph transformers to capture the\ncontextual relationships in the entire discussion that surrounds a comment,\nwith interwoven fusion layers to combine text and image embeddings instead of\nprocessing different modalities separately. We compare the performance of our\nmodel to baselines that only process text; we also conduct extensive ablation\nstudies. 
We conclude with future work for multimodal solutions to deliver\nsocial value in online contexts, arguing that capturing a holistic view of a\nconversation greatly advances the effort to detect anti-social behavior.\n","authors":["Liam Hebert","Gaurav Sahu","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v1.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2307.08987v1","updated":"2023-07-18T05:57:00Z","published":"2023-07-18T05:57:00Z","title":"AI-assisted Improved Service Provisioning for Low-latency XR over 5G NR","summary":" Extended Reality (XR) is one of the most important 5G/6G media applications\nthat will fundamentally transform human interactions. However, ensuring low\nlatency, high data rate, and reliability to support XR services poses\nsignificant challenges. This letter presents a novel AI-assisted service\nprovisioning scheme that leverages predicted frames for processing rather than\nrelying solely on actual frames. This method virtually increases the network\ndelay budget and consequently improves service provisioning, albeit at the\nexpense of minor prediction errors. The proposed scheme is validated by\nextensive simulations demonstrating a multi-fold increase in supported XR users\nand also provides crucial network design insights.\n","authors":["Moyukh Laha","Dibbendu Roy","Sourav Dutta","Goutam Das"],"pdf_url":"https://arxiv.org/pdf/2307.08987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11100v1","updated":"2023-07-18T02:20:46Z","published":"2023-07-18T02:20:46Z","title":"CSSL-RHA: Contrastive Self-Supervised Learning for Robust Handwriting\n Authentication","summary":" Handwriting authentication is a valuable tool used in various fields, such as\nfraud prevention and cultural heritage protection. However, it remains a\nchallenging task due to the complex features, severe damage, and lack of\nsupervision. In this paper, we propose a novel Contrastive Self-Supervised\nLearning framework for Robust Handwriting Authentication (CSSL-RHA) to address\nthese issues. It can dynamically learn complex yet important features and\naccurately predict writer identities. Specifically, to remove the negative\neffects of imperfections and redundancy, we design an information-theoretic\nfilter for pre-processing and propose a novel adaptive matching scheme to\nrepresent images as patches of local regions dominated by more important\nfeatures. Through online optimization at inference time, the most informative\npatch embeddings are identified as the \"most important\" elements. Furthermore,\nwe employ contrastive self-supervised training with a momentum-based paradigm\nto learn more general statistical structures of handwritten data without\nsupervision. We conduct extensive experiments on five benchmark datasets and\nour manually annotated dataset EN-HA, which demonstrate the superiority of our\nCSSL-RHA compared to baselines. 
Additionally, we show that our proposed model\ncan still effectively achieve authentication even under abnormal circumstances,\nsuch as data falsification and corruption.\n","authors":["Jingyao Wang","Luntian Mou","Changwen Zheng","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2307.11100v1.pdf","comment":"10 pages, 4 figures, 3 tables, submitted to ACM MM 2023"}]},"2023-07-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.10172v1","updated":"2023-07-19T17:57:53Z","published":"2023-07-19T17:57:53Z","title":"DialogStudio: Towards Richest and Most Diverse Unified Dataset\n Collection for Conversational AI","summary":" Despite advancements in conversational AI, language models encounter\nchallenges to handle diverse conversational tasks, and existing dialogue\ndataset collections often lack diversity and comprehensiveness. To tackle these\nissues, we introduce DialogStudio: the largest and most diverse collection of\ndialogue datasets, unified under a consistent format while preserving their\noriginal information. Our collection encompasses data from open-domain\ndialogues, task-oriented dialogues, natural language understanding,\nconversational recommendation, dialogue summarization, and knowledge-grounded\ndialogues, making it an incredibly rich and diverse resource for dialogue\nresearch and model training. To further enhance the utility of DialogStudio, we\nidentify the licenses for each dataset and design domain-aware prompts for\nselected dialogues to facilitate instruction-aware fine-tuning. Furthermore, we\ndevelop conversational AI models using the dataset collection, and our\nexperiments in both zero-shot and few-shot learning scenarios demonstrate the\nsuperiority of DialogStudio. To improve transparency and support dataset and\ntask-based research, as well as language model pre-training, all datasets,\nlicenses, codes, and models associated with DialogStudio are made publicly\naccessible at https://github.com/salesforce/DialogStudio\n","authors":["Jianguo Zhang","Kun Qian","Zhiwei Liu","Shelby Heinecke","Rui Meng","Ye Liu","Zhou Yu","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.10172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10169v1","updated":"2023-07-19T17:55:13Z","published":"2023-07-19T17:55:13Z","title":"Challenges and Applications of Large Language Models","summary":" Large Language Models (LLMs) went from non-existent to ubiquitous in the\nmachine learning discourse within a few years. Due to the fast pace of the\nfield, it is difficult to identify the remaining challenges and already\nfruitful application areas. In this paper, we aim to establish a systematic set\nof open problems and application successes so that ML researchers can\ncomprehend the field's current state more quickly and become productive.\n","authors":["Jean Kaddour","Joshua Harris","Maximilian Mozes","Herbie Bradley","Roberta Raileanu","Robert McHardy"],"pdf_url":"https://arxiv.org/pdf/2307.10169v1.pdf","comment":"72 pages. v01. Work in progress. Feedback and comments are highly\n appreciated!"},{"id":"http://arxiv.org/abs/2307.10168v1","updated":"2023-07-19T17:54:43Z","published":"2023-07-19T17:54:43Z","title":"LLMs as Workers in Human-Computational Algorithms? Replicating\n Crowdsourcing Pipelines with LLMs","summary":" LLMs have shown promise in replicating human-like behavior in crowdsourcing\ntasks that were previously thought to be exclusive to human abilities. However,\ncurrent efforts focus mainly on simple atomic tasks. 
We explore whether LLMs\ncan replicate more complex crowdsourcing pipelines. We find that modern LLMs\ncan simulate some of crowdworkers' abilities in these \"human computation\nalgorithms,\" but the level of success is variable and influenced by requesters'\nunderstanding of LLM capabilities, the specific skills required for sub-tasks,\nand the optimal interaction modality for performing these sub-tasks. We reflect\non human and LLMs' different sensitivities to instructions, stress the\nimportance of enabling human-facing safeguards for LLMs, and discuss the\npotential of training humans and LLMs with complementary skill sets. Crucially,\nwe show that replicating crowdsourcing pipelines offers a valuable platform to\ninvestigate (1) the relative strengths of LLMs on different tasks (by\ncross-comparing their performances on sub-tasks) and (2) LLMs' potential in\ncomplex tasks, where they can complete part of the tasks while leaving others\nto humans.\n","authors":["Tongshuang Wu","Haiyi Zhu","Maya Albayrak","Alexis Axon","Amanda Bertsch","Wenxing Deng","Ziqi Ding","Bill Guo","Sireesh Gururaja","Tzu-Sheng Kuo","Jenny T. Liang","Ryan Liu","Ihita Mandal","Jeremiah Milbauer","Xiaolin Ni","Namrata Padmanabhan","Subhashini Ramkumar","Alexis Sudjianto","Jordan Taylor","Ying-Jui Tseng","Patricia Vaidos","Zhijin Wu","Wei Wu","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10156v1","updated":"2023-07-19T17:37:03Z","published":"2023-07-19T17:37:03Z","title":"Exploring Transformer Extrapolation","summary":" Length extrapolation has attracted considerable attention recently since it\nallows transformers to be tested on longer sequences than those used in\ntraining. Previous research has shown that this property can be attained by\nusing carefully designed Relative Positional Encodings (RPEs). While these\nmethods perform well on a variety of corpora, the conditions for length\nextrapolation have yet to be investigated. This paper attempts to determine\nwhat types of RPEs allow for length extrapolation through a thorough\nmathematical and empirical analysis. We discover that a transformer is certain\nto possess this property as long as the series that corresponds to the RPE's\nexponential converges. Two practices are derived from the conditions and\nexamined in language modeling tasks on a variety of corpora. As a bonus from\nthe conditions, we derive a new Theoretical Receptive Field (TRF) to measure\nthe receptive field of RPEs without taking any training steps. Extensive\nexperiments are conducted on the Wikitext-103, Books, Github, and WikiBook\ndatasets to demonstrate the viability of our discovered conditions. We also\ncompare TRF to Empirical Receptive Field (ERF) across different models, showing\nconsistently matched trends on the aforementioned datasets. The code is\navailable at https://github.com/OpenNLPLab/Rpe.\n","authors":["Zhen Qin","Yiran Zhong","Hui Deng"],"pdf_url":"https://arxiv.org/pdf/2307.10156v1.pdf","comment":"Zhen Qin and Yiran Zhong contribute equally to this paper; Yiran\n Zhong is the corresponding author. 
The code is available at\n https://github.com/OpenNLPLab/Rpe"},{"id":"http://arxiv.org/abs/2307.09288v2","updated":"2023-07-19T17:08:59Z","published":"2023-07-18T14:31:57Z","title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","summary":" In this work, we develop and release Llama 2, a collection of pretrained and\nfine-tuned large language models (LLMs) ranging in scale from 7 billion to 70\nbillion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for\ndialogue use cases. Our models outperform open-source chat models on most\nbenchmarks we tested, and based on our human evaluations for helpfulness and\nsafety, may be a suitable substitute for closed-source models. We provide a\ndetailed description of our approach to fine-tuning and safety improvements of\nLlama 2-Chat in order to enable the community to build on our work and\ncontribute to the responsible development of LLMs.\n","authors":["Hugo Touvron","Louis Martin","Kevin Stone","Peter Albert","Amjad Almahairi","Yasmine Babaei","Nikolay Bashlykov","Soumya Batra","Prajjwal Bhargava","Shruti Bhosale","Dan Bikel","Lukas Blecher","Cristian Canton Ferrer","Moya Chen","Guillem Cucurull","David Esiobu","Jude Fernandes","Jeremy Fu","Wenyin Fu","Brian Fuller","Cynthia Gao","Vedanuj Goswami","Naman Goyal","Anthony Hartshorn","Saghar Hosseini","Rui Hou","Hakan Inan","Marcin Kardas","Viktor Kerkez","Madian Khabsa","Isabel Kloumann","Artem Korenev","Punit Singh Koura","Marie-Anne Lachaux","Thibaut Lavril","Jenya Lee","Diana Liskovich","Yinghai Lu","Yuning Mao","Xavier Martinet","Todor Mihaylov","Pushkar Mishra","Igor Molybog","Yixin Nie","Andrew Poulton","Jeremy Reizenstein","Rashi Rungta","Kalyan Saladi","Alan Schelten","Ruan Silva","Eric Michael Smith","Ranjan Subramanian","Xiaoqing Ellen Tan","Binh Tang","Ross Taylor","Adina Williams","Jian Xiang Kuan","Puxin Xu","Zheng Yan","Iliyan Zarov","Yuchen Zhang","Angela Fan","Melanie Kambadur","Sharan Narang","Aurelien Rodriguez","Robert Stojnic","Sergey Edunov","Thomas Scialom"],"pdf_url":"https://arxiv.org/pdf/2307.09288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10098v1","updated":"2023-07-19T16:13:13Z","published":"2023-07-19T16:13:13Z","title":"Gradient Sparsification For Masked Fine-Tuning of Transformers","summary":" Fine-tuning pretrained self-supervised language models is widely adopted for\ntransfer learning to downstream tasks. Fine-tuning can be achieved by freezing\ngradients of the pretrained network and only updating gradients of a newly\nadded classification layer, or by performing gradient updates on all\nparameters. Gradual unfreezing makes a trade-off between the two by gradually\nunfreezing gradients of whole layers during training. This has been an\neffective strategy to trade-off between storage and training speed with\ngeneralization performance. However, it is not clear whether gradually\nunfreezing layers throughout training is optimal, compared to sparse variants\nof gradual unfreezing which may improve fine-tuning performance. In this paper,\nwe propose to stochastically mask gradients to regularize pretrained language\nmodels for improving overall fine-tuned performance. We introduce GradDrop and\nvariants thereof, a class of gradient sparsification methods that mask\ngradients during the backward pass, acting as gradient noise. GradDrop is\nsparse and stochastic unlike gradual freezing. 
Extensive experiments on the\nmultilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive\nagainst methods that use additional translated data for intermediate\npretraining and outperforms standard fine-tuning and gradual unfreezing. A\npost-analysis shows how GradDrop improves performance with languages it was not\ntrained on, such as under-resourced languages.\n","authors":["James O' Neill","Sourav Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10098v1.pdf","comment":"Accepted to IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.10088v1","updated":"2023-07-19T15:57:24Z","published":"2023-07-19T15:57:24Z","title":"Android in the Wild: A Large-Scale Dataset for Android Device Control","summary":" There is a growing interest in device-control systems that can interpret\nhuman natural language instructions and execute them on a digital device by\ndirectly controlling its user interface. We present a dataset for\ndevice-control research, Android in the Wild (AITW), which is orders of\nmagnitude larger than current datasets. The dataset contains human\ndemonstrations of device interactions, including the screens and actions, and\ncorresponding natural language instructions. It consists of 715k episodes\nspanning 30k unique instructions, four versions of Android (v10-13),and eight\ndevice types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It\ncontains multi-step tasks that require semantic understanding of language and\nvisual context. This dataset poses a new challenge: actions available through\nthe user interface must be inferred from their visual appearance. And, instead\nof simple UI element-based actions, the action space consists of precise\ngestures (e.g., horizontal scrolls to operate carousel widgets). We organize\nour dataset to encourage robustness analysis of device-control systems, i.e.,\nhow well a system performs in the presence of new task descriptions, new\napplications, or new platform versions. We develop two agents and report\nperformance across the dataset. The dataset is available at\nhttps://github.com/google-research/google-research/tree/master/android_in_the_wild.\n","authors":["Christopher Rawles","Alice Li","Daniel Rodriguez","Oriana Riva","Timothy Lillicrap"],"pdf_url":"https://arxiv.org/pdf/2307.10088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11596v3","updated":"2023-07-19T15:25:37Z","published":"2023-01-27T08:45:53Z","title":"ThoughtSource: A central hub for large language model reasoning data","summary":" Large language models (LLMs) such as GPT-4 have recently demonstrated\nimpressive results across a wide range of tasks. LLMs are still limited,\nhowever, in that they frequently fail at complex reasoning, their reasoning\nprocesses are opaque, they are prone to 'hallucinate' facts, and there are\nconcerns about their underlying biases. Letting models verbalize reasoning\nsteps as natural language, a technique known as chain-of-thought prompting, has\nrecently been proposed as a way to address some of these issues. Here we\npresent ThoughtSource, a meta-dataset and software library for chain-of-thought\n(CoT) reasoning. The goal of ThoughtSource is to improve future artificial\nintelligence systems by facilitating qualitative understanding of CoTs,\nenabling empirical evaluations, and providing training data. 
This first release\nof ThoughtSource integrates six scientific/medical, three general-domain and\nfive math word question answering datasets.\n","authors":["Simon Ott","Konstantin Hebenstreit","Valentin Liévin","Christoffer Egeberg Hother","Milad Moradi","Maximilian Mayrhauser","Robert Praas","Ole Winther","Matthias Samwald"],"pdf_url":"https://arxiv.org/pdf/2301.11596v3.pdf","comment":"Revision: added datasets, minor restructuring"},{"id":"http://arxiv.org/abs/2307.10025v1","updated":"2023-07-19T15:09:50Z","published":"2023-07-19T15:09:50Z","title":"An Empirical Study on Fertility Proposals Using Multi-Grined Topic\n Analysis Methods","summary":" Fertility issues are closely related to population security, in 60 years\nChina's population for the first time in a negative growth trend, the change of\nfertility policy is of great concern to the community. 2023 ``two sessions\"\nproposal ``suggests that the country in the form of legislation, the birth of\nthe registration of the cancellation of the marriage restriction\" This topic\nwas once a hot topic on the Internet, and ``unbundling\" the relationship\nbetween birth registration and marriage has become the focus of social debate.\nIn this paper, we adopt co-occurrence semantic analysis, topic analysis and\nsentiment analysis to conduct multi-granularity semantic analysis of microblog\ncomments. It is found that the discussion on the proposal of ``removing\nmarriage restrictions from birth registration\" involves the individual, society\nand the state at three dimensions, and is detailed into social issues such as\npersonal behaviour, social ethics and law, and national policy, with people's\nsentiment inclined to be negative in most of the topics. Based on this, eight\nproposals were made to provide a reference for governmental decision making and\nto form a reference method for researching public opinion on political issues.\n","authors":["Yulin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.10025v1.pdf","comment":"7 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09456v2","updated":"2023-07-19T14:27:57Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SRGAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SRGAN\n(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN\nand EDSR, on a benchmark dataset of real-world images which undergo degradation\nusing a pipeline. Our results show that some models seem to significantly\nincrease the resolution of the input images while preserving their visual\nquality, this is assessed using Tesseract OCR engine. We observe that EDSR-BASE\nmodel from huggingface outperforms the remaining candidate models in terms of\nboth quantitative metrics and subjective visual quality assessments with least\ncompute overhead. Specifically, EDSR generates images with higher peak\nsignal-to-noise ratio (PSNR) and structural similarity index (SSIM) values and\nare seen to return high quality OCR results with Tesseract OCR engine. 
These\nfindings suggest that EDSR is a robust and effective approach for single-image\nsuper-resolution and may be particularly well-suited for applications where\nhigh-quality visual fidelity is critical and compute must be optimized.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Norris","Aditya Dangi"],"pdf_url":"https://arxiv.org/pdf/2307.09456v2.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2307.09998v1","updated":"2023-07-19T14:13:02Z","published":"2023-07-19T14:13:02Z","title":"Generating Mathematical Derivations with Large Language Models","summary":" The derivation of mathematical results in specialised fields using Large\nLanguage Models (LLMs) is an emerging research direction that can help identify\nmodels' limitations, and potentially support mathematical discovery. In this\npaper, we leverage a symbolic engine to generate derivations of equations at\nscale, and investigate the capabilities of LLMs when deriving goal equations\nfrom premises. Specifically, we employ in-context learning for GPT and\nfine-tune a range of T5 models to compare the robustness and generalisation of\npre-training strategies to specialised models. Empirical results show that\nfine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and\nout-of-distribution test sets in terms of absolute performance. However, an\nin-depth analysis reveals that the fine-tuned models are more sensitive to\nperturbations involving unseen symbols and (to a lesser extent) changes to\nequation structure. In addition, we analyse 1.7K equations and over 200\nderivations to highlight common reasoning errors such as the inclusion of\nincorrect, irrelevant, and redundant equations, along with the tendency to skip\nderivation steps. Finally, we explore the suitability of existing metrics for\nevaluating mathematical derivations, finding evidence that, while they capture\ngeneral properties such as sensitivity to perturbations, they fail to highlight\nfine-grained reasoning errors and essential differences between models.\nOverall, this work demonstrates that training models on synthetic data can\nimprove their mathematical capabilities beyond larger architectures.\n","authors":["Jordan Meadows","Marco Valentino","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2307.09998v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2303.15056v2","updated":"2023-07-19T14:10:55Z","published":"2023-03-27T09:59:48Z","title":"ChatGPT Outperforms Crowd-Workers for Text-Annotation Tasks","summary":" Many NLP applications require manual data annotations for a variety of tasks,\nnotably to train classifiers or evaluate the performance of unsupervised\nmodels. Depending on the size and degree of complexity, the tasks may be\nconducted by crowd-workers on platforms such as MTurk as well as trained\nannotators, such as research assistants. Using a sample of 2,382 tweets, we\ndemonstrate that ChatGPT outperforms crowd-workers for several annotation\ntasks, including relevance, stance, topics, and frames detection. Specifically,\nthe zero-shot accuracy of ChatGPT exceeds that of crowd-workers for four out of\nfive tasks, while ChatGPT's intercoder agreement exceeds that of both\ncrowd-workers and trained annotators for all tasks. Moreover, the\nper-annotation cost of ChatGPT is less than $0.003 -- about twenty times\ncheaper than MTurk. 
These results show the potential of large language models\nto drastically increase the efficiency of text classification.\n","authors":["Fabrizio Gilardi","Meysam Alizadeh","Maël Kubli"],"pdf_url":"https://arxiv.org/pdf/2303.15056v2.pdf","comment":"Gilardi, Fabrizio, Meysam Alizadeh, and Ma\\\"el Kubli. 2023. \"ChatGPT\n Outperforms Crowd Workers for Text-Annotation Tasks\". Proceedings of the\n National Academy of Sciences 120(30): e2305016120"},{"id":"http://arxiv.org/abs/2210.14037v2","updated":"2023-07-19T13:43:07Z","published":"2022-10-25T14:13:53Z","title":"Revisiting Softmax for Uncertainty Approximation in Text Classification","summary":" Uncertainty approximation in text classification is an important area with\napplications in domain adaptation and interpretability. One of the most widely\nused uncertainty approximation methods is Monte Carlo (MC) Dropout, which is\ncomputationally expensive as it requires multiple forward passes through the\nmodel. A cheaper alternative is to simply use the softmax based on a single\nforward pass without dropout to estimate model uncertainty. However, prior work\nhas indicated that these predictions tend to be overconfident. In this paper,\nwe perform a thorough empirical analysis of these methods on five datasets with\ntwo base neural architectures in order to identify the trade-offs between the\ntwo. We compare both softmax and an efficient version of MC Dropout on their\nuncertainty approximations and downstream text classification performance,\nwhile weighing their runtime (cost) against performance (benefit). We find\nthat, while MC dropout produces the best uncertainty approximations, using a\nsimple softmax leads to competitive and in some cases better uncertainty\nestimation for text classification at a much lower computational cost,\nsuggesting that softmax can in fact be a sufficient uncertainty estimate when\ncomputational resources are a concern.\n","authors":["Andreas Nugaard Holm","Dustin Wright","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2210.14037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09959v1","updated":"2023-07-19T13:01:03Z","published":"2023-07-19T13:01:03Z","title":"GUIDO: A Hybrid Approach to Guideline Discovery & Ordering from Natural\n Language Texts","summary":" Extracting workflow nets from textual descriptions can be used to simplify\nguidelines or formalize textual descriptions of formal processes like business\nprocesses and algorithms. The task of manually extracting processes, however,\nrequires domain expertise and effort. While automatic process model extraction\nis desirable, annotating texts with formalized process models is expensive.\nTherefore, there are only a few machine-learning-based extraction approaches.\nRule-based approaches, in turn, require domain specificity to work well and can\nrarely distinguish relevant and irrelevant information in textual descriptions.\nIn this paper, we present GUIDO, a hybrid approach to the process model\nextraction task that first, classifies sentences regarding their relevance to\nthe process model, using a BERT-based sentence classifier, and second, extracts\na process model from the sentences classified as relevant, using dependency\nparsing. The presented approach achieves significantly better results than a\npure rule-based approach. GUIDO achieves an average behavioral similarity score\nof $0.93$. 
Still, in comparison to purely machine-learning-based approaches,\nthe annotation costs stay low.\n","authors":["Nils Freyer","Dustin Thewes","Matthias Meinecke"],"pdf_url":"https://arxiv.org/pdf/2307.09959v1.pdf","comment":"Preprint of the short paper presented at the 12th International\n Conference on Data Science, Technology and Applications"},{"id":"http://arxiv.org/abs/2307.02486v2","updated":"2023-07-19T12:25:35Z","published":"2023-07-05T17:59:38Z","title":"LongNet: Scaling Transformers to 1,000,000,000 Tokens","summary":" Scaling sequence length has become a critical demand in the era of large\nlanguage models. However, existing methods struggle with either computational\ncomplexity or model expressivity, rendering the maximum sequence length\nrestricted. To address this issue, we introduce LongNet, a Transformer variant\nthat can scale sequence length to more than 1 billion tokens, without\nsacrificing the performance on shorter sequences. Specifically, we propose\ndilated attention, which expands the attentive field exponentially as the\ndistance grows. LongNet has significant advantages: 1) it has a linear\ncomputation complexity and a logarithm dependency between any two tokens in a\nsequence; 2) it can be served as a distributed trainer for extremely long\nsequences; 3) its dilated attention is a drop-in replacement for standard\nattention, which can be seamlessly integrated with the existing\nTransformer-based optimization. Experiments results demonstrate that LongNet\nyields strong performance on both long-sequence modeling and general language\ntasks. Our work opens up new possibilities for modeling very long sequences,\ne.g., treating a whole corpus or even the entire Internet as a sequence.\n","authors":["Jiayu Ding","Shuming Ma","Li Dong","Xingxing Zhang","Shaohan Huang","Wenhui Wang","Nanning Zheng","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.02486v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2307.09923v1","updated":"2023-07-19T11:54:46Z","published":"2023-07-19T11:54:46Z","title":"Large Language Models can accomplish Business Process Management Tasks","summary":" Business Process Management (BPM) aims to improve organizational activities\nand their outcomes by managing the underlying processes. To achieve this, it is\noften necessary to consider information from various sources, including\nunstructured textual documents. Therefore, researchers have developed several\nBPM-specific solutions that extract information from textual documents using\nNatural Language Processing techniques. These solutions are specific to their\nrespective tasks and cannot accomplish multiple process-related problems as a\ngeneral-purpose instrument. However, in light of the recent emergence of Large\nLanguage Models (LLMs) with remarkable reasoning capabilities, such a\ngeneral-purpose instrument with multiple applications now appears attainable.\nIn this paper, we illustrate how LLMs can accomplish text-related BPM tasks by\napplying a specific LLM to three exemplary tasks: mining imperative process\nmodels from textual descriptions, mining declarative process models from\ntextual descriptions, and assessing the suitability of process tasks from\ntextual descriptions for robotic process automation. 
We show that, without\nextensive configuration or prompt engineering, LLMs perform comparably to or\nbetter than existing solutions and discuss implications for future BPM research\nas well as practical usage.\n","authors":["Michael Grohs","Luka Abb","Nourhan Elsayed","Jana-Rebecca Rehse"],"pdf_url":"https://arxiv.org/pdf/2307.09923v1.pdf","comment":"Accepted at NLP4BPM workshop at BPM 2023"},{"id":"http://arxiv.org/abs/2307.09885v1","updated":"2023-07-19T10:28:59Z","published":"2023-07-19T10:28:59Z","title":"Test-takers have a say: understanding the implications of the use of AI\n in language tests","summary":" Language tests measure a person's ability to use a language in terms of\nlistening, speaking, reading, or writing. Such tests play an integral role in\nacademic, professional, and immigration domains, with entities such as\neducational institutions, professional accreditation bodies, and governments\nusing them to assess candidate language proficiency. Recent advances in\nArtificial Intelligence (AI) and the discipline of Natural Language Processing\nhave prompted language test providers to explore AI's potential applicability\nwithin language testing, leading to transformative activity patterns\nsurrounding language instruction and learning. However, with concerns over AI's\ntrustworthiness, it is imperative to understand the implications of integrating\nAI into language testing. This knowledge will enable stakeholders to make\nwell-informed decisions, thus safeguarding community well-being and testing\nintegrity. To understand the concerns and effects of AI usage in language\ntests, we conducted interviews and surveys with English test-takers. To the\nbest of our knowledge, this is the first empirical study aimed at identifying\nthe implications of AI adoption in language tests from a test-taker\nperspective. Our study reveals test-taker perceptions and behavioral patterns.\nSpecifically, we identify that AI integration may enhance perceptions of\nfairness, consistency, and availability. Conversely, it might incite mistrust\nregarding reliability and interactivity aspects, subsequently influencing the\nbehaviors and well-being of test-takers. These insights provide a better\nunderstanding of potential societal implications and assist stakeholders in\nmaking informed decisions concerning AI usage in language testing.\n","authors":["Dawen Zhang","Thong Hoang","Shidong Pan","Yongquan Hu","Zhenchang Xing","Mark Staples","Xiwei Xu","Qinghua Lu","Aaron Quigley"],"pdf_url":"https://arxiv.org/pdf/2307.09885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09416v2","updated":"2023-07-19T08:27:50Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. 
to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v2.pdf","comment":"Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track)"},{"id":"http://arxiv.org/abs/2307.09813v1","updated":"2023-07-19T08:02:20Z","published":"2023-07-19T08:02:20Z","title":"DAPrompt: Deterministic Assumption Prompt Learning for Event Causality\n Identification","summary":" Event Causality Identification (ECI) aims at determining whether there is a\ncausal relation between two event mentions. Conventional prompt learning\ndesigns a prompt template to first predict an answer word and then maps it to\nthe final decision. Unlike conventional prompts, we argue that predicting an\nanswer word may not be a necessary prerequisite for the ECI task. Instead, we\ncan first make a deterministic assumption on the existence of causal relation\nbetween two events and then evaluate its rationality to either accept or reject\nthe assumption. The design motivation is to try the most utilization of the\nencyclopedia-like knowledge embedded in a pre-trained language model. In light\nof such considerations, we propose a deterministic assumption prompt learning\nmodel, called DAPrompt, for the ECI task. In particular, we design a simple\ndeterministic assumption template concatenating with the input event pair,\nwhich includes two masks as predicted events' tokens. We use the probabilities\nof predicted events to evaluate the assumption rationality for the final event\ncausality decision. Experiments on the EventStoryLine corpus and\nCausal-TimeBank corpus validate our design objective in terms of significant\nperformance improvements over the state-of-the-art algorithms.\n","authors":["Wei Xiang","Chuanhong Zhan","Bang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09793v1","updated":"2023-07-19T07:17:43Z","published":"2023-07-19T07:17:43Z","title":"On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large\n Language Models","summary":" Since late 2022, Large Language Models (LLMs) have become very prominent with\nLLMs like ChatGPT and Bard receiving millions of users. Hundreds of new LLMs\nare announced each week, many of which are deposited to Hugging Face, a\nrepository of machine learning models and datasets. To date, nearly 16,000 Text\nGeneration models have been uploaded to the site. Given the huge influx of\nLLMs, it is of interest to know which LLM backbones, settings, training\nmethods, and families are popular or trending. However, there is no\ncomprehensive index of LLMs available. 
We take advantage of the relatively\nsystematic nomenclature of Hugging Face LLMs to perform hierarchical clustering\nand identify communities amongst LLMs using n-grams and term frequency-inverse\ndocument frequency. Our methods successfully identify families of LLMs and\naccurately cluster LLMs into meaningful subgroups. We present a public web\napplication to navigate and explore Constellation, our atlas of 15,821 LLMs.\nConstellation rapidly generates a variety of visualizations, namely\ndendrograms, graphs, word clouds, and scatter plots. Constellation is available\nat the following link: https://constellation.sites.stanford.edu/.\n","authors":["Sarah Gao","Andrew Kean Gao"],"pdf_url":"https://arxiv.org/pdf/2307.09793v1.pdf","comment":"14 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09782v1","updated":"2023-07-19T06:58:03Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01692v4","updated":"2023-07-19T06:48:35Z","published":"2022-12-03T21:14:32Z","title":"Can In-context Learners Learn a Reasoning Concept from Demonstrations?","summary":" Language models exhibit an emergent ability to learn a new task from a small\nnumber of input-output demonstrations. 
However, recent work shows that\nin-context learners largely rely on their pre-trained knowledge, such as the\nsentiment of the labels, instead of learning new associations from the input.\nWe argue that the commonly-used few-shot evaluation using a random selection of\nin-context demonstrations can not disentangle models' reliance on such biases,\nas most of the randomly-selected demonstrations do not present relations\ninformative for prediction beyond exposing the task's input-output\ndistribution.\n Therefore, to evaluate models' in-context learning ability independent of\nmodels' memory, we introduce a Concept-sharing few-shot learning method\nchoosing the demonstrations that share an underlying concept with the predicted\nsample. We extract a set of such concepts from available human explanations and\nmeasure how much models can benefit from presenting these concepts in few-shot\ndemonstrations.\n We find that most of the recent in-context learners can not consistently\nbenefit from the demonstrated concepts, irrespective of the model size.\nHowever, we note that T0 models are more sensitive to exhibited concepts,\nbenefiting from concept-sharing demonstrations in 7 out of 8 evaluation\nscenarios.\n","authors":["Michal Štefánik","Marek Kadlčík"],"pdf_url":"https://arxiv.org/pdf/2212.01692v4.pdf","comment":"Awarded Best Paper at ACL 2023 Natural Language Reasoning and\n Structured Explanations (NLRSE) workshop"},{"id":"http://arxiv.org/abs/2307.08621v2","updated":"2023-07-19T05:56:42Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10551v3","updated":"2023-07-19T05:52:32Z","published":"2022-12-20T18:54:08Z","title":"Lego-MT: Learning Detachable Models for Massively Multilingual Machine\n Translation","summary":" Multilingual neural machine translation (MNMT) aims to build a unified model\nfor many language directions. Existing monolithic models for MNMT encounter two\nchallenges: parameter interference among languages and inefficient inference\nfor large models. 
In this paper, we revisit the classic multi-way structures\nand develop a detachable model by assigning each language (or group of\nlanguages) to an individual branch that supports plug-and-play training and\ninference. To address the needs of learning representations for all languages\nin a unified space, we propose a novel efficient training recipe, upon which we\nbuild an effective detachable model, Lego-MT. For a fair comparison, we collect\ndata from OPUS and build a translation benchmark covering 433 languages and\n1.3B parallel data. Experiments show that Lego-MT with 1.2B parameters brings\nan average gain of 3.2 spBLEU. It even outperforms M2M-100 with 12B parameters.\nThe proposed training recipe brings a 28.2$\\times$ speedup over the\nconventional multi-way training method.\\footnote{\n\\url{https://github.com/CONE-MT/Lego-MT}.}\n","authors":["Fei Yuan","Yinquan Lu","WenHao Zhu","Lingpeng Kong","Lei Li","Yu Qiao","Jingjing Xu"],"pdf_url":"https://arxiv.org/pdf/2212.10551v3.pdf","comment":"ACL 2023 Findings"},{"id":"http://arxiv.org/abs/2303.12135v4","updated":"2023-07-19T05:30:31Z","published":"2023-03-21T18:48:11Z","title":"Understand Legal Documents with Contextualized Large Language Models","summary":" The growth of pending legal cases in populous countries, such as India, has\nbecome a major issue. Developing effective techniques to process and understand\nlegal documents is extremely useful in resolving this problem. In this paper,\nwe present our systems for SemEval-2023 Task 6: understanding legal texts (Modi\net al., 2023). Specifically, we first develop the Legal-BERT-HSLN model that\nconsiders the comprehensive context information in both intra- and\ninter-sentence levels to predict rhetorical roles (subtask A) and then train a\nLegal-LUKE model, which is legal-contextualized and entity-aware, to recognize\nlegal entities (subtask B). Our evaluations demonstrate that our designed\nmodels are more accurate than baselines, e.g., with an up to 15.0% better F1\nscore in subtask B. We achieved notable performance in the task leaderboard,\ne.g., 0.834 micro F1 score, and ranked No.5 out of 27 teams in subtask A.\n","authors":["Xin Jin","Yuchen Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12135v4.pdf","comment":"SemEval 2023"},{"id":"http://arxiv.org/abs/2306.07848v5","updated":"2023-07-19T04:56:33Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning based cross-modality pretraining methods have recently\nexhibited impressive success in diverse fields. In this paper, we propose\nGEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio\npretraining (CLAP) method for speech emotion recognition. Specifically, a novel\nemotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised\npre-trained models. Second, considering the importance of gender attribute in\nspeech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and\nmulti-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to\nintegrate the emotion and gender information of speech signals, forming more\nreasonable objectives. Extensive experiments on IEMOCAP show that our proposed\ntwo GEmo-CLAP models consistently outperform the baseline Emo-CLAP with\ndifferent pre-trained models, while also achieving the best recognition\nperformance compared with recent state-of-the-art methods. 
Noticeably, the\nproposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\\% and WAR of\n82.06\\%.\n","authors":["Yu Pan","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2306.07848v5.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2307.09744v1","updated":"2023-07-19T04:25:21Z","published":"2023-07-19T04:25:21Z","title":"Enhancing conversational quality in language learning chatbots: An\n evaluation of GPT4 for ASR error correction","summary":" The integration of natural language processing (NLP) technologies into\neducational applications has shown promising results, particularly in the\nlanguage learning domain. Recently, many spoken open-domain chatbots have been\nused as speaking partners, helping language learners improve their language\nskills. However, one of the significant challenges is the high word-error-rate\n(WER) when recognizing non-native/non-fluent speech, which interrupts\nconversation flow and leads to disappointment for learners. This paper explores\nthe use of GPT4 for ASR error correction in conversational settings. In\naddition to WER, we propose to use semantic textual similarity (STS) and next\nresponse sensibility (NRS) metrics to evaluate the impact of error correction\nmodels on the quality of the conversation. We find that transcriptions\ncorrected by GPT4 lead to higher conversation quality, despite an increase in\nWER. GPT4 also outperforms standard error correction methods without the need\nfor in-domain training data.\n","authors":["Long Mai","Julie Carson-Berndsen"],"pdf_url":"https://arxiv.org/pdf/2307.09744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09455v2","updated":"2023-07-19T04:13:11Z","published":"2023-07-18T17:29:23Z","title":"Pseudo Outlier Exposure for Out-of-Distribution Detection using\n Pretrained Transformers","summary":" For real-world language applications, detecting an out-of-distribution (OOD)\nsample is helpful to alert users or reject such unreliable samples. However,\nmodern over-parameterized language models often produce overconfident\npredictions for both in-distribution (ID) and OOD samples. In particular,\nlanguage models suffer from OOD samples with a similar semantic representation\nto ID samples since these OOD samples lie near the ID manifold. A rejection\nnetwork can be trained with ID and diverse outlier samples to detect test OOD\nsamples, but explicitly collecting auxiliary OOD datasets brings an additional\nburden for data collection. In this paper, we propose a simple but effective\nmethod called Pseudo Outlier Exposure (POE) that constructs a surrogate OOD\ndataset by sequentially masking tokens related to ID classes. The surrogate OOD\nsample introduced by POE shows a similar representation to ID data, which is\nmost effective in training a rejection network. Our method does not require any\nexternal OOD data and can be easily implemented within off-the-shelf\nTransformers. 
A comprehensive comparison with state-of-the-art algorithms\ndemonstrates POE's competitiveness on several text classification benchmarks.\n","authors":["Jaeyoung Kim","Kyuheon Jung","Dongbin Na","Sion Jang","Eunbin Park","Sungchul Choi"],"pdf_url":"https://arxiv.org/pdf/2307.09455v2.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.09706v1","updated":"2023-07-19T01:37:31Z","published":"2023-07-19T01:37:31Z","title":"RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap","summary":" Taxonomies are an essential knowledge representation, yet most studies on\nautomatic taxonomy construction (ATC) resort to manual evaluation to score\nproposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just\nas important as taxonomy construction. We propose RaTE, an automatic label-free\ntaxonomy scoring procedure, which relies on a large pre-trained language model.\nWe apply our evaluation procedure to three state-of-the-art ATC algorithms with\nwhich we built seven taxonomies from the Yelp domain, and show that 1) RaTE\ncorrelates well with human judgments and 2) artificially degrading a taxonomy\nleads to decreasing RaTE score.\n","authors":["Tianjian Gao","Phillipe Langlais"],"pdf_url":"https://arxiv.org/pdf/2307.09706v1.pdf","comment":"15th International Conference on Computational Semantics (IWCS),\n Association for Computational Linguistics (ACL)"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. 
Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09705v1","updated":"2023-07-19T01:22:40Z","published":"2023-07-19T01:22:40Z","title":"CValues: Measuring the Values of Chinese Large Language Models from\n Safety to Responsibility","summary":" With the rapid evolution of large language models (LLMs), there is a growing\nconcern that they may pose risks or have negative social impacts. Therefore,\nevaluation of human values alignment is becoming increasingly important.\nPrevious work mainly focuses on assessing the performance of LLMs on certain\nknowledge and reasoning abilities, while neglecting the alignment to human\nvalues, especially in a Chinese context. In this paper, we present CValues, the\nfirst Chinese human values evaluation benchmark to measure the alignment\nability of LLMs in terms of both safety and responsibility criteria. As a\nresult, we have manually collected adversarial safety prompts across 10\nscenarios and induced responsibility prompts from 8 domains by professional\nexperts. To provide a comprehensive values evaluation of Chinese LLMs, we not\nonly conduct human evaluation for reliable comparison, but also construct\nmulti-choice prompts for automatic evaluation. Our findings suggest that while\nmost Chinese LLMs perform well in terms of safety, there is considerable room\nfor improvement in terms of responsibility. Moreover, both the automatic and\nhuman evaluation are important for assessing the human values alignment in\ndifferent aspects. The benchmark and code is available on ModelScope and\nGithub.\n","authors":["Guohai Xu","Jiayi Liu","Ming Yan","Haotian Xu","Jinghui Si","Zhuoran Zhou","Peng Yi","Xing Gao","Jitao Sang","Rong Zhang","Ji Zhang","Chao Peng","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.09705v1.pdf","comment":"Working in Process"},{"id":"http://arxiv.org/abs/2307.09702v1","updated":"2023-07-19T01:14:49Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for LLMs","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09701v1","updated":"2023-07-19T01:05:33Z","published":"2023-07-19T01:05:33Z","title":"Efficiency Pentathlon: A Standardized Arena for Efficiency Evaluation","summary":" Rising computational demands of modern natural language processing (NLP)\nsystems have increased the barrier to entry for cutting-edge research while\nposing serious environmental concerns. Yet, progress on model efficiency has\nbeen impeded by practical challenges in model evaluation and comparison. For\nexample, hardware is challenging to control due to disparate levels of\naccessibility across different institutions. 
Moreover, improvements in metrics\nsuch as FLOPs often fail to translate to progress in real-world applications.\nIn response, we introduce Pentathlon, a benchmark for holistic and realistic\nevaluation of model efficiency. Pentathlon focuses on inference, which accounts\nfor a majority of the compute in a model's lifecycle. It offers a\nstrictly-controlled hardware platform, and is designed to mirror real-world\napplications scenarios. It incorporates a suite of metrics that target\ndifferent aspects of efficiency, including latency, throughput, memory\noverhead, and energy consumption. Pentathlon also comes with a software library\nthat can be seamlessly integrated into any codebase and enable evaluation. As a\nstandardized and centralized evaluation platform, Pentathlon can drastically\nreduce the workload to make fair and reproducible efficiency comparisons. While\ninitially focused on natural language processing (NLP) models, Pentathlon is\ndesigned to allow flexible extension to other fields. We envision Pentathlon\nwill stimulate algorithmic innovations in building efficient models, and foster\nan increased awareness of the social and environmental implications in the\ndevelopment of future-generation NLP models.\n","authors":["Hao Peng","Qingqing Cao","Jesse Dodge","Matthew E. Peters","Jared Fernandez","Tom Sherborne","Kyle Lo","Sam Skjonsberg","Emma Strubell","Darrell Plessas","Iz Beltagy","Evan Pete Walsh","Noah A. Smith","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2307.09701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08272v2","updated":"2023-07-19T23:52:23Z","published":"2023-07-17T06:36:53Z","title":"ChatGPT is Good but Bing Chat is Better for Vietnamese Students","summary":" This study examines the efficacy of two SOTA large language models (LLMs),\nnamely ChatGPT and Microsoft Bing Chat (BingChat), in catering to the needs of\nVietnamese students. Although ChatGPT exhibits proficiency in multiple\ndisciplines, Bing Chat emerges as the more advantageous option. We conduct a\ncomparative analysis of their academic achievements in various disciplines,\nencompassing mathematics, literature, English language, physics, chemistry,\nbiology, history, geography, and civic education. The results of our study\nsuggest that BingChat demonstrates superior performance compared to ChatGPT\nacross a wide range of subjects, with the exception of literature, where\nChatGPT exhibits better performance. Additionally, BingChat utilizes the more\nadvanced GPT-4 technology in contrast to ChatGPT, which is built upon GPT-3.5.\nThis allows BingChat to improve to comprehension, reasoning and generation of\ncreative and informative text. Moreover, the fact that BingChat is accessible\nin Vietnam and its integration of hyperlinks and citations within responses\nserve to reinforce its superiority. In our analysis, it is evident that while\nChatGPT exhibits praiseworthy qualities, BingChat presents a more apdated\nsolutions for Vietnamese students.\n","authors":["Xuan-Quy Dao","Ngoc-Bich Le"],"pdf_url":"https://arxiv.org/pdf/2307.08272v2.pdf","comment":"13 pages; 6 figures"},{"id":"http://arxiv.org/abs/2307.10490v1","updated":"2023-07-19T23:03:20Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. 
An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10488v1","updated":"2023-07-19T22:48:02Z","published":"2023-07-19T22:48:02Z","title":"SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot\n Neural Sparse Retrieval","summary":" Traditionally, sparse retrieval systems relied on lexical representations to\nretrieve documents, such as BM25, dominated information retrieval tasks. With\nthe onset of pre-trained transformer models such as BERT, neural sparse\nretrieval has led to a new paradigm within retrieval. Despite the success,\nthere has been limited software supporting different sparse retrievers running\nin a unified, common environment. This hinders practitioners from fairly\ncomparing different sparse models and obtaining realistic evaluation results.\nAnother missing piece is, that a majority of prior work evaluates sparse\nretrieval models on in-domain retrieval, i.e. on a single dataset: MS MARCO.\nHowever, a key requirement in practical retrieval systems requires models that\ncan generalize well to unseen out-of-domain, i.e. zero-shot retrieval tasks. In\nthis work, we provide SPRINT, a unified Python toolkit based on Pyserini and\nLucene, supporting a common interface for evaluating neural sparse retrieval.\nThe toolkit currently includes five built-in models: uniCOIL, DeepImpact,\nSPARTA, TILDEv2 and SPLADEv2. Users can also easily add customized models by\ndefining their term weighting method. Using our toolkit, we establish strong\nand reproducible zero-shot sparse retrieval baselines across the\nwell-acknowledged benchmark, BEIR. Our results demonstrate that SPLADEv2\nachieves the best average score of 0.470 nDCG@10 on BEIR amongst all neural\nsparse retrievers. In this work, we further uncover the reasons behind its\nperformance gain. We show that SPLADEv2 produces sparse representations with a\nmajority of tokens outside of the original query and document which is often\ncrucial for its performance gains, i.e. a limitation among its other sparse\ncounterparts. We provide our SPRINT toolkit, models, and data used in our\nexperiments publicly here at https://github.com/thakur-nandan/sprint.\n","authors":["Nandan Thakur","Kexin Wang","Iryna Gurevych","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10488v1.pdf","comment":"Accepted at SIGIR 2023 (Resource Track)"},{"id":"http://arxiv.org/abs/2307.10485v1","updated":"2023-07-19T22:43:57Z","published":"2023-07-19T22:43:57Z","title":"FinGPT: Democratizing Internet-scale Data for Financial Large Language\n Models","summary":" Large language models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating human-like texts, which may potentially\nrevolutionize the finance industry. However, existing LLMs often fall short in\nthe financial field, which is mainly attributed to the disparities between\ngeneral text data and financial text data. 
Unfortunately, there is only a\nlimited number of financial text datasets available (quite small size), and\nBloombergGPT, the first financial LLM (FinLLM), is close-sourced (only the\ntraining logs were released). In light of this, we aim to democratize\nInternet-scale financial data for LLMs, which is an open challenge due to\ndiverse data sources, low signal-to-noise ratio, and high time-validity. To\naddress the challenges, we introduce an open-sourced and data-centric\nframework, \\textit{Financial Generative Pre-trained Transformer (FinGPT)}, that\nautomates the collection and curation of real-time financial data from >34\ndiverse sources on the Internet, providing researchers and practitioners with\naccessible and transparent resources to develop their FinLLMs. Additionally, we\npropose a simple yet effective strategy for fine-tuning FinLLM using the\ninherent feedback from the market, dubbed Reinforcement Learning with Stock\nPrices (RLSP). We also adopt the Low-rank Adaptation (LoRA, QLoRA) method that\nenables users to customize their own FinLLMs from open-source general-purpose\nLLMs at a low cost. Finally, we showcase several FinGPT applications, including\nrobo-advisor, sentiment analysis for algorithmic trading, and low-code\ndevelopment. FinGPT aims to democratize FinLLMs, stimulate innovation, and\nunlock new opportunities in open finance. The codes are available at\nhttps://github.com/AI4Finance-Foundation/FinGPT and\nhttps://github.com/AI4Finance-Foundation/FinNLP\n","authors":["Xiao-Yang Liu","Guoxuan Wang","Daochen Zha"],"pdf_url":"https://arxiv.org/pdf/2307.10485v1.pdf","comment":"43 pages, 9 tables, and 3 figures"},{"id":"http://arxiv.org/abs/2307.10476v1","updated":"2023-07-19T22:14:58Z","published":"2023-07-19T22:14:58Z","title":"What can we learn from Data Leakage and Unlearning for Law?","summary":" Large Language Models (LLMs) have a privacy concern because they memorize\ntraining data (including personally identifiable information (PII) like emails\nand phone numbers) and leak it during inference. A company can train an LLM on\nits domain-customized data which can potentially also include their users' PII.\nIn order to comply with privacy laws such as the \"right to be forgotten\", the\ndata points of users that are most vulnerable to extraction could be deleted.\nWe find that once the most vulnerable points are deleted, a new set of points\nbecome vulnerable to extraction. So far, little attention has been given to\nunderstanding memorization for fine-tuned models. In this work, we also show\nthat not only do fine-tuned models leak their training data but they also leak\nthe pre-training data (and PII) memorized during the pre-training phase. The\nproperty of new data points becoming vulnerable to extraction after unlearning\nand leakage of pre-training data through fine-tuned models can pose significant\nprivacy and legal concerns for companies that use LLMs to offer services. 
We\nhope this work will start an interdisciplinary discussion within AI and law\ncommunities regarding the need for policies to tackle these issues.\n","authors":["Jaydeep Borkar"],"pdf_url":"https://arxiv.org/pdf/2307.10476v1.pdf","comment":"5 pages, 8 figures, accepted to the first GenLaw workshop at ICML'23,\n Hawai'i"},{"id":"http://arxiv.org/abs/2307.10475v1","updated":"2023-07-19T22:14:49Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v1.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2307.10472v1","updated":"2023-07-19T22:03:40Z","published":"2023-07-19T22:03:40Z","title":"Can Instruction Fine-Tuned Language Models Identify Social Bias through\n Prompting?","summary":" As the breadth and depth of language model applications continue to expand\nrapidly, it is increasingly important to build efficient frameworks for\nmeasuring and mitigating the learned or inherited social biases of these\nmodels. In this paper, we present our work on evaluating instruction fine-tuned\nlanguage models' ability to identify bias through zero-shot prompting,\nincluding Chain-of-Thought (CoT) prompts. Across LLaMA and its two instruction\nfine-tuned versions, Alpaca 7B performs best on the bias identification task\nwith an accuracy of 56.7%. We also demonstrate that scaling up LLM size and\ndata diversity could lead to further performance gain. This is a\nwork-in-progress presenting the first component of our bias mitigation\nframework. We will keep updating this work as we get more results.\n","authors":["Omkar Dige","Jacob-Junqi Tian","David Emerson","Faiza Khan Khattak"],"pdf_url":"https://arxiv.org/pdf/2307.10472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10457v1","updated":"2023-07-19T21:00:16Z","published":"2023-07-19T21:00:16Z","title":"Improving Pre-trained Language Models' Generalization","summary":" The reusability of state-of-the-art Pre-trained Language Models (PLMs) is\noften limited by their generalization problem, where their performance\ndrastically decreases when evaluated on examples that differ from the training\ndataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation\narises from PLMs' reliance on spurious correlations, which work well for\nfrequent example types but not for general examples. 
To address this issue, we\npropose a training approach called Mask-tuning, which integrates Masked\nLanguage Modeling (MLM) training objectives into the fine-tuning process to\nenhance PLMs' generalization. Comprehensive experiments demonstrate that\nMask-tuning surpasses current state-of-the-art techniques and enhances PLMs'\ngeneralization on OOD datasets while improving their performance on\nin-distribution datasets. The findings suggest that Mask-tuning improves the\nreusability of PLMs on unseen data, making them more practical and effective\nfor real-world applications.\n","authors":["Somayeh Ghanbarzadeh","Hamid Palangi","Yan Huang","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10443v1","updated":"2023-07-19T20:17:37Z","published":"2023-07-19T20:17:37Z","title":"Integrating a Heterogeneous Graph with Entity-aware Self-attention using\n Relative Position Labels for Reading Comprehension Model","summary":" Despite the significant progress made by transformer models in machine\nreading comprehension tasks, they still face limitations in handling complex\nreasoning tasks due to the absence of explicit knowledge in the input sequence.\nThis paper proposes a novel attention pattern to overcome this limitation,\nwhich integrates reasoning knowledge derived from a heterogeneous graph into\nthe transformer architecture using a graph-enhanced self-attention mechanism.\nThe proposed attention pattern comprises three key elements: global-local\nattention for word tokens, graph attention for entity tokens that exhibit\nstrong attention towards tokens connected in the graph as opposed to those\nunconnected, and the consideration of the type of relationship between each\nentity token and word token. This results in optimized attention between the\ntwo if a relationship exists. The pattern is coupled with special relative\nposition labels, allowing it to integrate with LUKE's entity-aware\nself-attention mechanism. The experimental findings corroborate that our model\noutperforms both the cutting-edge LUKE-Graph and the baseline LUKE model on the\nReCoRD dataset that focuses on commonsense reasoning.\n","authors":["Shima Foolad","Kourosh Kiani"],"pdf_url":"https://arxiv.org/pdf/2307.10443v1.pdf","comment":"submitted for Knowledge-Based Systems Journal"},{"id":"http://arxiv.org/abs/2307.10442v1","updated":"2023-07-19T20:16:46Z","published":"2023-07-19T20:16:46Z","title":"Thrust: Adaptively Propels Large Language Models with External Knowledge","summary":" Although large-scale pre-trained language models (PTLMs) are shown to encode\nrich knowledge in their model parameters, the inherent knowledge in PTLMs can\nbe opaque or static, making external knowledge necessary. However, the existing\ninformation retrieval techniques could be costly and may even introduce noisy\nand sometimes misleading knowledge. To address these challenges, we propose the\ninstance-level adaptive propulsion of external knowledge (IAPEK), where we only\nconduct the retrieval when necessary. To achieve this goal, we propose\nmeasuring whether a PTLM contains enough knowledge to solve an instance with a\nnovel metric, Thrust, which leverages the representation distribution of a\nsmall number of seen instances. 
Extensive experiments demonstrate that thrust\nis a good measurement of PTLM models' instance-level knowledgeability.\nMoreover, we can achieve significantly higher cost-efficiency with the Thrust\nscore as the retrieval indicator than the naive usage of external knowledge on\n88% of the evaluated tasks with 26% average performance improvement. Such\nfindings shed light on the real-world practice of knowledge-enhanced LMs with a\nlimited knowledge-seeking budget due to computation latency or costs.\n","authors":["Xinran Zhao","Hongming Zhang","Xiaoman Pan","Wenlin Yao","Dong Yu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10442v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2301.13816v4","updated":"2023-07-19T19:55:31Z","published":"2023-01-31T18:02:26Z","title":"Execution-based Code Generation using Deep Reinforcement Learning","summary":" The utilization of programming language (PL) models, pre-trained on\nlarge-scale code corpora, as a means of automating software engineering\nprocesses has demonstrated considerable potential in streamlining various code\ngeneration tasks such as code completion, code translation, and program\nsynthesis. However, current approaches mainly rely on supervised fine-tuning\nobjectives borrowed from text generation, neglecting unique sequence-level\ncharacteristics of code, including but not limited to compilability as well as\nsyntactic and functional correctness. To address this limitation, we propose\nPPOCoder, a new framework for code generation that synergistically combines\npre-trained PL models with Proximal Policy Optimization (PPO) which is a widely\nused deep reinforcement learning technique. By utilizing non-differentiable\nfeedback from code execution and structure alignment, PPOCoder seamlessly\nintegrates external code-specific knowledge into the model optimization\nprocess. It's important to note that PPOCoder is a task-agnostic and\nmodel-agnostic framework that can be used across different code generation\ntasks and PLs. Extensive experiments on three code generation tasks demonstrate\nthe effectiveness of our proposed approach compared to SOTA methods, achieving\nsignificant improvements in compilation success rates and functional\ncorrectness across different PLs.\n","authors":["Parshin Shojaee","Aneesh Jain","Sindhu Tipirneni","Chandan K. Reddy"],"pdf_url":"https://arxiv.org/pdf/2301.13816v4.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR), 2023"},{"id":"http://arxiv.org/abs/2307.10432v1","updated":"2023-07-19T19:40:34Z","published":"2023-07-19T19:40:34Z","title":"PharmacyGPT: The AI Pharmacist","summary":" In this study, we introduce PharmacyGPT, a novel framework to assess the\ncapabilities of large language models (LLMs) such as ChatGPT and GPT-4 in\nemulating the role of clinical pharmacists. Our methodology encompasses the\nutilization of LLMs to generate comprehensible patient clusters, formulate\nmedication plans, and forecast patient outcomes. We conduct our investigation\nusing real data acquired from the intensive care unit (ICU) at the University\nof North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable\ninsights into the potential applications and limitations of LLMs in the field\nof clinical pharmacy, with implications for both patient care and the\ndevelopment of future AI-driven healthcare solutions. 
By evaluating the\nperformance of PharmacyGPT, we aim to contribute to the ongoing discourse\nsurrounding the integration of artificial intelligence in healthcare settings,\nultimately promoting the responsible and efficacious use of such technologies.\n","authors":["Zhengliang Liu","Zihao Wu","Mengxuan Hu","Bokai Zhao","Lin Zhao","Tianyi Zhang","Haixing Dai","Xianyan Chen","Ye Shen","Sheng Li","Brian Murray","Tianming Liu","Andrea Sikora"],"pdf_url":"https://arxiv.org/pdf/2307.10432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09826v2","updated":"2023-07-19T19:30:52Z","published":"2023-04-16T11:22:59Z","title":"Fairness in AI and Its Long-Term Implications on Society","summary":" Successful deployment of artificial intelligence (AI) in various settings has\nled to numerous positive outcomes for individuals and society. However, AI\nsystems have also been shown to harm parts of the population due to biased\npredictions. AI fairness focuses on mitigating such biases to ensure AI\ndecision making is not discriminatory towards certain groups. We take a closer\nlook at AI fairness and analyze how lack of AI fairness can lead to deepening\nof biases over time and act as a social stressor. More specifically, we discuss\nhow biased models can lead to more negative real-world outcomes for certain\ngroups, which may then become more prevalent by deploying new AI models trained\non increasingly biased data, resulting in a feedback loop. If the issues\npersist, they could be reinforced by interactions with other risks and have\nsevere implications on society in the form of social unrest. We examine current\nstrategies for improving AI fairness, assess their limitations in terms of\nreal-world deployment, and explore potential paths forward to ensure we reap\nAI's benefits without causing society's collapse.\n","authors":["Ondrej Bohdal","Timothy Hospedales","Philip H. S. Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2304.09826v2.pdf","comment":"Stanford Existential Risks Conference 2023"},{"id":"http://arxiv.org/abs/2306.17582v2","updated":"2023-07-19T19:30:28Z","published":"2023-02-20T06:39:06Z","title":"ChatGPT for Robotics: Design Principles and Model Abilities","summary":" This paper presents an experimental study regarding the use of OpenAI's\nChatGPT for robotics applications. We outline a strategy that combines design\nprinciples for prompt engineering and the creation of a high-level function\nlibrary which allows ChatGPT to adapt to different robotics tasks, simulators,\nand form factors. We focus our evaluations on the effectiveness of different\nprompt engineering techniques and dialog strategies towards the execution of\nvarious types of robotics tasks. We explore ChatGPT's ability to use free-form\ndialog, parse XML tags, and to synthesize code, in addition to the use of\ntask-specific prompting functions and closed-loop reasoning through dialogues.\nOur study encompasses a range of tasks within the robotics domain, from basic\nlogical, geometrical, and mathematical reasoning all the way to complex domains\nsuch as aerial navigation, manipulation, and embodied agents. We show that\nChatGPT can be effective at solving several of such tasks, while allowing users\nto interact with it primarily via natural language instructions. 
In addition to\nthese studies, we introduce an open-sourced research tool called PromptCraft,\nwhich contains a platform where researchers can collaboratively upload and vote\non examples of good prompting schemes for robotics applications, as well as a\nsample robotics simulator with ChatGPT integration, making it easier for users\nto get started with using ChatGPT for robotics.\n","authors":["Sai Vemprala","Rogerio Bonatti","Arthur Bucker","Ashish Kapoor"],"pdf_url":"https://arxiv.org/pdf/2306.17582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10323v1","updated":"2023-07-19T07:20:30Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00370v2","updated":"2023-07-19T06:55:04Z","published":"2023-07-01T15:44:53Z","title":"Improving Text Matching in E-Commerce Search with A Rationalizable,\n Intervenable and Fast Entity-Based Relevance Model","summary":" Discovering the intended items of user queries from a massive repository of\nitems is one of the main goals of an e-commerce search system. Relevance\nprediction is essential to the search system since it helps improve\nperformance. When online serving a relevance model, the model is required to\nperform fast and accurate inference. Currently, the widely used models such as\nBi-encoder and Cross-encoder have their limitations in accuracy or inference\nspeed respectively. In this work, we propose a novel model called the\nEntity-Based Relevance Model (EBRM). We identify the entities contained in an\nitem and decompose the QI (query-item) relevance problem into multiple QE\n(query-entity) relevance problems; we then aggregate their results to form the\nQI prediction using a soft logic formulation. The decomposition allows us to\nuse a Cross-encoder QE relevance module for high accuracy as well as cache QE\npredictions for fast online inference. Utilizing soft logic makes the\nprediction procedure interpretable and intervenable. We also show that\npretraining the QE module with auto-generated QE data from user logs can\nfurther improve the overall performance. The proposed method is evaluated on\nlabeled data from e-commerce websites. 
Empirical results show that it achieves\npromising improvements with computational efficiency.\n","authors":["Jiong Cai","Yong Jiang","Yue Zhang","Chengyue Jiang","Ke Yu","Jianhui Ji","Rong Xiao","Haihong Tang","Tao Wang","Zhongqiang Huang","Pengjun Xie","Fei Huang","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2307.00370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10314v1","updated":"2023-07-19T03:31:41Z","published":"2023-07-19T03:31:41Z","title":"Mood Classification of Bangla Songs Based on Lyrics","summary":" Music can evoke various emotions, and with the advancement of technology, it\nhas become more accessible to people. Bangla music, which portrays different\nhuman emotions, lacks sufficient research. The authors of this article aim to\nanalyze Bangla songs and classify their moods based on the lyrics. To achieve\nthis, the authors compiled a dataset of 4000 Bangla song lyrics and genres,\nand used Natural Language Processing and the BERT algorithm to analyze the\ndata. Among the 4000 songs, 1513 represent the sad mood, 1362\nthe romantic mood, 886 happiness, and the remaining 239 are classified as\nrelaxation. By embedding the lyrics of the songs, the authors have classified\nthe songs into four moods: Happy, Sad, Romantic, and Relaxed. This research is\ncrucial as it enables a multi-class classification of songs' moods, making the\nmusic more relatable to people's emotions. The article presents the automated\nresult of the four moods accurately derived from the song lyrics.\n","authors":["Maliha Mahajebin","Mohammad Rifat Ahmmad Rashid","Nafees Mansoor"],"pdf_url":"https://arxiv.org/pdf/2307.10314v1.pdf","comment":"Presented at International Conference on Inventive Communication and\n Computational Technologies 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.10173v1","updated":"2023-07-19T17:58:03Z","published":"2023-07-19T17:58:03Z","title":"DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity\n Human-centric Rendering","summary":" Realistic human-centric rendering plays a key role in both computer vision\nand computer graphics. Rapid progress has been made in the algorithm aspect\nover the years, yet existing human-centric rendering datasets and benchmarks\nare rather impoverished in terms of diversity, which are crucial for rendering\neffect. Researchers are usually constrained to explore and evaluate a small set\nof rendering problems on current datasets, while real-world applications\nrequire methods to be robust across different scenarios. In this work, we\npresent DNA-Rendering, a large-scale, high-fidelity repository of human\nperformance data for neural actor rendering. DNA-Rendering presents several\nalluring attributes. First, our dataset contains over 1500 human subjects, 5000\nmotion sequences, and 67.5M frames' data volume. Second, we provide rich assets\nfor each subject -- 2D/3D human body keypoints, foreground masks, SMPLX models,\ncloth/accessory materials, multi-view images, and videos. These assets boost\nthe current method's accuracy on downstream rendering tasks. Third, we\nconstruct a professional multi-view system to capture data, which contains 60\nsynchronous cameras with max 4096 x 3000 resolution, 15 fps speed, and stern\ncamera calibration steps, ensuring high-quality resources for task training and\nevaluation.
Along with the dataset, we provide a large-scale and quantitative\nbenchmark in full-scale, with multiple tasks to evaluate the existing progress\nof novel view synthesis, novel pose animation synthesis, and novel identity\nrendering methods. In this manuscript, we describe our DNA-Rendering effort as\na revealing of new observations, challenges, and future directions to\nhuman-centric rendering. The dataset, code, and benchmarks will be publicly\navailable at https://dna-rendering.github.io/\n","authors":["Wei Cheng","Ruixiang Chen","Wanqi Yin","Siming Fan","Keyu Chen","Honglin He","Huiwen Luo","Zhongang Cai","Jingbo Wang","Yang Gao","Zhengming Yu","Zhengyu Lin","Daxuan Ren","Lei Yang","Ziwei Liu","Chen Change Loy","Chen Qian","Wayne Wu","Dahua Lin","Bo Dai","Kwan-Yee Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10173v1.pdf","comment":"This paper is accepted by ICCV2023. Project page:\n https://dna-rendering.github.io/"},{"id":"http://arxiv.org/abs/2112.06809v8","updated":"2023-07-19T17:50:21Z","published":"2021-12-13T17:11:32Z","title":"Persistent Animal Identification Leveraging Non-Visual Markers","summary":" Our objective is to locate and provide a unique identifier for each mouse in\na cluttered home-cage environment through time, as a precursor to automated\nbehaviour recognition for biological research. This is a very challenging\nproblem due to (i) the lack of distinguishing visual features for each mouse,\nand (ii) the close confines of the scene with constant occlusion, making\nstandard visual tracking approaches unusable. However, a coarse estimate of\neach mouse's location is available from a unique RFID implant, so there is the\npotential to optimally combine information from (weak) tracking with coarse\ninformation on identity. To achieve our objective, we make the following key\ncontributions: (a) the formulation of the object identification problem as an\nassignment problem (solved using Integer Linear Programming), and (b) a novel\nprobabilistic model of the affinity between tracklets and RFID data. The latter\nis a crucial part of the model, as it provides a principled probabilistic\ntreatment of object detections given coarse localisation. Our approach achieves\n77% accuracy on this animal identification problem, and is able to reject\nspurious detections when the animals are hidden.\n","authors":["Michael P. J. Camilleri","Li Zhang","Rasneer S. Bains","Andrew Zisserman","Christopher K. I. Williams"],"pdf_url":"https://arxiv.org/pdf/2112.06809v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10166v1","updated":"2023-07-19T17:50:03Z","published":"2023-07-19T17:50:03Z","title":"Adversarial Latent Autoencoder with Self-Attention for Structural Image\n Synthesis","summary":" Generative Engineering Design approaches driven by Deep Generative Models\n(DGM) have been proposed to facilitate industrial engineering processes. In\nsuch processes, designs often come in the form of images, such as blueprints,\nengineering drawings, and CAD models depending on the level of detail. DGMs\nhave been successfully employed for synthesis of natural images, e.g.,\ndisplaying animals, human faces and landscapes. However, industrial design\nimages are fundamentally different from natural scenes in that they contain\nrich structural patterns and long-range dependencies, which are challenging for\nconvolution-based DGMs to generate. 
Moreover, DGM-driven generation process is\ntypically triggered based on random noisy inputs, which outputs unpredictable\nsamples and thus cannot perform an efficient industrial design exploration. We\ntackle these challenges by proposing a novel model Self-Attention Adversarial\nLatent Autoencoder (SA-ALAE), which allows generating feasible design images of\ncomplex engineering parts. With SA-ALAE, users can not only explore novel\nvariants of an existing design, but also control the generation process by\noperating in latent space. The potential of SA-ALAE is shown by generating\nengineering blueprints in a real automotive design task.\n","authors":["Jiajie Fan","Laure Vuaille","Hao Wang","Thomas Bäck"],"pdf_url":"https://arxiv.org/pdf/2307.10166v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.10165v1","updated":"2023-07-19T17:46:55Z","published":"2023-07-19T17:46:55Z","title":"Drone navigation and license place detection for vehicle location in\n indoor spaces","summary":" Millions of vehicles are transported every year, tightly parked in vessels or\nboats. To reduce the risks of associated safety issues like fires, knowing the\nlocation of vehicles is essential, since different vehicles may need different\nmitigation measures, e.g. electric cars. This work is aimed at creating a\nsolution based on a nano-drone that navigates across rows of parked vehicles\nand detects their license plates. We do so via a wall-following algorithm, and\na CNN trained to detect license plates. All computations are done in real-time\non the drone, which just sends position and detected images that allow the\ncreation of a 2D map with the position of the plates. Our solution is capable\nof reading all plates across eight test cases (with several rows of plates,\ndifferent drone speeds, or low light) by aggregation of measurements across\nseveral drone journeys.\n","authors":["Moa Arvidsson","Sithichot Sawirot","Cristofer Englund","Fernando Alonso-Fernandez","Martin Torstensson","Boris Duran"],"pdf_url":"https://arxiv.org/pdf/2307.10165v1.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR"},{"id":"http://arxiv.org/abs/2307.10160v1","updated":"2023-07-19T17:42:36Z","published":"2023-07-19T17:42:36Z","title":"Robust Driving Policy Learning with Guided Meta Reinforcement Learning","summary":" Although deep reinforcement learning (DRL) has shown promising results for\nautonomous navigation in interactive traffic scenarios, existing work typically\nadopts a fixed behavior policy to control social vehicles in the training\nenvironment. This may cause the learned driving policy to overfit the\nenvironment, making it difficult to interact well with vehicles with different,\nunseen behaviors. In this work, we introduce an efficient method to train\ndiverse driving policies for social vehicles as a single meta-policy. By\nrandomizing the interaction-based reward functions of social vehicles, we can\ngenerate diverse objectives and efficiently train the meta-policy through\nguiding policies that achieve specific objectives. We further propose a\ntraining strategy to enhance the robustness of the ego vehicle's driving policy\nusing the environment where social vehicles are controlled by the learned\nmeta-policy. 
Our method successfully learns an ego driving policy that\ngeneralizes well to unseen situations with out-of-distribution (OOD) social\nagents' behaviors in a challenging uncontrolled T-intersection scenario.\n","authors":["Kanghoon Lee","Jiachen Li","David Isele","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10160v1.pdf","comment":"ITSC 2023"},{"id":"http://arxiv.org/abs/2307.10159v1","updated":"2023-07-19T17:39:39Z","published":"2023-07-19T17:39:39Z","title":"FABRIC: Personalizing Diffusion Models with Iterative Feedback","summary":" In an era where visual content generation is increasingly driven by machine\nlearning, the integration of human feedback into generative models presents\nsignificant opportunities for enhancing user experience and output quality.\nThis study explores strategies for incorporating iterative human feedback into\nthe generative process of diffusion-based text-to-image models. We propose\nFABRIC, a training-free approach applicable to a wide range of popular\ndiffusion models, which exploits the self-attention layer present in the most\nwidely used architectures to condition the diffusion process on a set of\nfeedback images. To ensure a rigorous assessment of our approach, we introduce\na comprehensive evaluation methodology, offering a robust mechanism to quantify\nthe performance of generative visual models that integrate human feedback. We\nshow that generation results improve over multiple rounds of iterative feedback\nthrough exhaustive analysis, implicitly optimizing arbitrary user preferences.\nThe potential applications of these findings extend to fields such as\npersonalized content creation and customization.\n","authors":["Dimitri von Rütte","Elisabetta Fedele","Jonathan Thomm","Lukas Wolf"],"pdf_url":"https://arxiv.org/pdf/2307.10159v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10157v1","updated":"2023-07-19T17:38:26Z","published":"2023-07-19T17:38:26Z","title":"Leveraging Visemes for Better Visual Speech Representation and Lip\n Reading","summary":" Lip reading is a challenging task that has many potential applications in\nspeech recognition, human-computer interaction, and security systems. However,\nexisting lip reading systems often suffer from low accuracy due to the\nlimitations of video features. In this paper, we propose a novel approach that\nleverages visemes, which are groups of phonetically similar lip shapes, to\nextract more discriminative and robust video features for lip reading. We\nevaluate our approach on various tasks, including word-level and sentence-level\nlip reading, and audiovisual speech recognition using the Arman-AV dataset, a\nlarge-scale Persian corpus. Our experimental results show that our viseme-based\napproach consistently outperforms the state-of-the-art methods in all these\ntasks.
The proposed method reduces the lip-reading word error rate (WER) by\n9.1% relative to the best previous method.\n","authors":["Javad Peymanfard","Vahid Saeedi","Mohammad Reza Mohammadi","Hossein Zeinali","Nasser Mozayani"],"pdf_url":"https://arxiv.org/pdf/2307.10157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10135v1","updated":"2023-07-19T17:00:45Z","published":"2023-07-19T17:00:45Z","title":"An Improved NeuMIP with Better Accuracy","summary":" Neural reflectance models are capable of accurately reproducing the\nspatially-varying appearance of many real-world materials at different scales.\nHowever, existing methods have difficulties handling highly glossy materials.\nTo address this problem, we introduce a new neural reflectance model which,\ncompared with existing methods, better preserves not only specular highlights\nbut also fine-grained details. To this end, we enhance the neural network\nperformance by encoding input data to frequency space, inspired by NeRF, to\nbetter preserve the details. Furthermore, we introduce a gradient-based loss\nand employ it in multiple stages, adaptive to the progress of the learning\nphase. Lastly, we utilize an optional extension to the decoder network using\nthe Inception module for more accurate yet costly performance. We demonstrate\nthe effectiveness of our method using a variety of synthetic and real examples.\n","authors":["Bowen Xue","Shuang Zhao","Henrik Wann Jensen","Zahra Montazeri"],"pdf_url":"https://arxiv.org/pdf/2307.10135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10129v1","updated":"2023-07-19T16:51:59Z","published":"2023-07-19T16:51:59Z","title":"General vs. Long-Tailed Age Estimation: An Approach to Kill Two Birds\n with One Stone","summary":" Facial age estimation has received a lot of attention for its diverse\napplication scenarios. Most existing studies treat each sample equally and aim\nto reduce the average estimation error for the entire dataset, which can be\nsummarized as General Age Estimation. However, due to the long-tailed\ndistribution prevalent in the dataset, treating all samples equally will\ninevitably bias the model toward the head classes (usually the adult with a\nmajority of samples). Driven by this, some works suggest that each class should\nbe treated equally to improve performance in tail classes (with a minority of\nsamples), which can be summarized as Long-tailed Age Estimation. However,\nLong-tailed Age Estimation usually faces a performance trade-off, i.e.,\nachieving improvement in tail classes by sacrificing the head classes. In this\npaper, our goal is to design a unified framework to perform well on both tasks,\nkilling two birds with one stone. To this end, we propose a simple, effective,\nand flexible training paradigm named GLAE, which is two-fold. Our GLAE provides\na surprising improvement on Morph II, reaching the lowest MAE and CMAE of 1.14\nand 1.27 years, respectively. Compared to the previous best method, MAE dropped\nby up to 34%, which is an unprecedented improvement, and for the first time,\nMAE is close to 1 year old. 
Extensive experiments on other age benchmark\ndatasets, including CACD, MIVIA, and Chalearn LAP 2015, also indicate that GLAE\noutperforms the state-of-the-art approaches significantly.\n","authors":["Zenghao Bao","Zichang Tan","Jun Li","Jun Wan","Xibo Ma","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2307.10129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10123v1","updated":"2023-07-19T16:42:52Z","published":"2023-07-19T16:42:52Z","title":"Two Approaches to Supervised Image Segmentation","summary":" Though performed almost effortlessly by humans, segmenting 2D gray-scale or\ncolor images in terms of their constituent regions of interest\n(e.g.~background, objects or portions of objects) constitutes one of the\ngreatest challenges in science and technology as a consequence of the involved\ndimensionality reduction(3D to 2D), noise, reflections, shades, and occlusions,\namong many other possible effects. While a large number of interesting\napproaches have been respectively suggested along the last decades, it was\nmainly with the more recent development of deep learning that more effective\nand general solutions have been obtained, currently constituting the basic\ncomparison reference for this type of operation. Also developed recently, a\nmultiset-based methodology has been described that is capable of encouraging\nperformance that combines spatial accuracy, stability, and robustness while\nrequiring minimal computational resources (hardware and/or training and\nrecognition time). The interesting features of the latter methodology mostly\nfollow from the enhanced selectivity and sensitivity, as well as good\nrobustness to data perturbations and outliers, allowed by the coincidence\nsimilarity index on which the multiset approach to supervised image\nsegmentation is based. After describing the deep learning and multiset\napproaches, the present work develops two comparison experiments between them\nwhich are primarily aimed at illustrating their respective main interesting\nfeatures when applied to the adopted specific type of data and parameter\nconfigurations. While the deep learning approach confirmed its potential for\nperforming image segmentation, the alternative multiset methodology allowed for\nencouraging accuracy while requiring little computational resources.\n","authors":["Alexandre Benatti","Luciano da F. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.10123v1.pdf","comment":"37 pages, 18 figures"},{"id":"http://arxiv.org/abs/2103.03328v3","updated":"2023-07-19T16:19:53Z","published":"2021-03-04T20:58:22Z","title":"Evaluation of Complexity Measures for Deep Learning Generalization in\n Medical Image Analysis","summary":" The generalization performance of deep learning models for medical image\nanalysis often decreases on images collected with different devices for data\nacquisition, device settings, or patient population. A better understanding of\nthe generalization capacity on new images is crucial for clinicians'\ntrustworthiness in deep learning. Although significant research efforts have\nbeen recently directed toward establishing generalization bounds and complexity\nmeasures, still, there is often a significant discrepancy between the predicted\nand actual generalization performance. 
As well, related large empirical studies\nhave been primarily based on validation with general-purpose image datasets.\nThis paper presents an empirical study that investigates the correlation\nbetween 25 complexity measures and the generalization abilities of supervised\ndeep learning classifiers for breast ultrasound images. The results indicate\nthat PAC-Bayes flatness-based and path norm-based measures produce the most\nconsistent explanation for the combination of models and data. We also\ninvestigate the use of multi-task classification and segmentation approach for\nbreast images, and report that such learning approach acts as an implicit\nregularizer and is conducive toward improved generalization.\n","authors":["Aleksandar Vakanski","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2103.03328v3.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10097v1","updated":"2023-07-19T16:12:37Z","published":"2023-07-19T16:12:37Z","title":"Boundary-Refined Prototype Generation: A General End-to-End Paradigm for\n Semi-Supervised Semantic Segmentation","summary":" Prototype-based classification is a classical method in machine learning, and\nrecently it has achieved remarkable success in semi-supervised semantic\nsegmentation. However, the current approach isolates the prototype\ninitialization process from the main training framework, which appears to be\nunnecessary. Furthermore, while the direct use of K-Means algorithm for\nprototype generation has considered rich intra-class variance, it may not be\nthe optimal solution for the classification task. To tackle these problems, we\npropose a novel boundary-refined prototype generation (BRPG) method, which is\nincorporated into the whole training framework. Specifically, our approach\nsamples and clusters high- and low-confidence features separately based on a\nconfidence threshold, aiming to generate prototypes closer to the class\nboundaries. Moreover, an adaptive prototype optimization strategy is introduced\nto make prototype augmentation for categories with scattered feature\ndistributions. Extensive experiments on the PASCAL VOC 2012 and Cityscapes\ndatasets demonstrate the superiority and scalability of the proposed method,\noutperforming the current state-of-the-art approaches. The code is available at\nxxxxxxxxxxxxxx.\n","authors":["Junhao Dong","Zhu Meng","Delong Liu","Zhicheng Zhao","Fei Su"],"pdf_url":"https://arxiv.org/pdf/2307.10097v1.pdf","comment":"53 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.13479v2","updated":"2023-07-19T16:11:13Z","published":"2023-03-23T17:48:12Z","title":"IST-Net: Prior-free Category-level Pose Estimation with Implicit Space\n Transformation","summary":" Category-level 6D pose estimation aims to predict the poses and sizes of\nunseen objects from a specific category. Thanks to prior deformation, which\nexplicitly adapts a category-specific 3D prior (i.e., a 3D template) to a given\nobject instance, prior-based methods attained great success and have become a\nmajor research stream. However, obtaining category-specific priors requires\ncollecting a large amount of 3D models, which is labor-consuming and often not\naccessible in practice. This motivates us to investigate whether priors are\nnecessary to make prior-based methods effective. Our empirical study shows that\nthe 3D prior itself is not the credit to the high performance. 
The keypoint\nactually is the explicit deformation process, which aligns camera and world\ncoordinates supervised by world-space 3D models (also called canonical space).\nInspired by these observations, we introduce a simple prior-free implicit space\ntransformation network, namely IST-Net, to transform camera-space features to\nworld-space counterparts and build correspondence between them in an implicit\nmanner without relying on 3D priors. Besides, we design camera- and world-space\nenhancers to enrich the features with pose-sensitive information and\ngeometrical constraints, respectively. Albeit simple, IST-Net achieves\nstate-of-the-art performance based-on prior-free design, with top inference\nspeed on the REAL275 benchmark. Our code and models are available at\nhttps://github.com/CVMI-Lab/IST-Net.\n","authors":["Jianhui Liu","Yukang Chen","Xiaoqing Ye","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2303.13479v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.10094v1","updated":"2023-07-19T16:01:09Z","published":"2023-07-19T16:01:09Z","title":"Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D\n Brain MRI Synthesis","summary":" Cross-modality medical image synthesis is a critical topic and has the\npotential to facilitate numerous applications in the medical imaging field.\nDespite recent successes in deep-learning-based generative models, most current\nmedical image synthesis methods rely on generative adversarial networks and\nsuffer from notorious mode collapse and unstable training. Moreover, the 2D\nbackbone-driven approaches would easily result in volumetric inconsistency,\nwhile 3D backbones are challenging and impractical due to the tremendous memory\ncost and training difficulty. In this paper, we introduce a new paradigm for\nvolumetric medical data synthesis by leveraging 2D backbones and present a\ndiffusion-based framework, Make-A-Volume, for cross-modality 3D medical image\nsynthesis. To learn the cross-modality slice-wise mapping, we employ a latent\ndiffusion model and learn a low-dimensional latent space, resulting in high\ncomputational efficiency. To enable the 3D image synthesis and mitigate\nvolumetric inconsistency, we further insert a series of volumetric layers in\nthe 2D slice-mapping model and fine-tune them with paired 3D data. This\nparadigm extends the 2D image diffusion model to a volumetric version with a\nslightly increasing number of parameters and computation, offering a principled\nsolution for generic cross-modality 3D medical image synthesis. We showcase the\neffectiveness of our Make-A-Volume framework on an in-house SWI-MRA brain MRI\ndataset and a public T1-T2 brain MRI dataset. Experimental results demonstrate\nthat our framework achieves superior synthesis results with volumetric\nconsistency.\n","authors":["Lingting Zhu","Zeyue Xue","Zhenchao Jin","Xian Liu","Jingzhen He","Ziwei Liu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2307.10094v1.pdf","comment":"Accepted by International Conference on Medical Image Computing and\n Computer Assisted Intervention (MICCAI 2023). 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2207.00419v3","updated":"2023-07-19T16:00:08Z","published":"2022-06-18T00:26:52Z","title":"Self-Supervised Learning for Videos: A Survey","summary":" The remarkable success of deep learning in various domains relies on the\navailability of large-scale annotated datasets. 
However, obtaining annotations\nis expensive and requires great effort, which is especially challenging for\nvideos. Moreover, the use of human-generated annotations leads to models with\nbiased learning and poor domain generalization and robustness. As an\nalternative, self-supervised learning provides a way for representation\nlearning which does not require annotations and has shown promise in both image\nand video domains. Different from the image domain, learning video\nrepresentations is more challenging due to the temporal dimension, bringing in\nmotion and other environmental dynamics. This also provides opportunities for\nvideo-exclusive ideas that advance self-supervised learning in the video and\nmultimodal domain. In this survey, we provide a review of existing approaches\non self-supervised learning focusing on the video domain. We summarize these\nmethods into four different categories based on their learning objectives: 1)\npretext tasks, 2) generative learning, 3) contrastive learning, and 4)\ncross-modal agreement. We further introduce the commonly used datasets,\ndownstream evaluation tasks, insights into the limitations of existing works,\nand the potential future directions in this area.\n","authors":["Madeline C. Schiappa","Yogesh S. Rawat","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2207.00419v3.pdf","comment":"ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q"},{"id":"http://arxiv.org/abs/2307.04838v2","updated":"2023-07-19T15:59:03Z","published":"2023-07-10T18:15:03Z","title":"CREPE: Learnable Prompting With CLIP Improves Visual Relationship\n Prediction","summary":" In this paper, we explore the potential of Vision-Language Models (VLMs),\nspecifically CLIP, in predicting visual object relationships, which involves\ninterpreting visual features from images into language-based relations. Current\nstate-of-the-art methods use complex graphical models that utilize language\ncues and visual features to address this challenge. We hypothesize that the\nstrong language priors in CLIP embeddings can simplify these graphical models,\npaving the way for a simpler approach. We adopt the UVTransE relation prediction\nframework, which learns the relation as a translational embedding with subject,\nobject, and union box embeddings from a scene. We systematically explore the\ndesign of CLIP-based subject, object, and union-box representations within the\nUVTransE framework and propose CREPE (CLIP Representation Enhanced Predicate\nEstimation). CREPE utilizes text-based representations for all three bounding\nboxes and introduces a novel contrastive training strategy to automatically\ninfer the text prompt for union-box. Our approach achieves state-of-the-art\nperformance in predicate estimation, mR@5 27.79, and mR@20 31.95 on the Visual\nGenome benchmark, achieving a 15.3\\% gain in performance over recent\nstate-of-the-art at mR@20. This work demonstrates CLIP's effectiveness in\nobject relation prediction and encourages further research on VLMs in this\nchallenging domain.\n","authors":["Rakshith Subramanyam","T. S. Jayram","Rushil Anirudh","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2307.04838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07894v3","updated":"2023-07-19T15:57:12Z","published":"2023-06-13T16:39:39Z","title":"iSLAM: Imperative SLAM","summary":" Simultaneous localization and mapping (SLAM) stands as one of the critical\nchallenges in robot navigation.
Recent advancements suggest that methods based\non supervised learning deliver impressive performance in front-end odometry,\nwhile traditional optimization-based methods still play a vital role in the\nback-end for minimizing estimation drift. In this paper, we found that such\ndecoupled paradigm can lead to only sub-optimal performance, consequently\ncurtailing system capabilities and generalization potential. To solve this\nproblem, we proposed a novel self-supervised learning framework, imperative\nSLAM (iSLAM), which fosters reciprocal correction between the front-end and\nback-end, thus enhancing performance without necessitating any external\nsupervision. Specifically, we formulate a SLAM system as a bi-level\noptimization problem so that the two components are bidirectionally connected.\nAs a result, the front-end model is able to learn global geometric knowledge\nobtained through pose graph optimization by back-propagating the residuals from\nthe back-end. This significantly improves the generalization ability of the\nentire system and thus achieves the accuracy improvement up to 45%. To the best\nof our knowledge, iSLAM is the first SLAM system showing that the front-end and\nback-end can learn jointly and mutually contribute to each other in a\nself-supervised manner.\n","authors":["Taimeng Fu","Shaoshu Su","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2306.07894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10062v1","updated":"2023-07-19T15:33:11Z","published":"2023-07-19T15:33:11Z","title":"Unsupervised Accuracy Estimation of Deep Visual Models using\n Domain-Adaptive Adversarial Perturbation without Source Samples","summary":" Deploying deep visual models can lead to performance drops due to the\ndiscrepancies between source and target distributions. Several approaches\nleverage labeled source data to estimate target domain accuracy, but accessing\nlabeled source data is often prohibitively difficult due to data\nconfidentiality or resource limitations on serving devices. Our work proposes a\nnew framework to estimate model accuracy on unlabeled target data without\naccess to source data. We investigate the feasibility of using pseudo-labels\nfor accuracy estimation and evolve this idea into adopting recent advances in\nsource-free domain adaptation algorithms. Our approach measures the\ndisagreement rate between the source hypothesis and the target pseudo-labeling\nfunction, adapted from the source hypothesis. We mitigate the impact of\nerroneous pseudo-labels that may arise due to a high ideal joint hypothesis\nrisk by employing adaptive adversarial perturbation on the input of the target\nmodel. Our proposed source-free framework effectively addresses the challenging\ndistribution shift scenarios and outperforms existing methods requiring source\ndata and labels for training.\n","authors":["JoonHo Lee","Jae Oh Woo","Hankyu Moon","Kwonho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10062v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10046v1","updated":"2023-07-19T15:22:06Z","published":"2023-07-19T15:22:06Z","title":"Divert More Attention to Vision-Language Object Tracking","summary":" Multimodal vision-language (VL) learning has noticeably pushed the tendency\ntoward generic intelligence owing to emerging large foundation models. However,\ntracking, as a fundamental vision problem, surprisingly enjoys less bonus from\nrecent flourishing VL learning. 
We argue that the reasons are two-fold: the\nlack of large-scale vision-language annotated videos and ineffective\nvision-language interaction learning of current works. These nuisances motivate\nus to design more effective vision-language representation for tracking,\nmeanwhile constructing a large database with language annotation for model\nlearning. Particularly, in this paper, we first propose a general attribute\nannotation strategy to decorate videos in six popular tracking benchmarks,\nwhich contributes a large-scale vision-language tracking database with more\nthan 23,000 videos. We then introduce a novel framework to improve tracking by\nlearning a unified-adaptive VL representation, where the cores are the proposed\nasymmetric architecture search and modality mixer (ModaMixer). To further\nimprove VL representation, we introduce a contrastive loss to align different\nmodalities. To thoroughly evidence the effectiveness of our method, we\nintegrate the proposed framework on three tracking methods with different\ndesigns, i.e., the CNN-based SiamCAR, the Transformer-based OSTrack, and the\nhybrid structure TransT. The experiments demonstrate that our framework can\nsignificantly improve all baselines on six benchmarks. Besides empirical\nresults, we theoretically analyze our approach to show its rationality. By\nrevealing the potential of VL representation, we expect the community to divert\nmore attention to VL tracking and hope to open more possibilities for future\ntracking with diversified multimodal messages.\n","authors":["Mingzhe Guo","Zhipeng Zhang","Liping Jing","Haibin Ling","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2307.10046v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10036v1","updated":"2023-07-19T15:19:02Z","published":"2023-07-19T15:19:02Z","title":"Class Attention to Regions of Lesion for Imbalanced Medical Image\n Recognition","summary":" Automated medical image classification is the key component in intelligent\ndiagnosis systems. However, most medical image datasets contain plenty of\nsamples of common diseases and just a handful of rare ones, leading to major\nclass imbalances. Currently, it is an open problem in intelligent diagnosis to\neffectively learn from imbalanced training data. In this paper, we propose a\nsimple yet effective framework, named \\textbf{C}lass \\textbf{A}ttention to\n\\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by\nembedding attention into the training process of \\textbf{C}onvolutional\n\\textbf{N}eural \\textbf{N}etworks (CNNs). The proposed attention module helps\nCNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn\ntheir characteristics more effectively. In addition, this attention module\nworks only during the training phase and does not change the architecture of\nthe original network, so it can be directly combined with any existing CNN\narchitecture. The CARE framework needs bounding boxes to represent the lesion\nregions of rare diseases. To alleviate the need for manual annotation, we\nfurther developed variants of CARE by leveraging the traditional saliency\nmethods or a pretrained segmentation model for bounding box generation. Results\nshow that the CARE variants with automated bounding box generation are\ncomparable to the original CARE framework with \\textit{manual} bounding box\nannotations. 
A series of experiments on an imbalanced skin image dataset and a\npneumonia dataset indicates that our method can effectively help the network\nfocus on the lesion regions of rare diseases and remarkably improves the\nclassification performance of rare diseases.\n","authors":["Jia-Xin Zhuang","Jiabin Cai","Jianguo Zhang","Wei-shi Zheng","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10036v1.pdf","comment":"Accepted by Neurocomputing on July 2023. 37 pages"},{"id":"http://arxiv.org/abs/2307.06385v2","updated":"2023-07-19T14:51:37Z","published":"2023-07-12T18:13:58Z","title":"Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event\n Localization","summary":" Audio-Visual Event Localization (AVEL) is the task of temporally localizing\nand classifying \\emph{audio-visual events}, i.e., events simultaneously visible\nand audible in a video. In this paper, we solve AVEL in a weakly-supervised\nsetting, where only video-level event labels (their presence/absence, but not\ntheir locations in time) are available as supervision for training. Our idea is\nto use a base model to estimate labels on the training data at a finer temporal\nresolution than at the video level and re-train the model with these labels.\nI.e., we determine the subset of labels for each \\emph{slice} of frames in a\ntraining video by (i) replacing the frames outside the slice with those from a\nsecond video having no overlap in video-level labels, and (ii) feeding this\nsynthetic video into the base model to extract labels for just the slice in\nquestion. To handle the out-of-distribution nature of our synthetic videos, we\npropose an auxiliary objective for the base model that induces more reliable\npredictions of the localized event labels as desired. Our three-stage pipeline\noutperforms several existing AVEL methods with no architectural changes and\nimproves performance on a related weakly-supervised task as well.\n","authors":["Kalyan Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2307.06385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10011v1","updated":"2023-07-19T14:49:14Z","published":"2023-07-19T14:49:14Z","title":"Towards Fair Face Verification: An In-depth Analysis of Demographic\n Biases","summary":" Deep learning-based person identification and verification systems have\nremarkably improved in terms of accuracy in recent years; however, such\nsystems, including widely popular cloud-based solutions, have been found to\nexhibit significant biases related to race, age, and gender, a problem that\nrequires in-depth exploration and solutions. This paper presents an in-depth\nanalysis, with a particular emphasis on the intersectionality of these\ndemographic factors. Intersectional bias refers to the performance\ndiscrepancies w.r.t. the different combinations of race, age, and gender\ngroups, an area relatively unexplored in current literature. Furthermore, the\nreliance of most state-of-the-art approaches on accuracy as the principal\nevaluation metric often masks significant demographic disparities in\nperformance. To counter this crucial limitation, we incorporate five additional\nmetrics in our quantitative analysis, including disparate impact and\nmistreatment metrics, which are typically ignored by the relevant\nfairness-aware approaches. Results on the Racial Faces in-the-Wild (RFW)\nbenchmark indicate pervasive biases in face recognition systems, extending\nbeyond race, with different demographic factors yielding significantly\ndisparate outcomes. 
In particular, Africans demonstrate an 11.25% lower True\nPositive Rate (TPR) compared to Caucasians, while only a 3.51% accuracy drop is\nobserved. Even more concerning, the intersections of multiple protected groups,\nsuch as African females over 60 years old, demonstrate a +39.89% disparate\nmistreatment rate compared to the highest Caucasians rate. By shedding light on\nthese biases and their implications, this paper aims to stimulate further\nresearch towards developing fairer, more equitable face recognition and\nverification systems.\n","authors":["Ioannis Sarridis","Christos Koutlis","Symeon Papadopoulos","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2307.10011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10008v1","updated":"2023-07-19T14:45:11Z","published":"2023-07-19T14:45:11Z","title":"MODA: Mapping-Once Audio-driven Portrait Animation with Dual Attentions","summary":" Audio-driven portrait animation aims to synthesize portrait videos that are\nconditioned by given audio. Animating high-fidelity and multimodal video\nportraits has a variety of applications. Previous methods have attempted to\ncapture different motion modes and generate high-fidelity portrait videos by\ntraining different models or sampling signals from given videos. However,\nthe lack of correlation learning between lip-sync and other movements (e.g., head\npose/eye blinking) usually leads to unnatural results. In this paper, we\npropose a unified system for multi-person, diverse, and high-fidelity talking\nportrait generation. Our method contains three stages, i.e., 1) Mapping-Once\nnetwork with Dual Attentions (MODA) generates a talking representation from the given\naudio. In MODA, we design a dual-attention module to encode accurate mouth\nmovements and diverse modalities. 2) Facial composer network generates dense\nand detailed face landmarks, and 3) temporal-guided renderer synthesizes stable\nvideos. Extensive evaluations demonstrate that the proposed system produces\nmore natural and realistic video portraits compared to previous methods.\n","authors":["Yunfei Liu","Lijian Lin","Fei Yu","Changyin Zhou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2307.10008v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09456v2","updated":"2023-07-19T14:27:57Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SRGAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SRGAN\n(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN\nand EDSR, on a benchmark dataset of real-world images which undergo degradation\nusing a pipeline. Our results show that some models seem to significantly\nincrease the resolution of the input images while preserving their visual\nquality; this is assessed using the Tesseract OCR engine. We observe that the EDSR-BASE\nmodel from huggingface outperforms the remaining candidate models in terms of\nboth quantitative metrics and subjective visual quality assessments with the least\ncompute overhead. Specifically, EDSR generates images with higher peak\nsignal-to-noise ratio (PSNR) and structural similarity index (SSIM) values, which\nare seen to return high-quality OCR results with the Tesseract OCR engine.
These\nfindings suggest that EDSR is a robust and effective approach for single-image\nsuper-resolution and may be particularly well-suited for applications where\nhigh-quality visual fidelity is critical and optimized compute.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Norris","Aditya Dangi"],"pdf_url":"https://arxiv.org/pdf/2307.09456v2.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10001v1","updated":"2023-07-19T14:21:11Z","published":"2023-07-19T14:21:11Z","title":"As large as it gets: Learning infinitely large Filters via Neural\n Implicit Functions in the Fourier Domain","summary":" Motivated by the recent trend towards the usage of larger receptive fields\nfor more context-aware neural networks in vision applications, we aim to\ninvestigate how large these receptive fields really need to be. To facilitate\nsuch study, several challenges need to be addressed, most importantly: (i) We\nneed to provide an effective way for models to learn large filters (potentially\nas large as the input data) without increasing their memory consumption during\ntraining or inference, (ii) the study of filter sizes has to be decoupled from\nother effects such as the network width or number of learnable parameters, and\n(iii) the employed convolution operation should be a plug-and-play module that\ncan replace any conventional convolution in a Convolutional Neural Network\n(CNN) and allow for an efficient implementation in current frameworks. 
To\nfacilitate such models, we propose to learn not spatial but frequency\nrepresentations of filter weights as neural implicit functions, such that even\ninfinitely large filters can be parameterized by only a few learnable weights.\nThe resulting neural implicit frequency CNNs are the first models to achieve\nresults on par with the state-of-the-art on large image classification\nbenchmarks while executing convolutions solely in the frequency domain and can\nbe employed within any CNN architecture. They allow us to provide an extensive\nanalysis of the learned receptive fields. Interestingly, our analysis shows\nthat, although the proposed networks could learn very large convolution\nkernels, the learned filters practically translate into well-localized and\nrelatively small convolution kernels in the spatial domain.\n","authors":["Julia Grabinski","Janis Keuper","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.10001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08913v2","updated":"2023-07-19T14:18:00Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approaches. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v2.pdf","comment":"9 pages,3 figures"},{"id":"http://arxiv.org/abs/2307.09997v1","updated":"2023-07-19T14:10:55Z","published":"2023-07-19T14:10:55Z","title":"TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical\n Phase Recognition","summary":" To enable context-aware computer assistance in the operating room of the\nfuture, cognitive systems need to understand automatically which surgical phase\nis being performed by the medical team. The primary source of information for\nsurgical phase recognition is typically video, which presents two challenges:\nextracting meaningful features from the video stream and effectively modeling\ntemporal information in the sequence of visual features. For temporal modeling,\nattention mechanisms have gained popularity due to their ability to capture\nlong-range dependencies. 
In this paper, we explore design choices for attention\nin existing temporal models for surgical phase recognition and propose a novel\napproach that does not resort to local attention or regularization of attention\nweights: TUNeS is an efficient and simple temporal model that incorporates\nself-attention at the coarsest stage of a U-Net-like structure. In addition, we\npropose to train the feature extractor, a standard CNN, together with an LSTM\non preferably long video segments, i.e., with long temporal context. In our\nexperiments, all temporal models performed better on top of feature extractors\nthat were trained with longer temporal context. On top of these contextualized\nfeatures, TUNeS achieves state-of-the-art results on Cholec80.\n","authors":["Isabel Funke","Dominik Rivoir","Stefanie Krell","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2307.09997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09994v1","updated":"2023-07-19T13:58:01Z","published":"2023-07-19T13:58:01Z","title":"Impact of Disentanglement on Pruning Neural Networks","summary":" Deploying deep learning neural networks on edge devices, to accomplish task\nspecific objectives in the real-world, requires a reduction in their memory\nfootprint, power consumption, and latency. This can be realized via efficient\nmodel compression. Disentangled latent representations produced by variational\nautoencoder (VAE) networks are a promising approach for achieving model\ncompression because they mainly retain task-specific information, discarding\nuseless information for the task at hand. We make use of the Beta-VAE framework\ncombined with a standard criterion for pruning to investigate the impact of\nforcing the network to learn disentangled representations on the pruning\nprocess for the task of classification. In particular, we perform experiments\non MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose\na path forward for future works.\n","authors":["Carl Shneider","Peyman Rostami","Anis Kacem","Nilotpal Sinha","Abd El Rahman Shabayek","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2307.09994v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2307.08347v2","updated":"2023-07-19T13:55:32Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\\%. 
Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09988v1","updated":"2023-07-19T13:49:12Z","published":"2023-07-19T13:49:12Z","title":"TinyTrain: Deep Neural Network Training at the Extreme Edge","summary":" On-device training is essential for user personalisation and privacy. With\nthe pervasiveness of IoT devices and microcontroller units (MCU), this task\nbecomes more challenging due to the constrained memory and compute resources,\nand the limited availability of labelled user data. Nonetheless, prior works\nneglect the data scarcity issue, require excessively long training time (e.g. a\nfew hours), or induce substantial accuracy loss ($\\geq$10\\%). We propose\nTinyTrain, an on-device training approach that drastically reduces training\ntime by selectively updating parts of the model and explicitly coping with data\nscarcity. TinyTrain introduces a task-adaptive sparse-update method that\ndynamically selects the layer/channel based on a multi-objective criterion that\njointly captures user data, the memory, and the compute capabilities of the\ntarget device, leading to high accuracy on unseen tasks with reduced\ncomputation and memory footprint. TinyTrain outperforms vanilla fine-tuning of\nthe entire network by 3.6-5.0\\% in accuracy, while reducing the backward-pass\nmemory and computation cost by up to 2,286$\\times$ and 7.68$\\times$,\nrespectively. Targeting broadly used real-world edge devices, TinyTrain\nachieves 9.5$\\times$ faster and 3.5$\\times$ more energy-efficient training over\nstatus-quo approaches, and 2.8$\\times$ smaller memory footprint than SOTA\napproaches, while remaining within the 1 MB memory envelope of MCU-grade\nplatforms.\n","authors":["Young D. Kwon","Rui Li","Stylianos I. Venieris","Jagmohan Chauhan","Nicholas D. Lane","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2307.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09981v1","updated":"2023-07-19T13:40:45Z","published":"2023-07-19T13:40:45Z","title":"Lazy Visual Localization via Motion Averaging","summary":" Visual (re)localization is critical for various applications in computer\nvision and robotics. Its goal is to estimate the 6 degrees of freedom (DoF)\ncamera pose for each query image, based on a set of posed database images.\nCurrently, all leading solutions are structure-based that either explicitly\nconstruct 3D metric maps from the database with structure-from-motion, or\nimplicitly encode the 3D information with scene coordinate regression models.\nOn the contrary, visual localization without reconstructing the scene in 3D\noffers clear benefits. It makes deployment more convenient by reducing database\npre-processing time, releasing storage requirements, and remaining unaffected\nby imperfect reconstruction, etc. In this technical report, we demonstrate that\nit is possible to achieve high localization accuracy without reconstructing the\nscene from the database. The key to achieving this owes to a tailored motion\naveraging over database-query pairs. 
Experiments show that our visual\nlocalization proposal, LazyLoc, achieves comparable performance against\nstate-of-the-art structure-based methods. Furthermore, we showcase the\nversatility of LazyLoc, which can be easily extended to handle complex\nconfigurations such as multi-query co-localization and camera rigs.\n","authors":["Siyan Dong","Shaohui Liu","Hengkai Guo","Baoquan Chen","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2307.09981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09362v2","updated":"2023-07-19T13:21:30Z","published":"2023-07-18T15:46:21Z","title":"Disentangle then Parse:Night-time Semantic Segmentation with\n Illumination Disentanglement","summary":" Most prior semantic segmentation methods have been developed for day-time\nscenes, while typically underperforming in night-time scenes due to\ninsufficient and complicated lighting conditions. In this work, we tackle this\nchallenge by proposing a novel night-time semantic segmentation paradigm, i.e.,\ndisentangle then parse (DTP). DTP explicitly disentangles night-time images\ninto light-invariant reflectance and light-specific illumination components and\nthen recognizes semantics based on their adaptive fusion. Concretely, the\nproposed DTP comprises two key components: 1) Instead of processing\nlighting-entangled features as in prior works, our Semantic-Oriented\nDisentanglement (SOD) framework enables the extraction of reflectance component\nwithout being impeded by lighting, allowing the network to consistently\nrecognize the semantics under cover of varying and complicated lighting\nconditions. 2) Based on the observation that the illumination component can\nserve as a cue for some semantically confused regions, we further introduce an\nIllumination-Aware Parser (IAParser) to explicitly learn the correlation\nbetween semantics and lighting, and aggregate the illumination features to\nyield more precise predictions. Extensive experiments on the night-time\nsegmentation task with various settings demonstrate that DTP significantly\noutperforms state-of-the-art methods. Furthermore, with negligible additional\nparameters, DTP can be directly used to benefit existing day-time methods for\nnight-time segmentation.\n","authors":["Zhixiang Wei","Lin Chen","Tao Tu","Huaian Chen","Pengyang Ling","Yi Jin"],"pdf_url":"https://arxiv.org/pdf/2307.09362v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09946v2","updated":"2023-07-19T13:15:08Z","published":"2023-05-17T04:56:11Z","title":"AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for\n Survival Outcome Prediction from PET/CT Images","summary":" Survival prediction is a major concern for cancer management. Deep survival\nmodels based on deep learning have been widely adopted to perform end-to-end\nsurvival prediction from medical images. Recent deep survival models achieved\npromising performance by jointly performing tumor segmentation with survival\nprediction, where the models were guided to extract tumor-related information\nthrough Multi-Task Learning (MTL). However, these deep survival models have\ndifficulties in exploring out-of-tumor prognostic information. In addition,\nexisting deep survival models are unable to effectively leverage multi-modality\nimages. Empirically-designed fusion strategies were commonly adopted to fuse\nmulti-modality information via task-specific manually-designed networks, thus\nlimiting the adaptability to different scenarios. 
In this study, we propose an\nAdaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival\nprediction from PET/CT images. Instead of adopting MTL, we propose a novel\nSegmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained\nfor tumor segmentation and survival prediction sequentially in two stages. This\nstrategy enables the AdaMSS to focus on tumor regions in the first stage and\ngradually expand its focus to include other prognosis-related regions in the\nsecond stage. We also propose a data-driven strategy to fuse multi-modality\ninformation, which realizes adaptive optimization of fusion strategies based on\ntraining data during training. With the SSL and data-driven fusion strategies,\nour AdaMSS is designed as an adaptive model that can self-adapt its focus\nregions and fusion strategy for different training stages. Extensive\nexperiments with two large clinical datasets show that our AdaMSS outperforms\nstate-of-the-art survival prediction methods.\n","authors":["Mingyuan Meng","Bingxin Gu","Michael Fulham","Shaoli Song","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2305.09946v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2305.18060v2","updated":"2023-07-19T13:13:39Z","published":"2023-05-29T12:53:54Z","title":"Mining Negative Temporal Contexts For False Positive Suppression In\n Real-Time Ultrasound Lesion Detection","summary":" During ultrasonic scanning processes, real-time lesion detection can assist\nradiologists in accurate cancer diagnosis. However, this essential task remains\nchallenging and underexplored. General-purpose real-time object detection\nmodels can mistakenly report obvious false positives (FPs) when applied to\nultrasound videos, potentially misleading junior radiologists. One key issue is\ntheir failure to utilize negative symptoms in previous frames, denoted as\nnegative temporal contexts (NTC). To address this issue, we propose to extract\ncontexts from previous frames, including NTC, with the guidance of inverse\noptical flow. By aggregating extracted contexts, we endow the model with the\nability to suppress FPs by leveraging NTC. We call the resulting model\nUltraDet. The proposed UltraDet demonstrates significant improvement over\nprevious state-of-the-arts and achieves real-time inference speed. We release\nthe code, checkpoints, and high-quality labels of the CVA-BUS dataset in\nhttps://github.com/HaojunYu1998/UltraDet.\n","authors":["Haojun Yu","Youcheng Li","QuanLin Wu","Ziwei Zhao","Dengbo Chen","Dong Wang","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.18060v2.pdf","comment":"10 pages, 4 figures, MICCAI 2023 Early Accept"},{"id":"http://arxiv.org/abs/2001.05887v4","updated":"2023-07-19T12:58:18Z","published":"2020-01-16T15:24:26Z","title":"MixPath: A Unified Approach for One-shot Neural Architecture Search","summary":" Blending multiple convolutional kernels is proved advantageous in neural\narchitecture design. However, current two-stage neural architecture search\nmethods are mainly limited to single-path search spaces. How to efficiently\nsearch models of multi-path structures remains a difficult problem. In this\npaper, we are motivated to train a one-shot multi-path supernet to accurately\nevaluate the candidate architectures. Specifically, we discover that in the\nstudied search spaces, feature vectors summed from multiple paths are nearly\nmultiples of those from a single path. Such disparity perturbs the supernet\ntraining and its ranking ability. 
Therefore, we propose a novel mechanism\ncalled Shadow Batch Normalization (SBN) to regularize the disparate feature\nstatistics. Extensive experiments prove that SBNs are capable of stabilizing\nthe optimization and improving ranking performance. We call our unified\nmulti-path one-shot approach as MixPath, which generates a series of models\nthat achieve state-of-the-art results on ImageNet.\n","authors":["Xiangxiang Chu","Shun Lu","Xudong Li","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2001.05887v4.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09947v1","updated":"2023-07-19T12:41:54Z","published":"2023-07-19T12:41:54Z","title":"U-CE: Uncertainty-aware Cross-Entropy for Semantic Segmentation","summary":" Deep neural networks have shown exceptional performance in various tasks, but\ntheir lack of robustness, reliability, and tendency to be overconfident pose\nchallenges for their deployment in safety-critical applications like autonomous\ndriving. In this regard, quantifying the uncertainty inherent to a model's\nprediction is a promising endeavour to address these shortcomings. In this\nwork, we present a novel Uncertainty-aware Cross-Entropy loss (U-CE) that\nincorporates dynamic predictive uncertainties into the training process by\npixel-wise weighting of the well-known cross-entropy loss (CE). Through\nextensive experimentation, we demonstrate the superiority of U-CE over regular\nCE training on two benchmark datasets, Cityscapes and ACDC, using two common\nbackbone architectures, ResNet-18 and ResNet-101. With U-CE, we manage to train\nmodels that not only improve their segmentation performance but also provide\nmeaningful uncertainties after training. Consequently, we contribute to the\ndevelopment of more robust and reliable segmentation models, ultimately\nadvancing the state-of-the-art in safety-critical applications and beyond.\n","authors":["Steven Landgraf","Markus Hillemann","Kira Wursthorn","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2307.09947v1.pdf","comment":"10 pages, 3 figures, 7 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2307.09944v1","updated":"2023-07-19T12:39:40Z","published":"2023-07-19T12:39:40Z","title":"ProtoCaps: A Fast and Non-Iterative Capsule Network Routing Method","summary":" Capsule Networks have emerged as a powerful class of deep learning\narchitectures, known for robust performance with relatively few parameters\ncompared to Convolutional Neural Networks (CNNs). However, their inherent\nefficiency is often overshadowed by their slow, iterative routing mechanisms\nwhich establish connections between Capsule layers, posing computational\nchallenges resulting in an inability to scale. In this paper, we introduce a\nnovel, non-iterative routing mechanism, inspired by trainable prototype\nclustering. This innovative approach aims to mitigate computational complexity,\nwhile retaining, if not enhancing, performance efficacy. Furthermore, we\nharness a shared Capsule subspace, negating the need to project each\nlower-level Capsule to each higher-level Capsule, thereby significantly\nreducing memory requisites during training. Our approach demonstrates superior\nresults compared to the current best non-iterative Capsule Network and tests on\nthe Imagewoof dataset, which is too computationally demanding to handle\nefficiently by iterative approaches. 
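The Uncertainty-aware Cross-Entropy (U-CE) entry above weights the cross-entropy loss pixel-wise by predictive uncertainty. The paper's exact uncertainty estimate and weighting scheme are not given here, so the sketch below uses softmax entropy as an assumed uncertainty proxy; treat it as a rough illustration of pixel-wise weighting, not the U-CE implementation.

```python
# Illustrative sketch only: pixel-wise weighting of cross-entropy by a
# predictive-uncertainty proxy (softmax entropy). U-CE's actual uncertainty
# estimate and weighting may differ.
import math
import torch
import torch.nn.functional as F

def uncertainty_weighted_ce(logits, target):
    # logits: (B, C, H, W); target: (B, H, W) with class indices
    probs = F.softmax(logits, dim=1)
    entropy = -(probs * torch.log(probs.clamp_min(1e-8))).sum(dim=1)   # (B, H, W)
    weight = 1.0 + entropy / math.log(logits.shape[1])                  # normalised, >= 1
    ce = F.cross_entropy(logits, target, reduction="none")              # (B, H, W)
    return (weight.detach() * ce).mean()
```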
Our findings underscore the potential of\nour proposed methodology in enhancing the operational efficiency and\nperformance of Capsule Networks, paving the way for their application in\nincreasingly complex computational scenarios.\n","authors":["Miles Everett","Mingjun Zhong","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2307.09944v1.pdf","comment":"8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.09936v1","updated":"2023-07-19T12:21:39Z","published":"2023-07-19T12:21:39Z","title":"AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point\n Clouds of Deformable Objects","summary":" This paper focuses on motion prediction for point cloud sequences in the\nchallenging case of deformable 3D objects, such as human body motion. First, we\ninvestigate the challenges caused by deformable shapes and complex motions\npresent in this type of representation, with the ultimate goal of understanding\nthe technical limitations of state-of-the-art models. From this understanding,\nwe propose an improved architecture for point cloud prediction of deformable 3D\nobjects. Specifically, to handle deformable shapes, we propose a graph-based\napproach that learns and exploits the spatial structure of point clouds to\nextract more representative features. Then we propose a module able to combine\nthe learned features in an adaptative manner according to the point cloud\nmovements. The proposed adaptative module controls the composition of local and\nglobal motions for each point, enabling the network to model complex motions in\ndeformable 3D objects more effectively. We tested the proposed method on the\nfollowing datasets: MNIST moving digits, the Mixamo human bodies motions, JPEG\nand CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate that\nour method outperforms the current baseline methods given its improved ability\nto model complex movements as well as preserve point cloud shape. Furthermore,\nwe demonstrate the generalizability of the proposed framework for dynamic\nfeature learning, by testing the framework for action recognition on the\nMSRAction3D dataset and achieving results on-par with state-of-the-art methods\n","authors":["Pedro Gomes","Silvia Rossi","Laura Toni"],"pdf_url":"https://arxiv.org/pdf/2307.09936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09933v1","updated":"2023-07-19T12:15:06Z","published":"2023-07-19T12:15:06Z","title":"Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to\n Harness Spurious Features","summary":" To avoid failures on out-of-distribution data, recent works have sought to\nextract features that have a stable or invariant relationship with the label\nacross domains, discarding the \"spurious\" or unstable features whose\nrelationship with the label changes across domains. However, unstable features\noften carry complementary information about the label that could boost\nperformance if used correctly in the test domain. Our main contribution is to\nshow that it is possible to learn how to use these unstable features in the\ntest domain without labels. In particular, we prove that pseudo-labels based on\nstable features provide sufficient guidance for doing so, provided that stable\nand unstable features are conditionally independent given the label. 
Based on\nthis theoretical insight, we propose Stable Feature Boosting (SFB), an\nalgorithm for: (i) learning a predictor that separates stable and\nconditionally-independent unstable features; and (ii) using the stable-feature\npredictions to adapt the unstable-feature predictions in the test domain.\nTheoretically, we prove that SFB can learn an asymptotically-optimal predictor\nwithout test-domain labels. Empirically, we demonstrate the effectiveness of\nSFB on real and synthetic data.\n","authors":["Cian Eastwood","Shashank Singh","Andrei Liviu Nicolicioiu","Marin Vlastelica","Julius von Kügelgen","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2307.09933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09931v1","updated":"2023-07-19T12:12:17Z","published":"2023-07-19T12:12:17Z","title":"DISA: DIfferentiable Similarity Approximation for Universal Multimodal\n Registration","summary":" Multimodal image registration is a challenging but essential step for\nnumerous image-guided procedures. Most registration algorithms rely on the\ncomputation of complex, frequently non-differentiable similarity metrics to\ndeal with the appearance discrepancy of anatomical structures between imaging\nmodalities. Recent Machine Learning based approaches are limited to specific\nanatomy-modality combinations and do not generalize to new settings. We propose\na generic framework for creating expressive cross-modal descriptors that enable\nfast deformable global registration. We achieve this by approximating existing\nmetrics with a dot-product in the feature space of a small convolutional neural\nnetwork (CNN) which is inherently differentiable can be trained without\nregistered data. Our method is several orders of magnitude faster than local\npatch-based metrics and can be directly applied in clinical settings by\nreplacing the similarity measure with the proposed one. Experiments on three\ndifferent datasets demonstrate that our approach generalizes well beyond the\ntraining data, yielding a broad capture range even on unseen anatomies and\nmodality pairs, without the need for specialized retraining. We make our\ntraining code and data publicly available.\n","authors":["Matteo Ronchetti","Wolfgang Wein","Nassir Navab","Oliver Zettinig","Raphael Prevost"],"pdf_url":"https://arxiv.org/pdf/2307.09931v1.pdf","comment":"This preprint was submitted to MICCAI 2023. The Version of Record of\n this contribution will be published in Springer LNCS"},{"id":"http://arxiv.org/abs/2307.09929v1","updated":"2023-07-19T12:11:15Z","published":"2023-07-19T12:11:15Z","title":"Measuring and Modeling Uncertainty Degree for Monocular Depth Estimation","summary":" Effectively measuring and modeling the reliability of a trained model is\nessential to the real-world deployment of monocular depth estimation (MDE)\nmodels. However, the intrinsic ill-posedness and ordinal-sensitive nature of\nMDE pose major challenges to the estimation of uncertainty degree of the\ntrained models. On the one hand, utilizing current uncertainty modeling methods\nmay increase memory consumption and are usually time-consuming. On the other\nhand, measuring the uncertainty based on model accuracy can also be\nproblematic, where uncertainty reliability and prediction accuracy are not well\ndecoupled. 
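The DISA entry above approximates existing similarity metrics with a dot product in the feature space of a small CNN. The sketch below shows only that core idea under assumptions: the encoder architecture and the training objective are placeholders, not the published DISA design.

```python
# Sketch under assumptions: a small shared CNN maps both images to a feature
# space where similarity is a plain, differentiable dot product. Architecture
# and training loss are illustrative placeholders.
import torch
import torch.nn as nn

class SmallEncoder(nn.Module):
    def __init__(self, in_ch=1, feat_ch=16):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, feat_ch, 3, padding=1),
        )

    def forward(self, x):
        return self.net(x)

def dotprod_similarity(enc: SmallEncoder, img_a: torch.Tensor, img_b: torch.Tensor):
    fa, fb = enc(img_a), enc(img_b)
    return (fa * fb).sum(dim=1)  # per-pixel similarity map, usable inside registration
```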
In this paper, we propose to model the uncertainty of MDE models\nfrom the perspective of the inherent probability distributions originating from\nthe depth probability volume and its extensions, and to assess it more fairly\nwith more comprehensive metrics. By simply introducing additional training\nregularization terms, our model, with surprisingly simple formations and\nwithout requiring extra modules or multiple inferences, can provide uncertainty\nestimations with state-of-the-art reliability, and can be further improved when\ncombined with ensemble or sampling methods. A series of experiments demonstrate\nthe effectiveness of our methods.\n","authors":["Mochu Xiang","Jing Zhang","Nick Barnes","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2307.09929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04639v2","updated":"2023-07-19T12:08:51Z","published":"2023-07-10T15:35:31Z","title":"Multimodal brain age estimation using interpretable adaptive\n population-graph learning","summary":" Brain age estimation is clinically important as it can provide valuable\ninformation in the context of neurodegenerative diseases such as Alzheimer's.\nPopulation graphs, which include multimodal imaging information of the subjects\nalong with the relationships among the population, have been used in literature\nalong with Graph Convolutional Networks (GCNs) and have proved beneficial for a\nvariety of medical imaging tasks. A population graph is usually static and\nconstructed manually using non-imaging information. However, graph construction\nis not a trivial task and might significantly affect the performance of the\nGCN, which is inherently very sensitive to the graph structure. In this work,\nwe propose a framework that learns a population graph structure optimized for\nthe downstream task. An attention mechanism assigns weights to a set of imaging\nand non-imaging features (phenotypes), which are then used for edge extraction.\nThe resulting graph is used to train the GCN. The entire pipeline can be\ntrained end-to-end. Additionally, by visualizing the attention weights that\nwere the most important for the graph construction, we increase the\ninterpretability of the graph. We use the UK Biobank, which provides a large\nvariety of neuroimaging and non-imaging phenotypes, to evaluate our method on\nbrain age regression and classification. The proposed method outperforms\ncompeting static graph approaches and other state-of-the-art adaptive methods.\nWe further show that the assigned attention scores indicate that there are both\nimaging and non-imaging phenotypes that are informative for brain age\nestimation and are in agreement with the relevant literature.\n","authors":["Kyriaki-Margarita Bintsi","Vasileios Baltatzis","Rolandos Alexandros Potamias","Alexander Hammers","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.04639v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.06635v2","updated":"2023-07-19T12:05:29Z","published":"2023-03-12T11:23:56Z","title":"Schema Inference for Interpretable Image Classification","summary":" In this paper, we study a novel inference paradigm, termed as schema\ninference, that learns to deductively infer the explainable predictions by\nrebuilding the prior deep neural network (DNN) forwarding scheme, guided by the\nprevalent philosophical cognitive concept of schema. 
We strive to reformulate\nthe conventional model inference pipeline into a graph matching policy that\nassociates the extracted visual concepts of an image with the pre-computed\nscene impression, by analogy with human reasoning mechanism via impression\nmatching. To this end, we devise an elaborated architecture, termed as\nSchemaNet, as a dedicated instantiation of the proposed schema inference\nconcept, that models both the visual semantics of input instances and the\nlearned abstract imaginations of target categories as topological relational\ngraphs. Meanwhile, to capture and leverage the compositional contributions of\nvisual semantics in a global view, we also introduce a universal Feat2Graph\nscheme in SchemaNet to establish the relational graphs that contain abundant\ninteraction information. Both the theoretical analysis and the experimental\nresults on several benchmarks demonstrate that the proposed schema inference\nachieves encouraging performance and meanwhile yields a clear picture of the\ndeductive process leading to the predictions. Our code is available at\nhttps://github.com/zhfeing/SchemaNet-PyTorch.\n","authors":["Haofei Zhang","Mengqi Xue","Xiaokang Liu","Kaixuan Chen","Jie Song","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2303.06635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07591v3","updated":"2023-07-19T12:04:59Z","published":"2023-06-13T07:35:28Z","title":"I See Dead People: Gray-Box Adversarial Attack on Image-To-Text Models","summary":" Modern image-to-text systems typically adopt the encoder-decoder framework,\nwhich comprises two main components: an image encoder, responsible for\nextracting image features, and a transformer-based decoder, used for generating\ncaptions. Taking inspiration from the analysis of neural networks' robustness\nagainst adversarial perturbations, we propose a novel gray-box algorithm for\ncreating adversarial examples in image-to-text models. Unlike image\nclassification tasks that have a finite set of class labels, finding visually\nsimilar adversarial examples in an image-to-text task poses greater challenges\nbecause the captioning system allows for a virtually infinite space of possible\ncaptions. In this paper, we present a gray-box adversarial attack on\nimage-to-text, both untargeted and targeted. We formulate the process of\ndiscovering adversarial perturbations as an optimization problem that uses only\nthe image-encoder component, meaning the proposed attack is language-model\nagnostic. Through experiments conducted on the ViT-GPT2 model, which is the\nmost-used image-to-text model in Hugging Face, and the Flickr30k dataset, we\ndemonstrate that our proposed attack successfully generates visually similar\nadversarial examples, both with untargeted and targeted captions. Notably, our\nattack operates in a gray-box manner, requiring no knowledge about the decoder\nmodule. We also show that our attacks fool the popular open-source platform\nHugging Face.\n","authors":["Raz Lapid","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2306.07591v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03544v2","updated":"2023-07-19T11:46:34Z","published":"2021-10-07T15:06:52Z","title":"RAR: Region-Aware Point Cloud Registration","summary":" This paper concerns the research problem of point cloud registration to find\nthe rigid transformation to optimally align the source point set with the\ntarget one. 
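The gray-box attack described above optimizes an adversarial perturbation using only the image encoder, making it language-model agnostic. A hedged, generic sketch of that idea follows: a PGD-style loop that pushes the encoder features of the perturbed image away from those of the clean image under an L-infinity budget. The step size, budget, and feature distance are illustrative choices, not the paper's settings.

```python
# Hypothetical sketch of an encoder-only (decoder-agnostic) attack: maximise
# the feature-space distance between clean and perturbed images under an
# L-infinity constraint. Hyperparameters are illustrative.
import torch

def encoder_only_attack(encoder, image, eps=8 / 255, alpha=1 / 255, steps=40):
    image = image.detach()
    x_adv = image.clone()
    with torch.no_grad():
        feat_clean = encoder(image)
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = (encoder(x_adv) - feat_clean).pow(2).mean()
        grad = torch.autograd.grad(loss, x_adv)[0]
        x_adv = (x_adv + alpha * grad.sign()).detach()
        x_adv = image + (x_adv - image).clamp(-eps, eps)   # project to budget
        x_adv = x_adv.clamp(0, 1)
    return x_adv
```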
Learning robust point cloud registration models with deep neural\nnetworks has emerged as a powerful paradigm, offering promising performance in\npredicting the global geometric transformation for a pair of point sets.\nExisting methods firstly leverage an encoder to regress a latent shape\nembedding, which is then decoded into a shape-conditioned transformation via\nconcatenation-based conditioning. However, different regions of a 3D shape vary\nin their geometric structures which makes it more sense that we have a\nregion-conditioned transformation instead of the shape-conditioned one. In this\npaper we present a \\underline{R}egion-\\underline{A}ware point cloud\n\\underline{R}egistration, denoted as RAR, to predict transformation for\npairwise point sets in the self-supervised learning fashion. More specifically,\nwe develop a novel region-aware decoder (RAD) module that is formed with an\nimplicit neural region representation parameterized by neural networks. The\nimplicit neural region representation is learned with a self-supervised 3D\nshape reconstruction loss without the need for region labels. Consequently, the\nregion-aware decoder (RAD) module guides the training of the region-aware\ntransformation (RAT) module and region-aware weight (RAW) module, which predict\nthe transforms and weights for different regions respectively. The global\ngeometric transformation from source point set to target one is then formed by\nthe weighted fusion of region-aware transforms. Compared to the\nstate-of-the-art approaches, our experiments show that our RAR achieves\nsuperior registration performance over various benchmark datasets (e.g.\nModelNet40).\n","authors":["Yu Hao","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2110.03544v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2006.06200"},{"id":"http://arxiv.org/abs/2307.09915v1","updated":"2023-07-19T11:35:21Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is confronted with both cross-lingual and\ncross-modal challenges for multimedia analysis. The crucial issue in this task\nis to model the global and local matching between the image and different\nlanguages. Existing cross-modal embedding methods based on Transformer\narchitecture oversight the local matching between the image region and\nmonolingual words, not to mention in the face of a variety of differentiated\nlanguages. Due to the heterogeneous property of the cross-modal and\ncross-lingual task, we utilize the heterogeneous network to establish\ncross-domain relationships and the local correspondences between the image and\ndifferent languages. In this paper, we propose an Embedded Heterogeneous\nAttention Transformer (EHAT) to build reasoning paths bridging cross-domain for\ncross-lingual image captioning and integrate into transformer. The proposed\nEHAT consists of a Masked Heterogeneous Cross-attention (MHCA), Heterogeneous\nAttention Reasoning Network (HARN) and Heterogeneous Co-attention (HCA). HARN\nas the core network, models and infers cross-domain relationship anchored by\nvision bounding box representation features to connect two languages word\nfeatures and learn the heterogeneous maps. MHCA and HCA implement cross-domain\nintegration in the encoder through the special heterogeneous attention and\nenable single model to generate two language captioning. 
We test on MSCOCO\ndataset to generate English and Chinese, which are most widely used and have\nobvious difference between their language families. Our experiments show that\nour method even achieve better than advanced monolingual methods.\n","authors":["Zijie Song","Zhenzhen Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2307.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03238v3","updated":"2023-07-19T11:20:12Z","published":"2023-05-05T01:40:00Z","title":"Reduction of Class Activation Uncertainty with Background Information","summary":" Multitask learning is a popular approach to training high-performing neural\nnetworks with improved generalization. In this paper, we propose a background\nclass to achieve improved generalization at a lower computation compared to\nmultitask learning to help researchers and organizations with limited\ncomputation power. We also present a methodology for selecting background\nimages and discuss potential future improvements. We apply our approach to\nseveral datasets and achieved improved generalization with much lower\ncomputation. We also investigate class activation mappings (CAMs) of the\ntrained model and observed the tendency towards looking at a bigger picture in\na few class classification problems with the proposed model training\nmethodology. Applying transformer with the proposed background class, we\nreceive state-of-the-art (SOTA) performance on STL-10, Caltech-101, and\nCINIC-10 datasets. Example scripts are available in the `CAM' folder of the\nfollowing GitHub Repository: github.com/dipuk0506/UQ\n","authors":["H M Dipu Kabir"],"pdf_url":"https://arxiv.org/pdf/2305.03238v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09906v1","updated":"2023-07-19T11:10:26Z","published":"2023-07-19T11:10:26Z","title":"Implicit Identity Representation Conditioned Memory Compensation Network\n for Talking Head video Generation","summary":" Talking head video generation aims to animate a human face in a still image\nwith dynamic poses and expressions using motion information derived from a\ntarget-driving video, while maintaining the person's identity in the source\nimage. However, dramatic and complex motions in the driving video cause\nambiguous generation, because the still source image cannot provide sufficient\nappearance information for occluded regions or delicate expression variations,\nwhich produces severe artifacts and significantly degrades the generation\nquality. To tackle this problem, we propose to learn a global facial\nrepresentation space, and design a novel implicit identity representation\nconditioned memory compensation network, coined as MCNet, for high-fidelity\ntalking head generation.~Specifically, we devise a network module to learn a\nunified spatial facial meta-memory bank from all training samples, which can\nprovide rich facial structure and appearance priors to compensate warped source\nfacial features for the generation. Furthermore, we propose an effective query\nmechanism based on implicit identity representations learned from the discrete\nkeypoints of the source image. It can greatly facilitate the retrieval of more\ncorrelated information from the memory bank for the compensation. Extensive\nexperiments demonstrate that MCNet can learn representative and complementary\nfacial memory, and can clearly outperform previous state-of-the-art talking\nhead generation methods on VoxCeleb1 and CelebV datasets. 
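The background-class entry above reserves an extra class label for background images during training. A minimal, assumed sketch of that setup is given below; the model, dataset handling, and how background images are chosen are all placeholders rather than the paper's methodology.

```python
# Illustrative sketch: train a classifier with one extra "background" class,
# mixing background images (labelled num_classes) into each batch.
import torch
import torch.nn as nn

num_classes = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, num_classes + 1))
criterion = nn.CrossEntropyLoss()

def training_step(fg_images, fg_labels, bg_images, optimizer):
    bg_labels = torch.full((bg_images.size(0),), num_classes, dtype=torch.long)
    x = torch.cat([fg_images, bg_images])
    y = torch.cat([fg_labels, bg_labels])
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
    return loss.item()
```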
Please check our\n\\href{https://github.com/harlanhong/ICCV2023-MCNET}{Project}.\n","authors":["Fa-Ting Hong","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09906v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09211v3","updated":"2023-07-19T10:52:30Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09893v1","updated":"2023-07-19T10:45:49Z","published":"2023-07-19T10:45:49Z","title":"Learning from Abstract Images: on the Importance of Occlusion in a\n Minimalist Encoding of Human Poses","summary":" Existing 2D-to-3D pose lifting networks suffer from poor performance in\ncross-dataset benchmarks. Although the use of 2D keypoints joined by\n\"stick-figure\" limbs has shown promise as an intermediate step, stick-figures\ndo not account for occlusion information that is often inherent in an image. In\nthis paper, we propose a novel representation using opaque 3D limbs that\npreserves occlusion information while implicitly encoding joint locations.\nCrucially, when training on data with accurate three-dimensional keypoints and\nwithout part-maps, this representation allows training on abstract synthetic\nimages, with occlusion, from as many synthetic viewpoints as desired. 
The\nresult is a pose defined by limb angles rather than joint positions\n$\\unicode{x2013}$ because poses are, in the real world, independent of cameras\n$\\unicode{x2013}$ allowing us to predict poses that are completely independent\nof camera viewpoint. The result provides not only an improvement in\nsame-dataset benchmarks, but a \"quantum leap\" in cross-dataset benchmarks.\n","authors":["Saad Manzur","Wayne Hayes"],"pdf_url":"https://arxiv.org/pdf/2307.09893v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.09892v1","updated":"2023-07-19T10:44:44Z","published":"2023-07-19T10:44:44Z","title":"3Deformer: A Common Framework for Image-Guided Mesh Deformation","summary":" We propose 3Deformer, a general-purpose framework for interactive 3D shape\nediting. Given a source 3D mesh with semantic materials, and a user-specified\nsemantic image, 3Deformer can accurately edit the source mesh following the\nshape guidance of the semantic image, while preserving the source topology as\nrigid as possible. Recent studies of 3D shape editing mostly focus on learning\nneural networks to predict 3D shapes, which requires high-cost 3D training\ndatasets and is limited to handling objects involved in the datasets. Unlike\nthese studies, our 3Deformer is a non-training and common framework, which only\nrequires supervision of readily-available semantic images, and is compatible\nwith editing various objects unlimited by datasets. In 3Deformer, the source\nmesh is deformed utilizing the differentiable renderer technique, according to\nthe correspondences between semantic images and mesh materials. However,\nguiding complex 3D shapes with a simple 2D image incurs extra challenges, that\nis, the deform accuracy, surface smoothness, geometric rigidity, and global\nsynchronization of the edited mesh should be guaranteed. To address these\nchallenges, we propose a hierarchical optimization architecture to balance the\nglobal and local shape features, and propose further various strategies and\nlosses to improve properties of accuracy, smoothness, rigidity, and so on.\nExtensive experiments show that our 3Deformer is able to produce impressive\nresults and reaches the state-of-the-art level.\n","authors":["Hao Su","Xuefeng Liu","Jianwei Niu","Ji Wan","Xinghao Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09886v1","updated":"2023-07-19T10:31:35Z","published":"2023-07-19T10:31:35Z","title":"A reinforcement learning approach for VQA validation: an application to\n diabetic macular edema grading","summary":" Recent advances in machine learning models have greatly increased the\nperformance of automated methods in medical image analysis. However, the\ninternal functioning of such models is largely hidden, which hinders their\nintegration in clinical practice. Explainability and trust are viewed as\nimportant aspects of modern methods, for the latter's widespread use in\nclinical communities. As such, validation of machine learning models represents\nan important aspect and yet, most methods are only validated in a limited way.\nIn this work, we focus on providing a richer and more appropriate validation\napproach for highly powerful Visual Question Answering (VQA) algorithms. To\nbetter understand the performance of these methods, which answer arbitrary\nquestions related to images, this work focuses on an automatic visual Turing\ntest (VTT). 
That is, we propose an automatic adaptive questioning method, that\naims to expose the reasoning behavior of a VQA algorithm. Specifically, we\nintroduce a reinforcement learning (RL) agent that observes the history of\npreviously asked questions, and uses it to select the next question to pose. We\ndemonstrate our approach in the context of evaluating algorithms that\nautomatically answer questions related to diabetic macular edema (DME) grading.\nThe experiments show that such an agent has similar behavior to a clinician,\nwhereby asking questions that are relevant to key clinical concepts.\n","authors":["Tatiana Fountoukidou","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2307.09886v1.pdf","comment":"16 pages (+ 23 pages supplementary material)"},{"id":"http://arxiv.org/abs/2307.09880v1","updated":"2023-07-19T10:23:28Z","published":"2023-07-19T10:23:28Z","title":"A3D: Adaptive, Accurate, and Autonomous Navigation for Edge-Assisted\n Drones","summary":" Accurate navigation is of paramount importance to ensure flight safety and\nefficiency for autonomous drones. Recent research starts to use Deep Neural\nNetworks to enhance drone navigation given their remarkable predictive\ncapability for visual perception. However, existing solutions either run DNN\ninference tasks on drones in situ, impeded by the limited onboard resource, or\noffload the computation to external servers which may incur large network\nlatency. Few works consider jointly optimizing the offloading decisions along\nwith image transmission configurations and adapting them on the fly. In this\npaper, we propose A3D, an edge server assisted drone navigation framework that\ncan dynamically adjust task execution location, input resolution, and image\ncompression ratio in order to achieve low inference latency, high prediction\naccuracy, and long flight distances. Specifically, we first augment\nstate-of-the-art convolutional neural networks for drone navigation and define\na novel metric called Quality of Navigation as our optimization objective which\ncan effectively capture the above goals. We then design a deep reinforcement\nlearning based neural scheduler at the drone side for which an information\nencoder is devised to reshape the state features and thus improve its learning\nability. To further support simultaneous multi-drone serving, we extend the\nedge server design by developing a network-aware resource allocation algorithm,\nwhich allows provisioning containerized resources aligned with drones' demand.\nWe finally implement a proof-of-concept prototype with realistic devices and\nvalidate its performance in a real-world campus scene, as well as a simulation\nenvironment for thorough evaluation upon AirSim. Extensive experimental results\nshow that A3D can reduce end-to-end latency by 28.06% and extend the flight\ndistance by up to 27.28% compared with non-adaptive solutions.\n","authors":["Liekang Zeng","Haowei Chen","Daipeng Feng","Xiaoxi Zhang","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09880v1.pdf","comment":"Accepted by IEEE/ACM Transactions on Networking"},{"id":"http://arxiv.org/abs/2304.06403v2","updated":"2023-07-19T10:12:58Z","published":"2023-04-13T11:10:16Z","title":"Leveraging triplet loss for unsupervised action segmentation","summary":" In this paper, we propose a novel fully unsupervised framework that learns\naction representations suitable for the action segmentation task from the\nsingle input video itself, without requiring any training data. 
Our method is a\ndeep metric learning approach rooted in a shallow network with a triplet loss\noperating on similarity distributions and a novel triplet selection strategy\nthat effectively models temporal and semantic priors to discover actions in the\nnew representational space. Under these circumstances, we successfully recover\ntemporal boundaries in the learned action representations with higher quality\ncompared with existing unsupervised approaches. The proposed method is\nevaluated on two widely used benchmark datasets for the action segmentation\ntask and it achieves competitive performance by applying a generic clustering\nalgorithm on the learned representations.\n","authors":["E. Bueno-Benito","B. Tura","M. Dimiccoli"],"pdf_url":"https://arxiv.org/pdf/2304.06403v2.pdf","comment":"Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) Workshops, 2023, pp. 4921-4929"},{"id":"http://arxiv.org/abs/2304.05417v2","updated":"2023-07-19T10:01:29Z","published":"2023-04-11T18:00:02Z","title":"The MONET dataset: Multimodal drone thermal dataset recorded in rural\n scenarios","summary":" We present MONET, a new multimodal dataset captured using a thermal camera\nmounted on a drone that flew over rural areas, and recorded human and vehicle\nactivities. We captured MONET to study the problem of object localisation and\nbehaviour understanding of targets undergoing large-scale variations and being\nrecorded from different and moving viewpoints. Target activities occur in two\ndifferent land sites, each with unique scene structures and cluttered\nbackgrounds. MONET consists of approximately 53K images featuring 162K manually\nannotated bounding boxes. Each image is timestamp-aligned with drone metadata\nthat includes information about attitudes, speed, altitude, and GPS\ncoordinates. MONET is different from previous thermal drone datasets because it\nfeatures multimodal data, including rural scenes captured with thermal cameras\ncontaining both person and vehicle targets, along with trajectory information\nand metadata. We assessed the difficulty of the dataset in terms of transfer\nlearning between the two sites and evaluated nine object detection algorithms\nto identify the open challenges associated with this type of data. Project\npage: https://github.com/fabiopoiesi/monet_dataset.\n","authors":["Luigi Riz","Andrea Caraffa","Matteo Bortolon","Mohamed Lamine Mekhalfi","Davide Boscaini","André Moura","José Antunes","André Dias","Hugo Silva","Andreas Leonidou","Christos Constantinides","Christos Keleshis","Dante Abate","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2304.05417v2.pdf","comment":"Published in Computer Vision and Pattern Recognition (CVPR) Workshops\n 2023 - 6th Multimodal Learning and Applications Workshop"},{"id":"http://arxiv.org/abs/2307.09861v1","updated":"2023-07-19T09:45:06Z","published":"2023-07-19T09:45:06Z","title":"BSDM: Background Suppression Diffusion Model for Hyperspectral Anomaly\n Detection","summary":" Hyperspectral anomaly detection (HAD) is widely used in Earth observation and\ndeep space exploration. A major challenge for HAD is the complex background of\nthe input hyperspectral images (HSIs), resulting in anomalies confused in the\nbackground. On the other hand, the lack of labeled samples for HSIs leads to\npoor generalization of existing HAD methods. This paper starts the first\nattempt to study a new and generalizable background learning problem without\nlabeled samples. 
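The unsupervised action-segmentation entry above trains a shallow embedding with a triplet loss and a tailored triplet-selection strategy built on temporal and semantic priors. That selection strategy is not reproduced here; the sketch below only illustrates the general recipe, with temporally nearby frames taken as positives and distant frames as negatives, and an assumed 2048-dimensional per-frame feature input.

```python
# Sketch only: a shallow embedding trained with a triplet loss, using a crude
# temporal prior (nearby frames = positives, distant frames = negatives). This
# is not the paper's triplet-selection strategy.
import torch
import torch.nn as nn

embed = nn.Sequential(nn.Linear(2048, 256), nn.ReLU(), nn.Linear(256, 64))
triplet = nn.TripletMarginLoss(margin=0.5)

def triplet_step(frame_feats: torch.Tensor, near: int = 5, far: int = 100):
    # frame_feats: (T, 2048) per-frame features of a single video (assumed dim)
    T = frame_feats.size(0)
    anchor_idx = torch.randint(0, T, (32,))
    pos_idx = (anchor_idx + torch.randint(1, near + 1, (32,))).clamp(max=T - 1)
    neg_idx = (anchor_idx + far) % T
    a, p, n = (embed(frame_feats[i]) for i in (anchor_idx, pos_idx, neg_idx))
    return triplet(a, p, n)
```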
We present a novel solution BSDM (background suppression\ndiffusion model) for HAD, which can simultaneously learn latent background\ndistributions and generalize to different datasets for suppressing complex\nbackground. It is featured in three aspects: (1) For the complex background of\nHSIs, we design pseudo background noise and learn the potential background\ndistribution in it with a diffusion model (DM). (2) For the generalizability\nproblem, we apply a statistical offset module so that the BSDM adapts to\ndatasets of different domains without labeling samples. (3) For achieving\nbackground suppression, we innovatively improve the inference process of DM by\nfeeding the original HSIs into the denoising network, which removes the\nbackground as noise. Our work paves a new background suppression way for HAD\nthat can improve HAD performance without the prerequisite of manually labeled\ndata. Assessments and generalization experiments of four HAD methods on several\nreal HSI datasets demonstrate the above three unique properties of the proposed\nmethod. The code is available at https://github.com/majitao-xd/BSDM-HAD.\n","authors":["Jitao Ma","Weiying Xie","Yunsong Li","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09857v1","updated":"2023-07-19T09:36:08Z","published":"2023-07-19T09:36:08Z","title":"Blind Image Quality Assessment Using Multi-Stream Architecture with\n Spatial and Channel Attention","summary":" BIQA (Blind Image Quality Assessment) is an important field of study that\nevaluates images automatically. Although significant progress has been made,\nblind image quality assessment remains a difficult task since images vary in\ncontent and distortions. Most algorithms generate quality without emphasizing\nthe important region of interest. In order to solve this, a multi-stream\nspatial and channel attention-based algorithm is being proposed. This algorithm\ngenerates more accurate predictions with a high correlation to human perceptual\nassessment by combining hybrid features from two different backbones, followed\nby spatial and channel attention to provide high weights to the region of\ninterest. Four legacy image quality assessment datasets are used to validate\nthe effectiveness of our proposed approach. Authentic and synthetic distortion\nimage databases are used to demonstrate the effectiveness of the proposed\nmethod, and we show that it has excellent generalization properties with a\nparticular focus on the perceptual foreground information.\n","authors":["Hassan Khalid","Nisar Ahmed"],"pdf_url":"https://arxiv.org/pdf/2307.09857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02203v3","updated":"2023-07-19T09:34:22Z","published":"2023-07-05T10:54:50Z","title":"Neural Fields for Interactive Visualization of Statistical Dependencies\n in 3D Simulation Ensembles","summary":" We present the first neural network that has learned to compactly represent\nand can efficiently reconstruct the statistical dependencies between the values\nof physical variables at different spatial locations in large 3D simulation\nensembles. Going beyond linear dependencies, we consider mutual information as\na measure of non-linear dependence. We demonstrate learning and reconstruction\nwith a large weather forecast ensemble comprising 1000 members, each storing\nmultiple physical variables at a 250 x 352 x 20 simulation grid. 
By\ncircumventing compute-intensive statistical estimators at runtime, we\ndemonstrate significantly reduced memory and computation requirements for\nreconstructing the major dependence structures. This enables embedding the\nestimator into a GPU-accelerated direct volume renderer and interactively\nvisualizing all mutual dependencies for a selected domain point.\n","authors":["Fatemeh Farokhmanesh","Kevin Höhlein","Christoph Neuhauser","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2307.02203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09856v1","updated":"2023-07-19T09:30:00Z","published":"2023-07-19T09:30:00Z","title":"Hierarchical Spatio-Temporal Representation Learning for Gait\n Recognition","summary":" Gait recognition is a biometric technique that identifies individuals by\ntheir unique walking styles, which is suitable for unconstrained environments\nand has a wide range of applications. While current methods focus on exploiting\nbody part-based representations, they often neglect the hierarchical\ndependencies between local motion patterns. In this paper, we propose a\nhierarchical spatio-temporal representation learning (HSTL) framework for\nextracting gait features from coarse to fine. Our framework starts with a\nhierarchical clustering analysis to recover multi-level body structures from\nthe whole body to local details. Next, an adaptive region-based motion\nextractor (ARME) is designed to learn region-independent motion features. The\nproposed HSTL then stacks multiple ARMEs in a top-down manner, with each ARME\ncorresponding to a specific partition level of the hierarchy. An adaptive\nspatio-temporal pooling (ASTP) module is used to capture gait features at\ndifferent levels of detail to perform hierarchical feature mapping. Finally, a\nframe-level temporal aggregation (FTA) module is employed to reduce redundant\ninformation in gait sequences through multi-scale temporal downsampling.\nExtensive experiments on CASIA-B, OUMVLP, GREW, and Gait3D datasets demonstrate\nthat our method outperforms the state-of-the-art while maintaining a reasonable\nbalance between model accuracy and complexity.\n","authors":["Lei Wang","Bo Liu","Fangfang Liang","Bincheng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09856v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.00574v2","updated":"2023-07-19T09:27:14Z","published":"2023-07-02T13:57:45Z","title":"Bidirectional Temporal Diffusion Model for Temporally Consistent Human\n Animation","summary":" We introduce a method to generate temporally coherent human animation from a\nsingle image, a video, or a random noise. This problem has been formulated as\nmodeling of an auto-regressive generation, i.e., to regress past frames to\ndecode future frames. However, such unidirectional generation is highly prone\nto motion drifting over time, generating unrealistic human animation with\nsignificant artifacts such as appearance distortion. We claim that\nbidirectional temporal modeling enforces temporal coherence on a generative\nnetwork by largely suppressing the motion ambiguity of human appearance. To\nprove our claim, we design a novel human animation framework using a denoising\ndiffusion model: a neural network learns to generate the image of a person by\ndenoising temporal Gaussian noises whose intermediate results are\ncross-conditioned bidirectionally between consecutive frames. 
In the\nexperiments, our method demonstrates strong performance compared to existing\nunidirectional approaches with realistic temporal coherence\n","authors":["Tserendorj Adiya","Sanghun Kim","Jung Eun Lee","Jae Shin Yoon","Hwasup Lim"],"pdf_url":"https://arxiv.org/pdf/2307.00574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v2","updated":"2023-07-19T09:23:43Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v2.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 12 figures, 13 tables"},{"id":"http://arxiv.org/abs/2208.10741v3","updated":"2023-07-19T09:15:05Z","published":"2022-08-23T05:27:32Z","title":"Hierarchically Decomposed Graph Convolutional Networks for\n Skeleton-Based Action Recognition","summary":" Graph convolutional networks (GCNs) are the most commonly used methods for\nskeleton-based action recognition and have achieved remarkable performance.\nGenerating adjacency matrices with semantically meaningful edges is\nparticularly important for this task, but extracting such edges is challenging\nproblem. To solve this, we propose a hierarchically decomposed graph\nconvolutional network (HD-GCN) architecture with a novel hierarchically\ndecomposed graph (HD-Graph). 
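The transferability study above recommends building surrogates that jointly optimize model smoothness and gradient similarity, e.g. by combining input gradient regularization with sharpness-aware minimization. Only the input-gradient term is sketched below, as an assumed illustration: the penalty weight is a placeholder and the SAM component is omitted.

```python
# Illustrative sketch of input gradient regularisation for surrogate training:
# penalise the norm of the loss gradient w.r.t. the input. `lam` is a
# placeholder weight; sharpness-aware minimisation is not shown.
import torch
import torch.nn.functional as F

def igr_loss(model, x, y, lam=0.1):
    x = x.clone().requires_grad_(True)
    ce = F.cross_entropy(model(x), y)
    grad_x = torch.autograd.grad(ce, x, create_graph=True)[0]
    return ce + lam * grad_x.pow(2).sum(dim=(1, 2, 3)).mean()
```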
The proposed HD-GCN effectively decomposes every\njoint node into several sets to extract major structurally adjacent and distant\nedges, and uses them to construct an HD-Graph containing those edges in the\nsame semantic spaces of a human skeleton. In addition, we introduce an\nattention-guided hierarchy aggregation (A-HA) module to highlight the dominant\nhierarchical edge sets of the HD-Graph. Furthermore, we apply a new six-way\nensemble method, which uses only joint and bone stream without any motion\nstream. The proposed model is evaluated and achieves state-of-the-art\nperformance on four large, popular datasets. Finally, we demonstrate the\neffectiveness of our model with various comparative experiments.\n","authors":["Jungho Lee","Minhyeok Lee","Dogyoon Lee","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2208.10741v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.06689v2","updated":"2023-07-19T09:09:42Z","published":"2023-07-13T11:21:58Z","title":"YOLIC: An Efficient Method for Object Localization and Classification on\n Edge Devices","summary":" In the realm of Tiny AI, we introduce \"You Only Look at Interested Cells\"\n(YOLIC), an efficient method for object localization and classification on edge\ndevices. Seamlessly blending the strengths of semantic segmentation and object\ndetection, YOLIC offers superior computational efficiency and precision. By\nadopting Cells of Interest for classification instead of individual pixels,\nYOLIC encapsulates relevant information, reduces computational load, and\nenables rough object shape inference. Importantly, the need for bounding box\nregression is obviated, as YOLIC capitalizes on the predetermined cell\nconfiguration that provides information about potential object location, size,\nand shape. To tackle the issue of single-label classification limitations, a\nmulti-label classification approach is applied to each cell, effectively\nrecognizing overlapping or closely situated objects. This paper presents\nextensive experiments on multiple datasets, demonstrating that YOLIC achieves\ndetection performance comparable to the state-of-the-art YOLO algorithms while\nsurpassing in speed, exceeding 30fps on a Raspberry Pi 4B CPU. All resources\nrelated to this study, including datasets, cell designer, image annotation\ntool, and source code, have been made publicly available on our project website\nat https://kai3316.github.io/yolic.github.io\n","authors":["Kai Su","Qiangfu Zhao","Yoichi Tomioka","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.06689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09847v1","updated":"2023-07-19T09:09:24Z","published":"2023-07-19T09:09:24Z","title":"Cryo-forum: A framework for orientation recovery with uncertainty\n measure with the application in cryo-EM image analysis","summary":" In single-particle cryo-electron microscopy (cryo-EM), the efficient\ndetermination of orientation parameters for 2D projection images poses a\nsignificant challenge yet is crucial for reconstructing 3D structures. This\ntask is complicated by the high noise levels present in the cryo-EM datasets,\nwhich often include outliers, necessitating several time-consuming 2D clean-up\nprocesses. Recently, solutions based on deep learning have emerged, offering a\nmore streamlined approach to the traditionally laborious task of orientation\nestimation. These solutions often employ amortized inference, eliminating the\nneed to estimate parameters individually for each image. 
However, these methods\nfrequently overlook the presence of outliers and may not adequately concentrate\non the components used within the network. This paper introduces a novel\napproach that uses a 10-dimensional feature vector to represent the orientation\nand applies a Quadratically-Constrained Quadratic Program to derive the\npredicted orientation as a unit quaternion, supplemented by an uncertainty\nmetric. Furthermore, we propose a unique loss function that considers the\npairwise distances between orientations, thereby enhancing the accuracy of our\nmethod. Finally, we also comprehensively evaluate the design choices involved\nin constructing the encoder network, a topic that has not received sufficient\nattention in the literature. Our numerical analysis demonstrates that our\nmethodology effectively recovers orientations from 2D cryo-EM images in an\nend-to-end manner. Importantly, the inclusion of uncertainty quantification\nallows for direct clean-up of the dataset at the 3D level. Lastly, we package\nour proposed methods into a user-friendly software suite named cryo-forum,\ndesigned for easy accessibility by the developers.\n","authors":["Szu-Chi Chung"],"pdf_url":"https://arxiv.org/pdf/2307.09847v1.pdf","comment":"27 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.09841v1","updated":"2023-07-19T08:55:39Z","published":"2023-07-19T08:55:39Z","title":"Compressive Image Scanning Microscope","summary":" We present a novel approach to implement compressive sensing in laser\nscanning microscopes (LSM), specifically in image scanning microscopy (ISM),\nusing a single-photon avalanche diode (SPAD) array detector. Our method\naddresses two significant limitations in applying compressive sensing to LSM:\nthe time to compute the sampling matrix and the quality of reconstructed\nimages. We employ a fixed sampling strategy, skipping alternate rows and\ncolumns during data acquisition, which reduces the number of points scanned by\na factor of four and eliminates the need to compute different sampling\nmatrices. By exploiting the parallel images generated by the SPAD array, we\nimprove the quality of the reconstructed compressive-ISM images compared to\nstandard compressive confocal LSM images. Our results demonstrate the\neffectiveness of our approach in producing higher-quality images with reduced\ndata acquisition time and potential benefits in reducing photobleaching.\n","authors":["Ajay Gunalan","Marco Castello","Simonluca Piazza","Shunlei Li","Alberto Diaspro","Leonardo S. Mattos","Paolo Bianchini"],"pdf_url":"https://arxiv.org/pdf/2307.09841v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2111.01396v2","updated":"2023-07-19T08:55:05Z","published":"2021-11-02T06:58:22Z","title":"Boundary Distribution Estimation for Precise Object Detection","summary":" In the field of state-of-the-art object detection, the task of object\nlocalization is typically accomplished through a dedicated subnet that\nemphasizes bounding box regression. This subnet traditionally predicts the\nobject's position by regressing the box's center position and scaling factors.\nDespite the widespread adoption of this approach, we have observed that the\nlocalization results often suffer from defects, leading to unsatisfactory\ndetector performance. In this paper, we address the shortcomings of previous\nmethods through theoretical analysis and experimental verification and present\nan innovative solution for precise object detection. 
Instead of solely focusing\non the object's center and size, our approach enhances the accuracy of bounding\nbox localization by refining the box edges based on the estimated distribution\nat the object's boundary. Experimental results demonstrate the potential and\ngeneralizability of our proposed method.\n","authors":["Peng Zhi","Haoran Zhou","Hang Huang","Rui Zhao","Rui Zhou","Qingguo Zhou"],"pdf_url":"https://arxiv.org/pdf/2111.01396v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v2","updated":"2023-07-19T08:55:01Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v2.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2210.06551v3","updated":"2023-07-19T08:54:27Z","published":"2022-10-12T19:46:25Z","title":"MotionBERT: A Unified Perspective on Learning Human Motion\n Representations","summary":" We present a unified perspective on tackling various human-centric video\ntasks by learning human motion representations from large-scale and\nheterogeneous data resources. Specifically, we propose a pretraining stage in\nwhich a motion encoder is trained to recover the underlying 3D motion from\nnoisy partial 2D observations. The motion representations acquired in this way\nincorporate geometric, kinematic, and physical knowledge about human motion,\nwhich can be easily transferred to multiple downstream tasks. 
We implement the\nmotion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer)\nneural network. It could capture long-range spatio-temporal relationships among\nthe skeletal joints comprehensively and adaptively, exemplified by the lowest\n3D pose estimation error so far when trained from scratch. Furthermore, our\nproposed framework achieves state-of-the-art performance on all three\ndownstream tasks by simply finetuning the pretrained motion encoder with a\nsimple regression head (1-2 layers), which demonstrates the versatility of the\nlearned motion representations. Code and models are available at\nhttps://motionbert.github.io/\n","authors":["Wentao Zhu","Xiaoxuan Ma","Zhaoyang Liu","Libin Liu","Wayne Wu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2210.06551v3.pdf","comment":"ICCV 2023 version"},{"id":"http://arxiv.org/abs/2307.09829v1","updated":"2023-07-19T08:34:25Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2307.09827v1","updated":"2023-07-19T08:32:59Z","published":"2023-07-19T08:32:59Z","title":"Online Continual Learning for Robust Indoor Object Recognition","summary":" Vision systems mounted on home robots need to interact with unseen classes in\nchanging environments. Robots have limited computational resources, labelled\ndata and storage capability. These requirements pose some unique challenges:\nmodels should adapt without forgetting past knowledge in a data- and\nparameter-efficient way. We characterize the problem as few-shot (FS) online\ncontinual learning (OCL), where robotic agents learn from a non-repeated stream\nof few-shot data updating only a few model parameters. Additionally, such\nmodels experience variable conditions at test time, where objects may appear in\ndifferent poses (e.g., horizontal or vertical) and environments (e.g., day or\nnight). 
To improve the robustness of CL agents, we propose RobOCLe, which: 1)\nconstructs an enriched feature space by computing high order statistical moments\nfrom the embedded features of samples; and 2) computes similarity between high\norder statistics of the samples on the enriched feature space, and predicts\ntheir class labels. We evaluate the robustness of CL models to train/test\naugmentations in various cases. We show that different moments allow RobOCLe to\ncapture different properties of deformations, providing higher robustness with\nno decrease in inference speed.\n","authors":["Umberto Michieli","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.09827v1.pdf","comment":"IROS 2023"},{"id":"http://arxiv.org/abs/2307.09416v2","updated":"2023-07-19T08:27:50Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v2.pdf","comment":"Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track)"},{"id":"http://arxiv.org/abs/2205.11397v5","updated":"2023-07-19T08:25:37Z","published":"2022-05-23T15:42:12Z","title":"Super Vision Transformer","summary":" We attempt to reduce the computational costs in vision transformers (ViTs),\nwhich increase quadratically in the token number. We present a novel training\nparadigm that trains only one ViT model at a time, but is capable of providing\nimproved image recognition performance with various computational costs. Here,\nthe trained ViT model, termed super vision transformer (SuperViT), is empowered\nwith the versatile ability to solve incoming patches of multiple sizes as well\nas preserve informative tokens with multiple keeping rates (the ratio of\nkeeping tokens) to achieve good hardware efficiency for inference, given that\nthe available hardware resources often change from time to time. 
Experimental\nresults on ImageNet demonstrate that our SuperViT can considerably reduce the\ncomputational costs of ViT models with even a performance increase. For example,\nwe reduce 2x FLOPs of DeiT-S while increasing the Top-1 accuracy by 0.2% and\n0.7% for 1.5x reduction. Also, our SuperViT significantly outperforms existing\nstudies on efficient vision transformers. For example, when consuming the same\namount of FLOPs, our SuperViT surpasses the recent state-of-the-art (SOTA) EViT\nby 1.1% when using DeiT-S as their backbones. The project of this work is made\npublicly available at https://github.com/lmbxmu/SuperViT.\n","authors":["Mingbao Lin","Mengzhao Chen","Yuxin Zhang","Chunhua Shen","Rongrong Ji","Liujuan Cao"],"pdf_url":"https://arxiv.org/pdf/2205.11397v5.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV) in the\n year of 2023"},{"id":"http://arxiv.org/abs/2307.09823v1","updated":"2023-07-19T08:21:01Z","published":"2023-07-19T08:21:01Z","title":"Multi-modal Learning based Prediction for Disease","summary":" Non-alcoholic fatty liver disease (NAFLD) is the most common cause of chronic\nliver disease, which can be predicted accurately to prevent advanced fibrosis\nand cirrhosis. However, a liver biopsy, the gold standard for NAFLD diagnosis, is\ninvasive, expensive, and prone to sampling errors. Therefore, non-invasive\nstudies are extremely promising, yet they are still in their infancy due to the\nlack of comprehensive research data and intelligent methods for multi-modal\ndata. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a\ncomprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD\nprediction method (DeepFLD). The dataset includes over 6000 participants'\nphysical examinations, laboratory and imaging studies, extensive\nquestionnaires, and facial images of a subset of participants, which is\ncomprehensive and valuable for clinical studies. From the dataset, we\nquantitatively analyze and select clinical metadata that most contribute to\nNAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network\nmodel designed to predict NAFLD using multi-modal input, including metadata and\nfacial images, outperforms the approach that only uses metadata. Satisfactory\nperformance is also verified on other unseen datasets. Inspiringly, DeepFLD can\nachieve competitive results using only facial images as input rather than\nmetadata, paving the way for a more robust and simpler non-invasive NAFLD\ndiagnosis.\n","authors":["Yaran Chen","Xueyu Chen","Yu Han","Haoran Li","Dongbin Zhao","Jingzhong Li","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08096v2","updated":"2023-07-19T08:19:58Z","published":"2023-03-14T17:33:39Z","title":"MELON: NeRF with Unposed Images in SO(3)","summary":" Neural radiance fields enable novel-view synthesis and scene reconstruction\nwith photorealistic quality from a few images, but require known and accurate\ncamera poses. Conventional pose estimation algorithms fail on smooth or\nself-similar scenes, while methods performing inverse rendering from unposed\nviews require a rough initialization of the camera orientations. The main\ndifficulty of pose estimation lies in real-life objects being almost invariant\nunder certain transformations, making the photometric distance between rendered\nviews non-convex with respect to the camera parameters. 
Using an equivalence\nrelation that matches the distribution of local minima in camera space, we\nreduce this space to its quotient set, in which pose estimation becomes a more\nconvex problem. Using a neural-network to regularize pose estimation, we\ndemonstrate that our method - MELON - can reconstruct a neural radiance field\nfrom unposed images with state-of-the-art accuracy while requiring ten times\nfewer views than adversarial approaches.\n","authors":["Axel Levy","Mark Matthews","Matan Sela","Gordon Wetzstein","Dmitry Lagun"],"pdf_url":"https://arxiv.org/pdf/2303.08096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09822v1","updated":"2023-07-19T08:19:08Z","published":"2023-07-19T08:19:08Z","title":"A Siamese-based Verification System for Open-set Architecture\n Attribution of Synthetic Images","summary":" Despite the wide variety of methods developed for synthetic image\nattribution, most of them can only attribute images generated by models or\narchitectures included in the training set and do not work with unknown\narchitectures, hindering their applicability in real-world scenarios. In this\npaper, we propose a verification framework that relies on a Siamese Network to\naddress the problem of open-set attribution of synthetic images to the\narchitecture that generated them. We consider two different settings. In the\nfirst setting, the system determines whether two images have been produced by\nthe same generative architecture or not. In the second setting, the system\nverifies a claim about the architecture used to generate a synthetic image,\nutilizing one or multiple reference images generated by the claimed\narchitecture. The main strength of the proposed system is its ability to\noperate in both closed and open-set scenarios so that the input images, either\nthe query and reference images, can belong to the architectures considered\nduring training or not. Experimental evaluations encompassing various\ngenerative architectures such as GANs, diffusion models, and transformers,\nfocusing on synthetic face image generation, confirm the excellent performance\nof our method in both closed and open-set settings, as well as its strong\ngeneralization capabilities.\n","authors":["Lydia Abady","Jun Wang","Benedetta Tondi","Mauro Barni"],"pdf_url":"https://arxiv.org/pdf/2307.09822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09821v1","updated":"2023-07-19T08:16:34Z","published":"2023-07-19T08:16:34Z","title":"Hierarchical Semantic Perceptual Listener Head Video Generation: A\n High-performance Pipeline","summary":" In dyadic speaker-listener interactions, the listener's head reactions along\nwith the speaker's head movements, constitute an important non-verbal semantic\nexpression together. The listener Head generation task aims to synthesize\nresponsive listener's head videos based on audios of the speaker and reference\nimages of the listener. Compared to the Talking-head generation, it is more\nchallenging to capture the correlation clues from the speaker's audio and\nvisual information. Following the ViCo baseline scheme, we propose a\nhigh-performance solution by enhancing the hierarchical semantic extraction\ncapability of the audio encoder module and improving the decoder part, renderer\nand post-processing modules. Our solution gets the first place on the official\nleaderboard for the track of listening head generation. 
This paper is a\ntechnical report of ViCo@2023 Conversational Head Generation Challenge in ACM\nMultimedia 2023 conference.\n","authors":["Zhigang Chang","Weitai Hu","Qing Yang","Shibao Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09821v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.09818v1","updated":"2023-07-19T08:06:37Z","published":"2023-07-19T08:06:37Z","title":"Deep unrolling Shrinkage Network for Dynamic MR imaging","summary":" Deep unrolling networks that utilize sparsity priors have achieved great\nsuccess in dynamic magnetic resonance (MR) imaging. The convolutional neural\nnetwork (CNN) is usually utilized to extract the transformed domain, and then\nthe soft thresholding (ST) operator is applied to the CNN-transformed data to\nenforce the sparsity priors. However, the ST operator is usually constrained to\nbe the same across all channels of the CNN-transformed data. In this paper, we\npropose a novel operator, called soft thresholding with channel attention\n(AST), that learns the threshold for each channel. In particular, we put\nforward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the\nalternating direction method of multipliers (ADMM) for optimizing the\ntransformed $l_1$ norm dynamic MR reconstruction model. Experimental results on\nan open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net\noutperforms the state-of-the-art methods. The source code is available at\n\\url{https://github.com/yhao-z/DUS-Net}.\n","authors":["Yinghao Zhang","Xiaodi Li","Weihang Li","Yue Hu"],"pdf_url":"https://arxiv.org/pdf/2307.09818v1.pdf","comment":"5 pages,3 figures,2 tables"},{"id":"http://arxiv.org/abs/2307.07813v3","updated":"2023-07-19T08:06:34Z","published":"2023-07-15T14:34:25Z","title":"TinyTracker: Ultra-Fast and Ultra-Low-Power Edge Vision In-Sensor for\n Gaze Estimation","summary":" Intelligent edge vision tasks encounter the critical challenge of ensuring\npower and latency efficiency due to the typically heavy computational load they\nimpose on edge platforms.This work leverages one of the first \"AI in sensor\"\nvision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power\nend-to-end edge vision applications. We evaluate the IMX500 and compare it to\nother edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by\nexploring gaze estimation as a case study. We propose TinyTracker, a highly\nefficient, fully quantized model for 2D gaze estimation designed to maximize\nthe performance of the edge vision systems considered in this study.\nTinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1]\nwithout significant loss in gaze estimation accuracy (maximum of 0.16 cm when\nfully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor\nresults in end-to-end latency of around 19ms. The camera takes around 17.9ms to\nread, process and transmit the pixels to the accelerator. The inference time of\nthe network is 0.86ms with an additional 0.24 ms for retrieving the results\nfrom the sensor. The overall energy consumption of the end-to-end system is 4.9\nmJ, including 0.06 mJ for inference. 
The end-to-end study shows that IMX500 is\n1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient (4.9mJ\nVS 34.2mJ)\n","authors":["Pietro Bonazzi","Thomas Ruegg","Sizhen Bian","Yawei Li","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2307.07813v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09815v1","updated":"2023-07-19T08:03:53Z","published":"2023-07-19T08:03:53Z","title":"LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network","summary":" Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent\nblur is a challenging task.~Existing blur map-based deblurring methods have\ndemonstrated promising results. In this paper, we propose, to the best of our\nknowledge, the first framework to introduce the contrastive language-image\npre-training framework (CLIP) to achieve accurate blur map estimation from DP\npairs unsupervisedly. To this end, we first carefully design text prompts to\nenable CLIP to understand blur-related geometric prior knowledge from the DP\npair. Then, we propose a format to input stereo DP pair to the CLIP without any\nfine-tuning, where the CLIP is pre-trained on monocular images. Given the\nestimated blur map, we introduce a blur-prior attention block, a blur-weighting\nloss and a blur-aware loss to recover the all-in-focus image. Our method\nachieves state-of-the-art performance in extensive experiments.\n","authors":["Hao Yang","Liyuan Pan","Yan Yang","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09810v1","updated":"2023-07-19T07:58:21Z","published":"2023-07-19T07:58:21Z","title":"GenKL: An Iterative Framework for Resolving Label Ambiguity and Label\n Non-conformity in Web Images Via a New Generalized KL Divergence","summary":" Web image datasets curated online inherently contain ambiguous\nin-distribution (ID) instances and out-of-distribution (OOD) instances, which\nwe collectively call non-conforming (NC) instances. In many recent approaches\nfor mitigating the negative effects of NC instances, the core implicit\nassumption is that the NC instances can be found via entropy maximization. For\n\"entropy\" to be well-defined, we are interpreting the output prediction vector\nof an instance as the parameter vector of a multinomial random variable, with\nrespect to some trained model with a softmax output layer. Hence, entropy\nmaximization is based on the idealized assumption that NC instances have\npredictions that are \"almost\" uniformly distributed. However, in real-world web\nimage datasets, there are numerous NC instances whose predictions are far from\nbeing uniformly distributed. To tackle the limitation of entropy maximization,\nwe propose $(\\alpha, \\beta)$-generalized KL divergence,\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$, which can be used to identify\nsignificantly more NC instances. Theoretical properties of\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$ are proven, and we also show\nempirically that a simple use of $\\mathcal{D}_{\\text{KL}}^{\\alpha,\n\\beta}(p\\|q)$ outperforms all baselines on the NC instance identification task.\nBuilding upon $(\\alpha,\\beta)$-generalized KL divergence, we also introduce a\nnew iterative training framework, GenKL, that identifies and relabels NC\ninstances. 
When evaluated on three web image datasets, Clothing1M,\nFood101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art\nclassification accuracies: $81.34\\%$, $85.73\\%$ and $78.99\\%$/$92.54\\%$\n(top-1/top-5), respectively.\n","authors":["Xia Huang","Kai Fong Ernest Chong"],"pdf_url":"https://arxiv.org/pdf/2307.09810v1.pdf","comment":"Published (with open access) at International Journal of Computer\n Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at:\n https://github.com/codetopaper/GenKL"},{"id":"http://arxiv.org/abs/2307.09804v1","updated":"2023-07-19T07:47:23Z","published":"2023-07-19T07:47:23Z","title":"Fix your downsampling ASAP! Be natively more robust via Aliasing and\n Spectral Artifact free Pooling","summary":" Convolutional neural networks encode images through a sequence of\nconvolutions, normalizations and non-linearities as well as downsampling\noperations into potentially strong semantic embeddings. Yet, previous work\nshowed that even slight mistakes during sampling, leading to aliasing, can be\ndirectly attributed to the networks' lack in robustness. To address such issues\nand facilitate simpler and faster adversarial training, [12] recently proposed\nFLC pooling, a method for provably alias-free downsampling - in theory. In this\nwork, we conduct a further analysis through the lens of signal processing and\nfind that such current pooling methods, which address aliasing in the frequency\ndomain, are still prone to spectral leakage artifacts. Hence, we propose\naliasing and spectral artifact-free pooling, short ASAP. While only introducing\na few modifications to FLC pooling, networks using ASAP as downsampling method\nexhibit higher native robustness against common corruptions, a property that\nFLC pooling was missing. ASAP also increases native robustness against\nadversarial attacks on high and low resolution data while maintaining similar\nclean accuracy or even outperforming the baseline.\n","authors":["Julia Grabinski","Janis Keuper","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.09804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08723v2","updated":"2023-07-19T07:35:54Z","published":"2023-07-17T11:19:41Z","title":"Revisiting Scene Text Recognition: A Data Perspective","summary":" This paper aims to re-assess scene text recognition (STR) from a\ndata-oriented perspective. We begin by revisiting the six commonly used\nbenchmarks in STR and observe a trend of performance saturation, whereby only\n2.91% of the benchmark images cannot be accurately recognized by an ensemble of\n13 representative models. While these results are impressive and suggest that\nSTR could be considered solved, however, we argue that this is primarily due to\nthe less challenging nature of the common benchmarks, thus concealing the\nunderlying issues that STR faces. To this end, we consolidate a large-scale\nreal STR dataset, namely Union14M, which comprises 4 million labeled images and\n10 million unlabeled images, to assess the performance of STR models in more\ncomplex real-world scenarios. Our experiments demonstrate that the 13 models\ncan only achieve an average accuracy of 66.53% on the 4 million labeled images,\nindicating that STR still faces numerous challenges in the real world. By\nanalyzing the error patterns of the 13 models, we identify seven open\nchallenges in STR and develop a challenge-driven benchmark consisting of eight\ndistinct subsets to facilitate further progress in the field. 
Our exploration\ndemonstrates that STR is far from being solved and leveraging data may be a\npromising solution. In this regard, we find that utilizing the 10 million\nunlabeled images through self-supervised pre-training can significantly improve\nthe robustness of STR model in real-world scenarios and leads to\nstate-of-the-art performance.\n","authors":["Qing Jiang","Jiapeng Wang","Dezhi Peng","Chongyu Liu","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2307.08723v2.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2210.16117v4","updated":"2023-07-19T07:34:37Z","published":"2022-10-28T13:25:59Z","title":"Improving the Transferability of Adversarial Attacks on Face Recognition\n with Beneficial Perturbation Feature Augmentation","summary":" Face recognition (FR) models can be easily fooled by adversarial examples,\nwhich are crafted by adding imperceptible perturbations on benign face images.\nThe existence of adversarial face examples poses a great threat to the security\nof society. In order to build a more sustainable digital nation, in this paper,\nwe improve the transferability of adversarial face examples to expose more\nblind spots of existing FR models. Though generating hard samples has shown its\neffectiveness in improving the generalization of models in training tasks, the\neffectiveness of utilizing this idea to improve the transferability of\nadversarial face examples remains unexplored. To this end, based on the\nproperty of hard samples and the symmetry between training tasks and\nadversarial attack tasks, we propose the concept of hard models, which have\nsimilar effects as hard samples for adversarial attack tasks. Utilizing the\nconcept of hard models, we propose a novel attack method called Beneficial\nPerturbation Feature Augmentation Attack (BPFA), which reduces the overfitting\nof adversarial examples to surrogate FR models by constantly generating new\nhard models to craft the adversarial examples. Specifically, in the\nbackpropagation, BPFA records the gradients on pre-selected feature maps and\nuses the gradient on the input image to craft the adversarial example. In the\nnext forward propagation, BPFA leverages the recorded gradients to add\nbeneficial perturbations on their corresponding feature maps to increase the\nloss. Extensive experiments demonstrate that BPFA can significantly boost the\ntransferability of adversarial attacks on FR.\n","authors":["Fengfan Zhou","Hefei Ling","Yuxuan Shi","Jiazhong Chen","Zongyi Li","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.16117v4.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2302.05086v3","updated":"2023-07-19T07:31:35Z","published":"2023-02-10T07:08:13Z","title":"Making Substitute Models More Bayesian Can Enhance Transferability of\n Adversarial Examples","summary":" The transferability of adversarial examples across deep neural networks\n(DNNs) is the crux of many black-box attacks. Many prior efforts have been\ndevoted to improving the transferability via increasing the diversity in inputs\nof some substitute models. 
In this paper, by contrast, we opt for the diversity\nin substitute models and advocate to attack a Bayesian model for achieving\ndesirable transferability. Deriving from the Bayesian formulation, we develop a\nprincipled strategy for possible finetuning, which can be combined with many\noff-the-shelf Gaussian posterior approximations over DNN parameters. Extensive\nexperiments have been conducted to verify the effectiveness of our method, on\ncommon benchmark datasets, and the results demonstrate that our method\noutperforms recent state-of-the-arts by large margins (roughly 19% absolute\nincrease in average attack success rate on ImageNet), and, by combining with\nthese recent methods, further performance gain can be obtained. Our code:\nhttps://github.com/qizhangli/MoreBayesian-attack.\n","authors":["Qizhang Li","Yiwen Guo","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2302.05086v3.pdf","comment":"Accepted by ICLR 2023, fix typos"},{"id":"http://arxiv.org/abs/2307.09795v1","updated":"2023-07-19T07:29:14Z","published":"2023-07-19T07:29:14Z","title":"From West to East: Who can understand the music of the others better?","summary":" Recent developments in MIR have led to several benchmark deep learning models\nwhose embeddings can be used for a variety of downstream tasks. At the same\ntime, the vast majority of these models have been trained on Western pop/rock\nmusic and related styles. This leads to research questions on whether these\nmodels can be used to learn representations for different music cultures and\nstyles, or whether we can build similar music audio embedding models trained on\ndata from different cultures or styles. To that end, we leverage transfer\nlearning methods to derive insights about the similarities between the\ndifferent music cultures to which the data belongs to. We use two Western music\ndatasets, two traditional/folk datasets coming from eastern Mediterranean\ncultures, and two datasets belonging to Indian art music. Three deep audio\nembedding models are trained and transferred across domains, including two\nCNN-based and a Transformer-based architecture, to perform auto-tagging for\neach target domain dataset. Experimental results show that competitive\nperformance is achieved in all domains via transfer learning, while the best\nsource dataset varies for each music culture. The implementation and the\ntrained models are both provided in a public repository.\n","authors":["Charilaos Papaioannou","Emmanouil Benetos","Alexandros Potamianos"],"pdf_url":"https://arxiv.org/pdf/2307.09795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09794v1","updated":"2023-07-19T07:25:33Z","published":"2023-07-19T07:25:33Z","title":"DiffDP: Radiotherapy Dose Prediction via a Diffusion Model","summary":" Currently, deep learning (DL) has achieved the automatic prediction of dose\ndistribution in radiotherapy planning, enhancing its efficiency and quality.\nHowever, existing methods suffer from the over-smoothing problem for their\ncommonly used L_1 or L_2 loss with posterior average calculations. To alleviate\nthis limitation, we innovatively introduce a diffusion-based dose prediction\n(DiffDP) model for predicting the radiotherapy dose distribution of cancer\npatients. Specifically, the DiffDP model contains a forward process and a\nreverse process. In the forward process, DiffDP gradually transforms dose\ndistribution maps into Gaussian noise by adding small noise and trains a noise\npredictor to predict the noise added in each timestep. 
In the reverse process,\nit removes the noise from the original Gaussian noise in multiple steps with\nthe well-trained noise predictor and finally outputs the predicted dose\ndistribution map. To ensure the accuracy of the prediction, we further design a\nstructure encoder to extract anatomical information from patient anatomy images\nand enable the noise predictor to be aware of the dose constraints within\nseveral essential organs, i.e., the planning target volume and organs at risk.\nExtensive experiments on an in-house dataset with 130 rectum cancer patients\ndemonstrate the s\n","authors":["Zhenghao Feng","Lu Wen","Peng Wang","Binyu Yan","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09794v1.pdf","comment":"to be published in MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08015v2","updated":"2023-07-19T07:18:12Z","published":"2023-07-16T11:52:27Z","title":"Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via\n Geometry-Guided Cross-View Transformer","summary":" Image retrieval-based cross-view localization methods often lead to very\ncoarse camera pose estimation, due to the limited sampling density of the\ndatabase satellite images. In this paper, we propose a method to increase the\naccuracy of a ground camera's location and orientation by estimating the\nrelative rotation and translation between the ground-level image and its\nmatched/retrieved satellite image. Our approach designs a geometry-guided\ncross-view transformer that combines the benefits of conventional geometry and\nlearnable cross-view transformers to map the ground-view observations to an\noverhead view. Given the synthesized overhead view and observed satellite\nfeature maps, we construct a neural pose optimizer with strong global\ninformation embedding ability to estimate the relative rotation between them.\nAfter aligning their rotations, we develop an uncertainty-guided spatial\ncorrelation to generate a probability map of the vehicle locations, from which\nthe relative translation can be determined. Experimental results demonstrate\nthat our method significantly outperforms the state-of-the-art. Notably, the\nlikelihood of restricting the vehicle lateral pose to be within 1m of its\nGround Truth (GT) value on the cross-view KITTI dataset has been improved from\n$35.54\\%$ to $76.44\\%$, and the likelihood of restricting the vehicle\norientation to be within $1^{\\circ}$ of its GT value has been improved from\n$19.64\\%$ to $99.10\\%$.\n","authors":["Yujiao Shi","Fei Wu","Ankit Vora","Akhil Perincherry","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08015v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09788v1","updated":"2023-07-19T07:11:45Z","published":"2023-07-19T07:11:45Z","title":"Density-invariant Features for Distant Point Cloud Registration","summary":" Registration of distant outdoor LiDAR point clouds is crucial to extending\nthe 3D vision of collaborative autonomous vehicles, and yet is challenging due\nto small overlapping area and a huge disparity between observed point\ndensities. In this paper, we propose Group-wise Contrastive Learning (GCL)\nscheme to extract density-invariant geometric features to register distant\noutdoor LiDAR point clouds. We mark through theoretical analysis and\nexperiments that, contrastive positives should be independent and identically\ndistributed (i.i.d.), in order to train densityinvariant feature extractors. 
Based on this conclusion, we\npropose a simple yet effective training scheme to force the\nfeatures of multiple point clouds in the same spatial location (referred to as\npositive groups) to be similar, which naturally avoids the sampling bias\nintroduced by a pair of point clouds and thus conforms with the i.i.d. principle. The\nresulting fully-convolutional feature extractor is more powerful and\ndensity-invariant than state-of-the-art methods, improving the registration\nrecall of distant scenarios on KITTI and nuScenes benchmarks by 40.9% and\n26.9%, respectively. The code will be open-sourced.\n","authors":["Quan Liu","Hongzi Zhu","Yunsong Zhou","Hongyang Li","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2307.09788v1.pdf","comment":"In Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2307.09787v1","updated":"2023-07-19T07:11:11Z","published":"2023-07-19T07:11:11Z","title":"DVPT: Dynamic Visual Prompt Tuning of Large Pre-trained Models for\n Medical Image Analysis","summary":" Limited labeled data makes it hard to train models from scratch in the medical\ndomain, and an important paradigm is pre-training and then fine-tuning. Large\npre-trained models contain rich representations, which can be adapted to\ndownstream medical tasks. However, existing methods either tune all the\nparameters or the task-specific layers of the pre-trained models, ignoring the\ninput variations of medical images, and thus they are not efficient or\neffective. In this work, we aim to study parameter-efficient fine-tuning (PEFT)\nfor medical image analysis, and propose a dynamic visual prompt tuning method,\nnamed DVPT. It can extract knowledge beneficial to downstream tasks from large\nmodels with a few trainable parameters. Firstly, the frozen features are\ntransformed by a lightweight bottleneck layer to learn the domain-specific\ndistribution of downstream medical tasks, and then a few learnable visual\nprompts are used as dynamic queries and then conduct cross-attention with the\ntransformed features, attempting to acquire sample-specific knowledge that is\nsuitable for each sample. Finally, the features are projected to the original\nfeature dimension and aggregated with the frozen features. This DVPT module can\nbe shared between different Transformer layers, further reducing the trainable\nparameters. To validate DVPT, we conduct extensive experiments with different\npre-trained models on medical classification and segmentation tasks. We find\nthat such a PEFT method can not only efficiently adapt the pre-trained models to the\nmedical domain, but also bring data efficiency with partially labeled data. For\nexample, with 0.5\\% extra trainable parameters, our method not only outperforms\nstate-of-the-art PEFT methods but even surpasses full fine-tuning by more than\n2.20\\% Kappa score on the medical classification task. It can save up to 60\\%\nlabeled data and 99\\% storage cost of ViT-B/16.\n","authors":["Along He","Kai Wang","Zhihong Wang","Tao Li","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2307.09787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09781v1","updated":"2023-07-19T06:56:07Z","published":"2023-07-19T06:56:07Z","title":"Text2Layer: Layered Image Generation using Latent Diffusion Model","summary":" Layer compositing is one of the most popular image editing workflows among\nboth amateurs and professionals. 
Motivated by the success of diffusion models,\nwe explore layer compositing from a layered image generation perspective.\nInstead of generating an image, we propose to generate background, foreground,\nlayer mask, and the composed image simultaneously. To achieve layered image\ngeneration, we train an autoencoder that is able to reconstruct layered images\nand train diffusion models on the latent representation. One benefit of the\nproposed problem is to enable better compositing workflows in addition to the\nhigh-quality image output. Another benefit is producing higher-quality layer\nmasks compared to masks produced by a separate step of image segmentation.\nExperimental results show that the proposed method is able to generate\nhigh-quality layered images and initiates a benchmark for future work.\n","authors":["Xinyang Zhang","Wentian Zhao","Xin Lu","Jeff Chien"],"pdf_url":"https://arxiv.org/pdf/2307.09781v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2307.01533v2","updated":"2023-07-19T06:39:36Z","published":"2023-07-04T07:36:48Z","title":"Unsupervised Video Anomaly Detection with Diffusion Models Conditioned\n on Compact Motion Representations","summary":" This paper aims to address the unsupervised video anomaly detection (VAD)\nproblem, which involves classifying each frame in a video as normal or\nabnormal, without any access to labels. To accomplish this, the proposed method\nemploys conditional diffusion models, where the input data is the\nspatiotemporal features extracted from a pre-trained network, and the condition\nis the features extracted from compact motion representations that summarize a\ngiven video segment in terms of its motion and appearance. Our method utilizes\na data-driven threshold and considers a high reconstruction error as an\nindicator of anomalous events. This study is the first to utilize compact\nmotion representations for VAD and the experiments conducted on two large-scale\nVAD benchmarks demonstrate that they supply relevant information to the\ndiffusion model, and consequently improve VAD performances w.r.t the prior art.\nImportantly, our method exhibits better generalization performance across\ndifferent datasets, notably outperforming both the state-of-the-art and\nbaseline methods. The code of our method is available at\nhttps://github.com/AnilOsmanTur/conditioned_video_anomaly_diffusion\n","authors":["Anil Osman Tur","Nicola Dall'Asen","Cigdem Beyan","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2307.01533v2.pdf","comment":"Accepted to ICIAP 2023"},{"id":"http://arxiv.org/abs/2307.09769v1","updated":"2023-07-19T06:07:12Z","published":"2023-07-19T06:07:12Z","title":"Source-Free Domain Adaptation for Medical Image Segmentation via\n Prototype-Anchored Feature Alignment and Contrastive Learning","summary":" Unsupervised domain adaptation (UDA) has increasingly gained interests for\nits capacity to transfer the knowledge learned from a labeled source domain to\nan unlabeled target domain. However, typical UDA methods require concurrent\naccess to both the source and target domain data, which largely limits its\napplication in medical scenarios where source data is often unavailable due to\nprivacy concern. To tackle the source data-absent problem, we present a novel\ntwo-stage source-free domain adaptation (SFDA) framework for medical image\nsegmentation, where only a well-trained source segmentation model and unlabeled\ntarget data are available during domain adaptation. 
Specifically, in the\nprototype-anchored feature alignment stage, we first utilize the weights of the\npre-trained pixel-wise classifier as source prototypes, which preserve the\ninformation of source features. Then, we introduce the bi-directional transport\nto align the target features with class prototypes by minimizing its expected\ncost. On top of that, a contrastive learning stage is further devised to\nutilize those pixels with unreliable predictions for a more compact target\nfeature distribution. Extensive experiments on a cross-modality medical\nsegmentation task demonstrate the superiority of our method in large domain\ndiscrepancy settings compared with the state-of-the-art SFDA approaches and\neven some UDA methods. Code is available at\nhttps://github.com/CSCYQJ/MICCAI23-ProtoContra-SFDA.\n","authors":["Qinji Yu","Nan Xi","Junsong Yuan","Ziyu Zhou","Kang Dang","Xiaowei Ding"],"pdf_url":"https://arxiv.org/pdf/2307.09769v1.pdf","comment":"Accepted by MICCAI23"},{"id":"http://arxiv.org/abs/2009.06205v3","updated":"2023-07-19T06:05:27Z","published":"2020-09-14T05:23:58Z","title":"Joint Demosaicking and Denoising Benefits from a Two-stage Training\n Strategy","summary":" Image demosaicking and denoising are the first two key steps of the color\nimage production pipeline. The classical processing sequence has for a long\ntime consisted of applying denoising first, and then demosaicking. Applying the\noperations in this order leads to oversmoothing and checkerboard effects. Yet,\nit was difficult to change this order, because once the image is demosaicked,\nthe statistical properties of the noise are dramatically changed and hard to\nhandle by traditional denoising models. In this paper, we address this problem\nby a hybrid machine learning method. We invert the traditional color filter\narray (CFA) processing pipeline by first demosaicking and then denoising. Our\ndemosaicking algorithm, trained on noiseless images, combines a traditional\nmethod and a residual convolutional neural network (CNN). This first stage\nretains all known information, which is the key point to obtain faithful final\nresults. The noisy demosaicked image is then passed through a second CNN\nrestoring a noiseless full-color image. This pipeline order completely avoids\ncheckerboard effects and restores fine image detail. Although CNNs can be\ntrained to solve jointly demosaicking-denoising end-to-end, we find that this\ntwo-stage training performs better and is less prone to failure. It is shown\nexperimentally to improve on the state of the art, both quantitatively and in\nterms of visual quality.\n","authors":["Yu Guo","Qiyu Jin","Gabriele Facciolo","Tieyong Zeng","Jean-Michel Morel"],"pdf_url":"https://arxiv.org/pdf/2009.06205v3.pdf","comment":"28 pages, 40 figures"},{"id":"http://arxiv.org/abs/2307.09763v1","updated":"2023-07-19T05:46:56Z","published":"2023-07-19T05:46:56Z","title":"Towards Building More Robust Models with Frequency Bias","summary":" The vulnerability of deep neural networks to adversarial samples has been a\nmajor impediment to their broad applications, despite their success in various\nfields. Recently, some works suggested that adversarially-trained models\nemphasize the importance of low-frequency information to achieve higher\nrobustness. 
While several attempts have been made to leverage this frequency\ncharacteristic, they have all faced the issue that applying low-pass filters\ndirectly to input images leads to irreversible loss of discriminative\ninformation and poor generalizability to datasets with distinct frequency\nfeatures. This paper presents a plug-and-play module called the Frequency\nPreference Control Module that adaptively reconfigures the low- and\nhigh-frequency components of intermediate feature representations, providing\nbetter utilization of frequency in robust learning. Empirical studies show that\nour proposed module can be easily incorporated into any adversarial training\nframework, further improving model robustness across different architectures\nand datasets. Additionally, experiments were conducted to examine how the\nfrequency bias of robust models impacts the adversarial training process and\nits final robustness, revealing interesting insights.\n","authors":["Qingwen Bu","Dong Huang","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09763v1.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2307.08779v2","updated":"2023-07-19T05:43:45Z","published":"2023-07-17T18:50:15Z","title":"Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation","summary":" Low-light conditions not only hamper human visual experience but also degrade\nthe model's performance on downstream vision tasks. While existing works make\nremarkable progress on day-night domain adaptation, they rely heavily on domain\nknowledge derived from the task-specific nighttime dataset. This paper\nchallenges a more complicated scenario with broader applicability, i.e.,\nzero-shot day-night domain adaptation, which eliminates reliance on any\nnighttime data. Unlike prior zero-shot adaptation approaches emphasizing either\nimage-level translation or model-level adaptation, we propose a similarity\nmin-max paradigm that considers them under a unified framework. On the image\nlevel, we darken images towards minimum feature similarity to enlarge the\ndomain gap. Then on the model level, we maximize the feature similarity between\nthe darkened images and their normal-light counterparts for better model\nadaptation. To the best of our knowledge, this work represents the pioneering\neffort in jointly optimizing both aspects, resulting in a significant\nimprovement of model generalizability. Extensive experiments demonstrate our\nmethod's effectiveness and broad applicability on various nighttime vision\ntasks, including classification, semantic segmentation, visual place\nrecognition, and video action recognition. Code and pre-trained models are\navailable at https://red-fairy.github.io/ZeroShotDayNightDA-Webpage/.\n","authors":["Rundong Luo","Wenjing Wang","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08779v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09758v1","updated":"2023-07-19T05:41:14Z","published":"2023-07-19T05:41:14Z","title":"Longitudinal Data and a Semantic Similarity Reward for Chest X-Ray\n Report Generation","summary":" Chest X-Ray (CXR) report generation is a promising approach to improving the\nefficiency of CXR interpretation. However, a significant increase in diagnostic\naccuracy is required before that can be realised. Motivated by this, we propose\na framework that is more in line with a radiologist's workflow by considering\nlongitudinal data. Here, the decoder is additionally conditioned on the report\nfrom the subject's previous imaging study via a prompt. 
We also propose a new\nreward for reinforcement learning based on CXR-BERT, which computes the\nsimilarity between reports. We conduct experiments on the MIMIC-CXR dataset.\nThe results indicate that longitudinal data improves CXR report generation.\nCXR-BERT is also shown to be a promising alternative to the current\nstate-of-the-art reward based on RadGraph. This investigation indicates that\nlongitudinal CXR report generation can offer a substantial increase in\ndiagnostic accuracy. Our Hugging Face model is available at:\nhttps://huggingface.co/aehrc/cxrmate and code is available at:\nhttps://github.com/aehrc/cxrmate.\n","authors":["Aaron Nicolson","Jason Dowling","Bevan Koopman"],"pdf_url":"https://arxiv.org/pdf/2307.09758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09756v1","updated":"2023-07-19T05:40:38Z","published":"2023-07-19T05:40:38Z","title":"Generative Prompt Model for Weakly Supervised Object Localization","summary":" Weakly supervised object localization (WSOL) remains challenging when\nlearning object localization models from image category labels. Conventional\nmethods that discriminatively train activation models ignore representative yet\nless discriminative object parts. In this study, we propose a generative prompt\nmodel (GenPromp), defining the first generative pipeline to localize less\ndiscriminative object parts by formulating WSOL as a conditional image\ndenoising procedure. During training, GenPromp converts image category labels\nto learnable prompt embeddings which are fed to a generative model to\nconditionally recover the input image with noise and learn representative\nembeddings. During inference, GenPromp combines the representative embeddings\nwith discriminative embeddings (queried from an off-the-shelf vision-language\nmodel) for both representative and discriminative capacity. The combined\nembeddings are finally used to generate multi-scale high-quality attention\nmaps, which facilitate localizing the full object extent. Experiments on\nCUB-200-2011 and ILSVRC show that GenPromp respectively outperforms the best\ndiscriminative models by 5.2% and 5.6% (Top-1 Loc), setting a solid baseline\nfor WSOL with the generative model. Code is available at\nhttps://github.com/callsys/GenPromp.\n","authors":["Yuzhong Zhao","Qixiang Ye","Weijia Wu","Chunhua Shen","Fang Wan"],"pdf_url":"https://arxiv.org/pdf/2307.09756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09755v1","updated":"2023-07-19T05:39:15Z","published":"2023-07-19T05:39:15Z","title":"Space Engage: Collaborative Space Supervision for Contrastive-based\n Semi-Supervised Semantic Segmentation","summary":" Semi-Supervised Semantic Segmentation (S4) aims to train a segmentation model\nwith limited labeled images and a substantial volume of unlabeled images. To\nimprove the robustness of representations, powerful methods introduce a\npixel-wise contrastive learning approach in latent space (i.e., representation\nspace) that aggregates the representations to their prototypes in a fully\nsupervised manner. However, previous contrastive-based S4 methods merely rely\non the supervision from the model's output (logits) in logit space during\nunlabeled training. In contrast, we utilize the outputs in both logit space and\nrepresentation space to obtain supervision in a collaborative way. 
The\nsupervision from two spaces plays two roles: 1) reduces the risk of\nover-fitting to incorrect semantic information in logits with the help of\nrepresentations; 2) enhances the knowledge exchange between the two spaces.\nFurthermore, unlike previous approaches, we use the similarity between\nrepresentations and prototypes as a new indicator to tilt training toward those\nunder-performing representations and achieve a more efficient contrastive\nlearning process. Results on two public benchmarks demonstrate the competitive\nperformance of our method compared with state-of-the-art methods.\n","authors":["Changqi Wang","Haoyu Xie","Yuhui Yuan","Chong Fu","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.09755v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09749v1","updated":"2023-07-19T05:08:47Z","published":"2023-07-19T05:08:47Z","title":"Towards Robust Scene Text Image Super-resolution via Explicit Location\n Enhancement","summary":" Scene text image super-resolution (STISR), aiming to improve image quality\nwhile boosting downstream scene text recognition accuracy, has recently\nachieved great success. However, most existing methods treat the foreground\n(character regions) and background (non-character regions) equally in the\nforward process, and neglect the disturbance from the complex background, thus\nlimiting the performance. To address these issues, in this paper, we propose a\nnovel method LEMMA that explicitly models character regions to produce\nhigh-level text-specific guidance for super-resolution. To model the location\nof characters effectively, we propose the location enhancement module to\nextract character region features based on the attention map sequence. Besides,\nwe propose the multi-modal alignment module to perform bidirectional\nvisual-semantic alignment to generate high-quality prior guidance, which is\nthen incorporated into the super-resolution branch in an adaptive manner using\nthe proposed adaptive fusion module. Experiments on TextZoom and four scene\ntext recognition benchmarks demonstrate the superiority of our method over\nother state-of-the-art methods. Code is available at\nhttps://github.com/csguoh/LEMMA.\n","authors":["Hang Guo","Tao Dai","Guanghao Meng","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2307.09749v1.pdf","comment":"Accepted as IJCAI2023 paper"},{"id":"http://arxiv.org/abs/2307.09748v1","updated":"2023-07-19T04:59:58Z","published":"2023-07-19T04:59:58Z","title":"Watch out Venomous Snake Species: A Solution to SnakeCLEF2023","summary":" The SnakeCLEF2023 competition aims at the development of advanced algorithms\nfor snake species identification through the analysis of images and\naccompanying metadata. This paper presents a method leveraging\nboth images and metadata. Modern CNN models and strong data augmentation are\nutilized to learn better representations of images. To relieve the challenge of\nlong-tailed distribution, seesaw loss is utilized in our method. We also design\na light model to calculate prior probabilities using metadata features\nextracted from CLIP in the post-processing stage. Besides, we attach more\nimportance to venomous species by assigning venomous species labels to some\nexamples that the model is uncertain about. Our method achieves a score of 91.31% on the\nfinal metric, which combines F1 with other metrics, on the private leaderboard, ranking\n1st among the participants. 
The code is available at\nhttps://github.com/xiaoxsparraw/CLEF2023.\n","authors":["Feiran Hu","Peng Wang","Yangyang Li","Chenlong Duan","Zijian Zhu","Fei Wang","Faen Zhang","Yong Li","Xiu-Shen Wei"],"pdf_url":"https://arxiv.org/pdf/2307.09748v1.pdf","comment":"This work was the winner solution of the SnakeCLEF2023 challenge"},{"id":"http://arxiv.org/abs/2307.09742v1","updated":"2023-07-19T04:07:33Z","published":"2023-07-19T04:07:33Z","title":"Improved Distribution Matching for Dataset Condensation","summary":" Dataset Condensation aims to condense a large dataset into a smaller one\nwhile maintaining its ability to train a well-performing model, thus reducing\nthe storage cost and training effort in deep learning applications. However,\nconventional dataset condensation methods are optimization-oriented and\ncondense the dataset by performing gradient or parameter matching during model\noptimization, which is computationally intensive even on small datasets and\nmodels. In this paper, we propose a novel dataset condensation method based on\ndistribution matching, which is more efficient and promising. Specifically, we\nidentify two important shortcomings of naive distribution matching (i.e.,\nimbalanced feature numbers and unvalidated embeddings for distance computation)\nand address them with three novel techniques (i.e., partitioning and expansion\naugmentation, efficient and enriched model sampling, and class-aware\ndistribution regularization). Our simple yet effective method outperforms most\nprevious optimization-oriented methods with much fewer computational resources,\nthereby scaling data condensation to larger datasets and models. Extensive\nexperiments demonstrate the effectiveness of our method. Codes are available at\nhttps://github.com/uitrbn/IDM\n","authors":["Ganlong Zhao","Guanbin Li","Yipeng Qin","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09742v1.pdf","comment":"CVPR2023"},{"id":"http://arxiv.org/abs/2306.13074v3","updated":"2023-07-19T03:46:37Z","published":"2023-06-22T17:47:08Z","title":"Iterative Scale-Up ExpansionIoU and Deep Features Association for\n Multi-Object Tracking in Sports","summary":" Multi-object tracking algorithms have made significant advancements due to\nthe recent developments in object detection. However, most existing methods\nprimarily focus on tracking pedestrians or vehicles, which exhibit relatively\nsimple and regular motion patterns. Consequently, there is a scarcity of\nalgorithms that address the tracking of targets with irregular or non-linear\nmotion, such as multi-athlete tracking. Furthermore, popular tracking\nalgorithms often rely on the Kalman filter for object motion modeling, which\nfails to track objects when their motion contradicts the linear motion\nassumption of the Kalman filter. Due to this reason, we proposed a novel online\nand robust multi-object tracking approach, named Iterative Scale-Up\nExpansionIoU and Deep Features for multi-object tracking. Unlike conventional\nmethods, we abandon the use of the Kalman filter and propose utilizing the\niterative scale-up expansion IoU. This approach achieves superior tracking\nperformance without requiring additional training data or adopting a more\nrobust detector, all while maintaining a lower computational cost compared to\nother appearance-based methods. Our proposed method demonstrates remarkable\neffectiveness in tracking irregular motion objects, achieving a score of 76.9%\nin HOTA. 
It outperforms all state-of-the-art tracking algorithms on the\nSportsMOT dataset, covering various kinds of sport scenarios.\n","authors":["Hsiang-Wei Huang","Cheng-Yen Yang","Jiacheng Sun","Jenq-Neng Hwang","Chung-I Huang"],"pdf_url":"https://arxiv.org/pdf/2306.13074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07859v2","updated":"2023-07-19T03:04:50Z","published":"2023-07-15T17:45:17Z","title":"Unified Adversarial Patch for Cross-modal Attacks in the Physical World","summary":" Recently, physical adversarial attacks have been presented to evade\nDNNs-based object detectors. To ensure the security, many scenarios are\nsimultaneously deployed with visible sensors and infrared sensors, leading to\nthe failures of these single-modal physical attacks. To show the potential\nrisks under such scenes, we propose a unified adversarial patch to perform\ncross-modal physical attacks, i.e., fooling visible and infrared object\ndetectors at the same time via a single patch. Considering different imaging\nmechanisms of visible and infrared sensors, our work focuses on modeling the\nshapes of adversarial patches, which can be captured in different modalities\nwhen they change. To this end, we design a novel boundary-limited shape\noptimization to achieve the compact and smooth shapes, and thus they can be\neasily implemented in the physical world. In addition, to balance the fooling\ndegree between visible detector and infrared detector during the optimization\nprocess, we propose a score-aware iterative evaluation, which can guide the\nadversarial patch to iteratively reduce the predicted scores of the multi-modal\nsensors. We finally test our method against the one-stage detector: YOLOv3 and\nthe two-stage detector: Faster RCNN. Results show that our unified patch\nachieves an Attack Success Rate (ASR) of 73.33% and 69.17%, respectively. More\nimportantly, we verify the effective attacks in the physical world when visible\nand infrared sensors shoot the objects under various settings like different\nangles, distances, postures, and scenes.\n","authors":["Xingxing Wei","Yao Huang","Yitong Sun","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2307.07859v2.pdf","comment":"10 pages, 8 figures, accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2306.16197v3","updated":"2023-07-19T02:53:36Z","published":"2023-06-28T13:23:33Z","title":"Multi-IMU with Online Self-Consistency for Freehand 3D Ultrasound\n Reconstruction","summary":" Ultrasound (US) imaging is a popular tool in clinical diagnosis, offering\nsafety, repeatability, and real-time capabilities. Freehand 3D US is a\ntechnique that provides a deeper understanding of scanned regions without\nincreasing complexity. However, estimating elevation displacement and\naccumulation error remains challenging, making it difficult to infer the\nrelative position using images alone. The addition of external lightweight\nsensors has been proposed to enhance reconstruction performance without adding\ncomplexity, which has been shown to be beneficial. We propose a novel online\nself-consistency network (OSCNet) using multiple inertial measurement units\n(IMUs) to improve reconstruction performance. OSCNet utilizes a modal-level\nself-supervised strategy to fuse multiple IMU information and reduce\ndifferences between reconstruction results obtained from each IMU data.\nAdditionally, a sequence-level self-consistency strategy is proposed to improve\nthe hierarchical consistency of prediction results among the scanning sequence\nand its sub-sequences. 
Experiments on large-scale arm and carotid datasets with\nmultiple scanning tactics demonstrate that our OSCNet outperforms previous\nmethods, achieving state-of-the-art reconstruction performance.\n","authors":["Mingyuan Luo","Xin Yang","Zhongnuo Yan","Junyu Li","Yuanji Zhang","Jiongquan Chen","Xindi Hu","Jikuan Qian","Jun Cheng","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2306.16197v3.pdf","comment":"Accepted by MICCAI-2023"},{"id":"http://arxiv.org/abs/2307.09732v1","updated":"2023-07-19T02:49:44Z","published":"2023-07-19T02:49:44Z","title":"ClickSeg: 3D Instance Segmentation with Click-Level Weak Annotations","summary":" 3D instance segmentation methods often require fully-annotated dense labels\nfor training, which are costly to obtain. In this paper, we present ClickSeg, a\nnovel click-level weakly supervised 3D instance segmentation method that\nrequires one point per instance annotation merely. Such a problem is very\nchallenging due to the extremely limited labels, which has rarely been solved\nbefore. We first develop a baseline weakly-supervised training method, which\ngenerates pseudo labels for unlabeled data by the model itself. To utilize the\nproperty of click-level annotation setting, we further propose a new training\nframework. Instead of directly using the model inference way, i.e., mean-shift\nclustering, to generate the pseudo labels, we propose to use k-means with fixed\ninitial seeds: the annotated points. New similarity metrics are further\ndesigned for clustering. Experiments on ScanNetV2 and S3DIS datasets show that\nthe proposed ClickSeg surpasses the previous best weakly supervised instance\nsegmentation result by a large margin (e.g., +9.4% mAP on ScanNetV2). Using\n0.02% supervision signals merely, ClickSeg achieves $\\sim$90% of the accuracy\nof the fully-supervised counterpart. Meanwhile, it also achieves\nstate-of-the-art semantic segmentation results among weakly supervised methods\nthat use the same annotation settings.\n","authors":["Leyao Liu","Tao Kong","Minzhao Zhu","Jiashuo Fan","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09729v1","updated":"2023-07-19T02:33:42Z","published":"2023-07-19T02:33:42Z","title":"NTIRE 2023 Quality Assessment of Video Enhancement Challenge","summary":" This paper reports on the NTIRE 2023 Quality Assessment of Video Enhancement\nChallenge, which will be held in conjunction with the New Trends in Image\nRestoration and Enhancement Workshop (NTIRE) at CVPR 2023. This challenge is to\naddress a major challenge in the field of video processing, namely, video\nquality assessment (VQA) for enhanced videos. The challenge uses the VQA\nDataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211\nenhanced videos, including 600 videos with color, brightness, and contrast\nenhancements, 310 videos with deblurring, and 301 deshaked videos. The\nchallenge has a total of 167 registered participants. 61 participating teams\nsubmitted their prediction results during the development phase, with a total\nof 3168 submissions. A total of 176 submissions were submitted by 37\nparticipating teams during the final testing phase. Finally, 19 participating\nteams submitted their models and fact sheets, and detailed the methods they\nused. 
Some methods have achieved better results than baseline methods, and the\nwinning methods have demonstrated superior prediction performance.\n","authors":["Xiaohong Liu","Xiongkuo Min","Wei Sun","Yulun Zhang","Kai Zhang","Radu Timofte","Guangtao Zhai","Yixuan Gao","Yuqin Cao","Tengchuan Kou","Yunlong Dong","Ziheng Jia","Yilin Li","Wei Wu","Shuming Hu","Sibin Deng","Pengxiang Xiao","Ying Chen","Kai Li","Kai Zhao","Kun Yuan","Ming Sun","Heng Cong","Hao Wang","Lingzhi Fu","Yusheng Zhang","Rongyu Zhang","Hang Shi","Qihang Xu","Longan Xiao","Zhiliang Ma","Mirko Agarla","Luigi Celona","Claudio Rota","Raimondo Schettini","Zhiwei Huang","Yanan Li","Xiaotao Wang","Lei Lei","Hongye Liu","Wei Hong","Ironhead Chuang","Allen Lin","Drake Guan","Iris Chen","Kae Lou","Willy Huang","Yachun Tasi","Yvonne Kao","Haotian Fan","Fangyuan Kong","Shiqi Zhou","Hao Liu","Yu Lai","Shanshan Chen","Wenqi Wang","Haoning Wu","Chaofeng Chen","Chunzheng Zhu","Zekun Guo","Shiling Zhao","Haibing Yin","Hongkui Wang","Hanene Brachemi Meftah","Sid Ahmed Fezza","Wassim Hamidouche","Olivier Déforges","Tengfei Shi","Azadeh Mansouri","Hossein Motamednia","Amir Hossein Bakhtiari","Ahmad Mahmoudi Aznaveh"],"pdf_url":"https://arxiv.org/pdf/2307.09729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09023v2","updated":"2023-07-19T02:30:48Z","published":"2023-07-18T07:25:38Z","title":"LA-Net: Landmark-Aware Learning for Reliable Facial Expression\n Recognition under Label Noise","summary":" Facial expression recognition (FER) remains a challenging task due to the\nambiguity of expressions. The derived noisy labels significantly harm the\nperformance in real-world scenarios. To address this issue, we present a new\nFER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks\nto mitigate the impact of label noise from two perspectives. Firstly, LA-Net\nuses landmark information to suppress the uncertainty in expression space and\nconstructs the label distribution of each sample by neighborhood aggregation,\nwhich in turn improves the quality of training supervision. Secondly, the model\nincorporates landmark information into expression representations using the\ndevised expression-landmark contrastive loss. The enhanced expression feature\nextractor can be less susceptible to label noise. Our method can be integrated\nwith any deep neural network for better training supervision without\nintroducing extra inference costs. We conduct extensive experiments on both\nin-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net\nachieves state-of-the-art performance.\n","authors":["Zhiyu Wu","Jinshi Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09023v2.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09728v1","updated":"2023-07-19T02:29:57Z","published":"2023-07-19T02:29:57Z","title":"Uncertainty-Driven Multi-Scale Feature Fusion Network for Real-time\n Image Deraining","summary":" Visual-based measurement systems are frequently affected by rainy weather due\nto the degradation caused by rain streaks in captured images, and existing\nimaging devices struggle to address this issue in real-time. While most efforts\nleverage deep networks for image deraining and have made progress, their large\nparameter sizes hinder deployment on resource-constrained devices.\nAdditionally, these data-driven models often produce deterministic results,\nwithout considering their inherent epistemic uncertainty, which can lead to\nundesired reconstruction errors. 
Well-calibrated uncertainty can help alleviate\nprediction errors and assist measurement devices in mitigating risks and\nimproving usability. Therefore, we propose an Uncertainty-Driven Multi-Scale\nFeature Fusion Network (UMFFNet) that learns the probability mapping\ndistribution between paired images to estimate uncertainty. Specifically, we\nintroduce an uncertainty feature fusion block (UFFB) that utilizes uncertainty\ninformation to dynamically enhance acquired features and focus on blurry\nregions obscured by rain streaks, reducing prediction errors. In addition, to\nfurther boost the performance of UMFFNet, we fused feature information from\nmultiple scales to guide the network for efficient collaborative rain removal.\nExtensive experiments demonstrate that UMFFNet achieves significant performance\nimprovements with few parameters, surpassing state-of-the-art image deraining\nmethods.\n","authors":["Ming Tong","Xuefeng Yan","Yongzhen Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09727v1","updated":"2023-07-19T02:28:41Z","published":"2023-07-19T02:28:41Z","title":"SAMConvex: Fast Discrete Optimization for CT Registration using\n Self-supervised Anatomical Embedding and Correlation Pyramid","summary":" Estimating displacement vector field via a cost volume computed in the\nfeature space has shown great success in image registration, but it suffers\nexcessive computation burdens. Moreover, existing feature descriptors only\nextract local features incapable of representing the global semantic\ninformation, which is especially important for solving large transformations.\nTo address the discussed issues, we propose SAMConvex, a fast coarse-to-fine\ndiscrete optimization method for CT registration that includes a decoupled\nconvex optimization procedure to obtain deformation fields based on a\nself-supervised anatomical embedding (SAM) feature extractor that captures both\nlocal and global information. To be specific, SAMConvex extracts per-voxel\nfeatures and builds 6D correlation volumes based on SAM features, and\niteratively updates a flow field by performing lookups on the correlation\nvolumes with a coarse-to-fine scheme. SAMConvex outperforms the\nstate-of-the-art learning-based methods and optimization-based methods over two\ninter-patient registration datasets (Abdomen CT and HeadNeck CT) and one\nintra-patient registration dataset (Lung CT). Moreover, as an\noptimization-based method, SAMConvex only takes $\\sim2$s ($\\sim5s$ with\ninstance optimization) for one paired images.\n","authors":["Zi Li","Lin Tian","Tony C. W. Mok","Xiaoyu Bai","Puyang Wang","Jia Ge","Jingren Zhou","Le Lu","Xianghua Ye","Ke Yan","Dakai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.09727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09724v1","updated":"2023-07-19T02:26:20Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. 
However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v1.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2212.04761v2","updated":"2023-07-19T02:20:18Z","published":"2022-12-09T10:37:22Z","title":"Leveraging Spatio-Temporal Dependency for Skeleton-Based Action\n Recognition","summary":" Skeleton-based action recognition has attracted considerable attention due to\nits compact representation of the human body's skeletal structure. Many recent\nmethods have achieved remarkable performance using graph convolutional networks\n(GCNs) and convolutional neural networks (CNNs), which extract spatial and\ntemporal features, respectively. Although spatial and temporal dependencies in\nthe human skeleton have been explored separately, spatio-temporal dependency is\nrarely considered. In this paper, we propose the Spatio-Temporal Curve Network\n(STC-Net) to effectively leverage the spatio-temporal dependency of the human\nskeleton. Our proposed network consists of two novel elements: 1) The\nSpatio-Temporal Curve (STC) module; and 2) Dilated Kernels for Graph\nConvolution (DK-GC). The STC module dynamically adjusts the receptive field by\nidentifying meaningful node connections between every adjacent frame and\ngenerating spatio-temporal curves based on the identified node connections,\nproviding an adaptive spatio-temporal coverage. In addition, we propose DK-GC\nto consider long-range dependencies, which results in a large receptive field\nwithout any additional parameters by applying an extended kernel to the given\nadjacency matrices of the graph. 
Our STC-Net combines these two modules and\nachieves state-of-the-art performance on four skeleton-based action recognition\nbenchmarks.\n","authors":["Jungho Lee","Minhyeok Lee","Suhwan Cho","Sungmin Woo","Sungjun Jang","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2212.04761v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09721v1","updated":"2023-07-19T02:11:19Z","published":"2023-07-19T02:11:19Z","title":"Multi-Grained Multimodal Interaction Network for Entity Linking","summary":" The multimodal entity linking (MEL) task, which aims at resolving ambiguous\nmentions to a multimodal knowledge graph, has attracted wide attention in\nrecent years. Though large efforts have been made to explore the complementary\neffect among multiple modalities, they may fail to fully absorb the\ncomprehensive expression of abbreviated textual context and implicit visual\nindication. Even worse, the inevitable noisy data may cause inconsistency of\ndifferent modalities during the learning process, which severely degenerates\nthe performance. To address the above issues, in this paper, we propose a novel\nMulti-GraIned Multimodal InteraCtion Network $\\textbf{(MIMIC)}$ framework for\nsolving the MEL task. Specifically, the unified inputs of mentions and entities\nare first encoded by textual/visual encoders separately, to extract global\ndescriptive features and local detailed features. Then, to derive the\nsimilarity matching score for each mention-entity pair, we devise three\ninteraction units to comprehensively explore the intra-modal interaction and\ninter-modal fusion among features of entities and mentions. In particular,\nthree modules, namely the Text-based Global-Local interaction Unit (TGLU),\nVision-based DuaL interaction Unit (VDLU) and Cross-Modal Fusion-based\ninteraction Unit (CMFU) are designed to capture and integrate the fine-grained\nrepresentation lying in abbreviated text and implicit visual cues. Afterwards,\nwe introduce a unit-consistency objective function via contrastive learning to\navoid inconsistency and model degradation. Experimental results on three public\nbenchmark datasets demonstrate that our solution outperforms various\nstate-of-the-art baselines, and ablation studies verify the effectiveness of\ndesigned modules.\n","authors":["Pengfei Luo","Tong Xu","Shiwei Wu","Chen Zhu","Linli Xu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09721v1.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.09715v1","updated":"2023-07-19T01:57:31Z","published":"2023-07-19T01:57:31Z","title":"Semantic-Aware Dual Contrastive Learning for Multi-label Image\n Classification","summary":" Extracting image semantics effectively and assigning corresponding labels to\nmultiple objects or attributes for natural images is challenging due to the\ncomplex scene contents and confusing label dependencies. Recent works have\nfocused on modeling label relationships with graphs and understanding object\nregions using class activation maps (CAM). However, these methods ignore the\ncomplex intra- and inter-category relationships among specific semantic\nfeatures, and CAM is prone to generate noisy information. To this end, we\npropose a novel semantic-aware dual contrastive learning framework that\nincorporates sample-to-sample contrastive learning (SSCL) as well as\nprototype-to-sample contrastive learning (PSCL). 
Specifically, we leverage\nsemantic-aware representation learning to extract category-related local\ndiscriminative features and construct category prototypes. Then based on SSCL,\nlabel-level visual representations of the same category are aggregated\ntogether, and features belonging to distinct categories are separated.\nMeanwhile, we construct a novel PSCL module to narrow the distance between\npositive samples and category prototypes and push negative samples away from\nthe corresponding category prototypes. Finally, the discriminative label-level\nfeatures related to the image content are accurately captured by the joint\ntraining of the above three parts. Experiments on five challenging large-scale\npublic datasets demonstrate that our proposed method is effective and\noutperforms the state-of-the-art methods. Code and supplementary materials are\nreleased on https://github.com/yu-gi-oh-leilei/SADCL.\n","authors":["Leilei Ma","Dengdi Sun","Lei Wang","Haifang Zhao","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2307.09715v1.pdf","comment":"8 pages, 6 figures, accepted by ECAI 23"},{"id":"http://arxiv.org/abs/2307.07928v2","updated":"2023-07-19T01:43:59Z","published":"2023-07-16T02:44:19Z","title":"Reinforced Disentanglement for Face Swapping without Skip Connection","summary":" The SOTA face swap models still suffer the problem of either target identity\n(i.e., shape) being leaked or the target non-identity attributes (i.e.,\nbackground, hair) failing to be fully preserved in the final results. We show\nthat this insufficient disentanglement is caused by two flawed designs that\nwere commonly adopted in prior models: (1) counting on only one compressed\nencoder to represent both the semantic-level non-identity facial\nattributes(i.e., pose) and the pixel-level non-facial region details, which is\ncontradictory to satisfy at the same time; (2) highly relying on long\nskip-connections between the encoder and the final generator, leaking a certain\namount of target face identity into the result. To fix them, we introduce a new\nface swap framework called 'WSC-swap' that gets rid of skip connections and\nuses two target encoders to respectively capture the pixel-level non-facial\nregion attributes and the semantic non-identity attributes in the face region.\nTo further reinforce the disentanglement learning for the target encoder, we\nemploy both identity removal loss via adversarial training (i.e., GAN) and the\nnon-identity preservation loss via prior 3DMM models like [11]. Extensive\nexperiments on both FaceForensics++ and CelebA-HQ show that our results\nsignificantly outperform previous works on a rich set of metrics, including one\nnovel metric for measuring identity consistency that was completely neglected\nbefore.\n","authors":["Xiaohang Ren","Xingyu Chen","Pengfei Yao","Heung-Yeung Shum","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.07928v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. 
Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance the student's OOD\ngeneralization: (1) by better imitating the teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and fine-grained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate these techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09153v2","updated":"2023-07-19T01:27:17Z","published":"2023-07-18T11:24:42Z","title":"OPHAvatars: One-shot Photo-realistic Head Avatars","summary":" We propose a method for synthesizing photo-realistic digital avatars from\nonly one portrait as the reference. Given a portrait, our method synthesizes a\ncoarse talking head video using driving keypoint features. With the coarse\nvideo, our method synthesizes a coarse talking head avatar with a deforming\nneural radiance field. With rendered images of the coarse avatar, our method\nupdates the low-quality images with a blind face restoration model. With\nupdated images, we retrain the avatar for higher quality. After several\niterations, our method can synthesize a photo-realistic animatable 3D neural\nhead avatar. The motivation of our method is that a deformable neural radiance field\ncan eliminate the unnatural distortion caused by the image2video method. Our\nmethod outperforms state-of-the-art methods in quantitative and qualitative\nstudies on various subjects.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2307.09153v2.pdf","comment":"code: https://github.com/lsx0101/OPHAvatars"},{"id":"http://arxiv.org/abs/2307.09696v1","updated":"2023-07-19T00:41:39Z","published":"2023-07-19T00:41:39Z","title":"Towards Saner Deep Image Registration","summary":" With recent advances in computing hardware and surges of deep-learning\narchitectures, learning-based deep image registration methods have surpassed\ntheir traditional counterparts, in terms of metric performance and inference\ntime. However, these methods focus on improving performance measurements such\nas Dice, resulting in less attention given to model behaviors that are equally\ndesirable for registrations, especially for medical imaging. This paper\ninvestigates these behaviors for popular learning-based deep registrations\nunder a sanity-checking microscope. 
We find that most existing registrations\nsuffer from low inverse consistency and nondiscrimination of identical pairs\ndue to overly optimized image similarities. To rectify these behaviors, we\npropose a novel regularization-based sanity-enforcer method that imposes two\nsanity checks on the deep model to reduce its inverse consistency errors and\nincrease its discriminative power simultaneously. Moreover, we derive a set of\ntheoretical guarantees for our sanity-checked image registration method, with\nexperimental results supporting our theoretical findings and their\neffectiveness in increasing the sanity of models without sacrificing any\nperformance. Our code and models are available at\n\\url{https://github.com/tuffr5/Saner-deep-registration}.\n","authors":["Bin Duan","Ming Zhong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2307.09696v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09693v1","updated":"2023-07-19T00:36:05Z","published":"2023-07-19T00:36:05Z","title":"GlobalMapper: Arbitrary-Shaped Urban Layout Generation","summary":" Modeling and designing urban building layouts is of significant interest in\ncomputer vision, computer graphics, and urban applications. A building layout\nconsists of a set of buildings in city blocks defined by a network of roads. We\nobserve that building layouts are discrete structures, consisting of multiple\nrows of buildings of various shapes, and are amenable to skeletonization for\nmapping arbitrary city block shapes to a canonical form. Hence, we propose a\nfully automatic approach to building layout generation using graph attention\nnetworks. Our method generates realistic urban layouts given arbitrary road\nnetworks, and enables conditional generation based on learned priors. Our\nresults, including user study, demonstrate superior performance as compared to\nprior layout generation networks, support arbitrary city block and varying\nbuilding shapes as demonstrated by generating layouts for 28 large cities.\n","authors":["Liu He","Daniel Aliaga"],"pdf_url":"https://arxiv.org/pdf/2307.09693v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10501v1","updated":"2023-07-19T23:57:39Z","published":"2023-07-19T23:57:39Z","title":"Eye Disease Classification Using Deep Learning Techniques","summary":" Eye is the essential sense organ for vision function. Due to the fact that\ncertain eye disorders might result in vision loss, it is essential to diagnose\nand treat eye diseases early on. By identifying common eye illnesses and\nperforming an eye check, eye care providers can safeguard patients against\nvision loss or blindness. Convolutional neural networks (CNN) and transfer\nlearning were employed in this study to discriminate between a normal eye and\none with diabetic retinopathy, cataract, or glaucoma disease. Using transfer\nlearning for multi-class classification, high accuracy was achieved at 94%\nwhile the traditional CNN achieved 84% rate.\n","authors":["Tareq Babaqi","Manar Jaradat","Ayse Erdem Yildirim","Saif H. Al-Nimer","Daehan Won"],"pdf_url":"https://arxiv.org/pdf/2307.10501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10499v1","updated":"2023-07-19T23:55:15Z","published":"2023-07-19T23:55:15Z","title":"Mining Conditional Part Semantics with Occluded Extrapolation for\n Human-Object Interaction Detection","summary":" Human-Object Interaction Detection is a crucial aspect of human-centric scene\nunderstanding, with important applications in various domains. 
Despite recent\nprogress in this field, recognizing subtle and detailed interactions remains\nchallenging. Existing methods try to use human-related clues to alleviate the\ndifficulty, but rely heavily on external annotations or knowledge, limiting\ntheir practical applicability in real-world scenarios. In this work, we propose\na novel Part Semantic Network (PSN) to solve this problem. The core of PSN is a\nConditional Part Attention (CPA) mechanism, where human features are taken as\nkeys and values, and the object feature is used as query for the computation in\na cross-attention mechanism. In this way, our model learns to automatically\nfocus on the most informative human parts conditioned on the involved object,\ngenerating more semantically meaningful features for interaction recognition.\nAdditionally, we propose an Occluded Part Extrapolation (OPE) strategy to\nfacilitate interaction recognition under occluded scenarios, which teaches the\nmodel to extrapolate detailed features from partially occluded ones. Our method\nconsistently outperforms prior approaches on the V-COCO and HICO-DET datasets,\nwithout external data or extra annotations. Additional ablation studies\nvalidate the effectiveness of each component of our proposed method.\n","authors":["Guangzhi Wang","Yangyang Guo","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2307.10499v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2307.10495v1","updated":"2023-07-19T23:25:21Z","published":"2023-07-19T23:25:21Z","title":"Novel Batch Active Learning Approach and Its Application to Synthetic\n Aperture Radar Datasets","summary":" Active learning improves the performance of machine learning methods by\njudiciously selecting a limited number of unlabeled data points to query for\nlabels, with the aim of maximally improving the underlying classifier's\nperformance. Recent gains have been made using sequential active learning for\nsynthetic aperture radar (SAR) data arXiv:2204.00005. In each iteration,\nsequential active learning selects a query set of size one while batch active\nlearning selects a query set of multiple datapoints. While batch active\nlearning methods exhibit greater efficiency, the challenge lies in maintaining\nmodel accuracy relative to sequential active learning methods. We developed a\nnovel, two-part approach for batch active learning: Dijkstra's Annulus Core-Set\n(DAC) for core-set generation and LocalMax for batch sampling. The batch active\nlearning process that combines DAC and LocalMax achieves nearly identical\naccuracy as sequential active learning but is more efficient, proportional to\nthe batch size. As an application, a pipeline is built based on transfer\nlearning feature embedding, graph learning, DAC, and LocalMax to classify the\nFUSAR-Ship and OpenSARShip datasets. Our pipeline outperforms the\nstate-of-the-art CNN-based methods.\n","authors":["James Chapman","Bohan Chen","Zheng Tan","Jeff Calder","Kevin Miller","Andrea L. Bertozzi"],"pdf_url":"https://arxiv.org/pdf/2307.10495v1.pdf","comment":"16 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2307.10487v1","updated":"2023-07-19T22:46:35Z","published":"2023-07-19T22:46:35Z","title":"Backdoor Attack against Object Detection with Clean Annotation","summary":" Deep neural networks (DNNs) have shown unprecedented success in object\ndetection tasks. However, it was also discovered that DNNs are vulnerable to\nmultiple kinds of attacks, including Backdoor Attacks. 
Through the attack, the\nattacker manages to embed a hidden backdoor into the DNN such that the model\nbehaves normally on benign data samples, but makes attacker-specified judgments\ngiven the occurrence of a predefined trigger. Although numerous backdoor\nattacks have been experimented on image classification, backdoor attacks on\nobject detection tasks have not been properly investigated and explored. As\nobject detection has been adopted as an important module in multiple\nsecurity-sensitive applications such as autonomous driving, backdoor attacks on\nobject detection could pose even more severe threats. Inspired by the inherent\nproperty of deep learning-based object detectors, we propose a simple yet\neffective backdoor attack method against object detection without modifying the\nground truth annotations, specifically focusing on the object disappearance\nattack and object generation attack. Extensive experiments and ablation studies\nprove the effectiveness of our attack on two benchmark object detection\ndatasets, PASCAL VOC07+12 and MSCOCO, on which we achieve an attack success\nrate of more than 92% with a poison rate of only 5%.\n","authors":["Yize Cheng","Wenbin Hu","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.10487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10475v1","updated":"2023-07-19T22:14:49Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v1.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2307.10471v1","updated":"2023-07-19T21:45:07Z","published":"2023-07-19T21:45:07Z","title":"Classification of Visualization Types and Perspectives in Patents","summary":" Due to the swift growth of patent applications each year, information and\nmultimedia retrieval approaches that facilitate patent exploration and\nretrieval are of utmost importance. Different types of visualizations (e.g.,\ngraphs, technical drawings) and perspectives (e.g., side view, perspective) are\nused to visualize details of innovations in patents. The classification of\nthese images enables a more efficient search and allows for further analysis.\nSo far, datasets for image type classification miss some important\nvisualization types for patents. Furthermore, related work does not make use of\nrecent deep learning approaches including transformers. 
In this paper, we adopt\nstate-of-the-art deep learning methods for the classification of visualization\ntypes and perspectives in patent images. We extend the CLEF-IP dataset for\nimage type classification in patents to ten classes and provide manual ground\ntruth annotations. In addition, we derive a set of hierarchical classes from a\ndataset that provides weakly-labeled data for image perspectives. Experimental\nresults have demonstrated the feasibility of the proposed approaches. Source\ncode, models, and dataset will be made publicly available.\n","authors":["Junaid Ahmed Ghauri","Eric Müller-Budack","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2307.10471v1.pdf","comment":"Accepted in International Conference on Theory and Practice of\n Digital Libraries (TPDL) 2023 (They have the copyright to publish\n camera-ready version of this work)"},{"id":"http://arxiv.org/abs/2307.10455v1","updated":"2023-07-19T20:54:08Z","published":"2023-07-19T20:54:08Z","title":"A Step Towards Worldwide Biodiversity Assessment: The BIOSCAN-1M Insect\n Dataset","summary":" In an effort to catalog insect biodiversity, we propose a new large dataset\nof hand-labelled insect images, the BIOSCAN-Insect Dataset. Each record is\ntaxonomically classified by an expert, and also has associated genetic\ninformation including raw nucleotide barcode sequences and assigned barcode\nindex numbers, which are genetically-based proxies for species classification.\nThis paper presents a curated million-image dataset, primarily to train\ncomputer-vision models capable of providing image-based taxonomic assessment,\nhowever, the dataset also presents compelling characteristics, the study of\nwhich would be of interest to the broader machine learning community. Driven by\nthe biological nature inherent to the dataset, a characteristic long-tailed\nclass-imbalance distribution is exhibited. Furthermore, taxonomic labelling is\na hierarchical classification scheme, presenting a highly fine-grained\nclassification problem at lower levels. Beyond spurring interest in\nbiodiversity research within the machine learning community, progress on\ncreating an image-based taxonomic classifier will also further the ultimate\ngoal of all BIOSCAN research: to lay the foundation for a comprehensive survey\nof global biodiversity. This paper introduces the dataset and explores the\nclassification task through the implementation and analysis of a baseline\nclassifier.\n","authors":["Zahra Gharaee","ZeMing Gong","Nicholas Pellegrino","Iuliia Zarubiieva","Joakim Bruslund Haurum","Scott C. Lowe","Jaclyn T. A. McKeown","Chris C. Y. Ho","Joschka McLeod","Yi-Yun C Wei","Jireh Agda","Sujeevan Ratnasingham","Dirk Steinke","Angel X. Chang","Graham W. Taylor","Paul Fieguth"],"pdf_url":"https://arxiv.org/pdf/2307.10455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10440v1","updated":"2023-07-19T20:11:30Z","published":"2023-07-19T20:11:30Z","title":"Confidence Estimation Using Unlabeled Data","summary":" Overconfidence is a common issue for deep neural networks, limiting their\ndeployment in real-world applications. To better estimate confidence, existing\nmethods mostly focus on fully-supervised scenarios and rely on training labels.\nIn this paper, we propose the first confidence estimation method for a\nsemi-supervised setting, when most training labels are unavailable. 
We\nstipulate that even with limited training labels, we can still reasonably\napproximate the confidence of model on unlabeled samples by inspecting the\nprediction consistency through the training process. We use training\nconsistency as a surrogate function and propose a consistency ranking loss for\nconfidence estimation. On both image classification and segmentation tasks, our\nmethod achieves state-of-the-art performances in confidence estimation.\nFurthermore, we show the benefit of the proposed method through a downstream\nactive learning task. The code is available at\nhttps://github.com/TopoXLab/consistency-ranking-loss\n","authors":["Chen Li","Xiaoling Hu","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10440v1.pdf","comment":"Accepted by ICLR'23"},{"id":"http://arxiv.org/abs/2105.11166v6","updated":"2023-07-19T19:32:53Z","published":"2021-05-24T09:16:04Z","title":"AirNet: Neural Network Transmission over the Air","summary":" State-of-the-art performance for many edge applications is achieved by deep\nneural networks (DNNs). Often, these DNNs are location- and time-sensitive, and\nmust be delivered over a wireless channel rapidly and efficiently. In this\npaper, we introduce AirNet, a family of novel training and transmission methods\nthat allow DNNs to be efficiently delivered over wireless channels under\nstringent transmit power and latency constraints. This corresponds to a new\nclass of joint source-channel coding problems, aimed at delivering DNNs with\nthe goal of maximizing their accuracy at the receiver, rather than recovering\nthem with high fidelity. In AirNet, we propose the direct mapping of the DNN\nparameters to transmitted channel symbols, while the network is trained to meet\nthe channel constraints, and exhibit robustness against channel noise. AirNet\nachieves higher accuracy compared to separation-based alternatives. We further\nimprove the performance of AirNet by pruning the network below the available\nbandwidth, and expanding it for improved robustness. We also benefit from\nunequal error protection by selectively expanding important layers of the\nnetwork. Finally, we develop an approach, which simultaneously trains a\nspectrum of DNNs, each targeting a different channel condition, resolving the\nimpractical memory requirements of training distinct networks for different\nchannel conditions.\n","authors":["Mikolaj Jankowski","Deniz Gunduz","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2105.11166v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09826v2","updated":"2023-07-19T19:30:52Z","published":"2023-04-16T11:22:59Z","title":"Fairness in AI and Its Long-Term Implications on Society","summary":" Successful deployment of artificial intelligence (AI) in various settings has\nled to numerous positive outcomes for individuals and society. However, AI\nsystems have also been shown to harm parts of the population due to biased\npredictions. AI fairness focuses on mitigating such biases to ensure AI\ndecision making is not discriminatory towards certain groups. We take a closer\nlook at AI fairness and analyze how lack of AI fairness can lead to deepening\nof biases over time and act as a social stressor. More specifically, we discuss\nhow biased models can lead to more negative real-world outcomes for certain\ngroups, which may then become more prevalent by deploying new AI models trained\non increasingly biased data, resulting in a feedback loop. 
If the issues\npersist, they could be reinforced by interactions with other risks and have\nsevere implications on society in the form of social unrest. We examine current\nstrategies for improving AI fairness, assess their limitations in terms of\nreal-world deployment, and explore potential paths forward to ensure we reap\nAI's benefits without causing society's collapse.\n","authors":["Ondrej Bohdal","Timothy Hospedales","Philip H. S. Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2304.09826v2.pdf","comment":"Stanford Existential Risks Conference 2023"},{"id":"http://arxiv.org/abs/2307.10422v1","updated":"2023-07-19T19:19:13Z","published":"2023-07-19T19:19:13Z","title":"PreDiff: Precipitation Nowcasting with Latent Diffusion Models","summary":" Earth system forecasting has traditionally relied on complex physical models\nthat are computationally expensive and require significant domain expertise. In\nthe past decade, the unprecedented increase in spatiotemporal Earth observation\ndata has enabled data-driven forecasting models using deep learning techniques.\nThese models have shown promise for diverse Earth system forecasting tasks but\neither struggle with handling uncertainty or neglect domain-specific prior\nknowledge, resulting in averaging possible futures to blurred forecasts or\ngenerating physically implausible predictions. To address these limitations, we\npropose a two-stage pipeline for probabilistic spatiotemporal forecasting: 1)\nWe develop PreDiff, a conditional latent diffusion model capable of\nprobabilistic forecasts. 2) We incorporate an explicit knowledge control\nmechanism to align forecasts with domain-specific physical constraints. This is\nachieved by estimating the deviation from imposed constraints at each denoising\nstep and adjusting the transition distribution accordingly. We conduct\nempirical studies on two datasets: N-body MNIST, a synthetic dataset with\nchaotic behavior, and SEVIR, a real-world precipitation nowcasting dataset.\nSpecifically, we impose the law of conservation of energy in N-body MNIST and\nanticipated precipitation intensity in SEVIR. Experiments demonstrate the\neffectiveness of PreDiff in handling uncertainty, incorporating domain-specific\nprior knowledge, and generating forecasts that exhibit high operational\nutility.\n","authors":["Zhihan Gao","Xingjian Shi","Boran Han","Hao Wang","Xiaoyong Jin","Danielle Maddix","Yi Zhu","Mu Li","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10422v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2307.10408v1","updated":"2023-07-19T18:37:57Z","published":"2023-07-19T18:37:57Z","title":"Explaining Autonomous Driving Actions with Visual Question Answering","summary":" The end-to-end learning ability of self-driving vehicles has achieved\nsignificant milestones over the last decade owing to rapid advances in deep\nlearning and computer vision algorithms. However, as autonomous driving\ntechnology is a safety-critical application of artificial intelligence (AI),\nroad accidents and established regulatory principles necessitate the need for\nthe explainability of intelligent action choices for self-driving vehicles. To\nfacilitate interpretability of decision-making in autonomous driving, we\npresent a Visual Question Answering (VQA) framework, which explains driving\nactions with question-answering-based causal reasoning. 
To do so, we first\ncollect driving videos in a simulation environment using reinforcement learning\n(RL) and extract consecutive frames from this log data uniformly for five\nselected action categories. Further, we manually annotate the extracted frames\nusing question-answer pairs as justifications for the actions chosen in each\nscenario. Finally, we evaluate the correctness of the VQA-predicted answers for\nactions on unseen driving scenes. The empirical results suggest that the VQA\nmechanism can provide support to interpret real-time decisions of autonomous\nvehicles and help enhance overall driving safety.\n","authors":["Shahin Atakishiyev","Mohammad Salameh","Housam Babiker","Randy Goebel"],"pdf_url":"https://arxiv.org/pdf/2307.10408v1.pdf","comment":"Accepted to the 2023 IEEE International Conference on Intelligent\n Transportation Systems (IEEE ITSC-2023)"},{"id":"http://arxiv.org/abs/2307.10404v1","updated":"2023-07-19T18:19:18Z","published":"2023-07-19T18:19:18Z","title":"Interpreting and Correcting Medical Image Classification with PIP-Net","summary":" Part-prototype models are explainable-by-design image classifiers, and a\npromising alternative to black box AI. This paper explores the applicability\nand potential of interpretable machine learning, in particular PIP-Net, for\nautomated diagnosis support on real-world medical imaging data. PIP-Net learns\nhuman-understandable prototypical image parts and we evaluate its accuracy and\ninterpretability for fracture detection and skin cancer diagnosis. We find that\nPIP-Net's decision making process is in line with medical classification\nstandards, while only provided with image-level class labels. Because of\nPIP-Net's unsupervised pretraining of prototypes, data quality problems such as\nundesired text in an X-ray or labelling errors can be easily identified.\nAdditionally, we are the first to show that humans can manually correct the\nreasoning of PIP-Net by directly disabling undesired prototypes. We conclude\nthat part-prototype models are promising for medical applications due to their\ninterpretability and potential for advanced model debugging.\n","authors":["Meike Nauta","Johannes H. Hegeman","Jeroen Geerdink","Jörg Schlötterer","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.10404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10387v1","updated":"2023-07-19T18:00:32Z","published":"2023-07-19T18:00:32Z","title":"POV-Surgery: A Dataset for Egocentric Hand and Tool Pose Estimation\n During Surgical Activities","summary":" The surgical usage of Mixed Reality (MR) has received growing attention in\nareas such as surgical navigation systems, skill assessment, and robot-assisted\nsurgeries. For such applications, pose estimation for hand and surgical\ninstruments from an egocentric perspective is a fundamental task and has been\nstudied extensively in the computer vision field in recent years. However, the\ndevelopment of this field has been impeded by a lack of datasets, especially in\nthe surgical field, where bloody gloves and reflective metallic tools make it\nhard to obtain 3D pose annotations for hands and objects using conventional\nmethods. To address this issue, we propose POV-Surgery, a large-scale,\nsynthetic, egocentric dataset focusing on pose estimation for hands with\ndifferent surgical gloves and three orthopedic surgical instruments, namely\nscalpel, friem, and diskplacer. 
Our dataset consists of 53 sequences and 88,329\nframes, featuring high-resolution RGB-D video streams with activity\nannotations, accurate 3D and 2D annotations for hand-object pose, and 2D\nhand-object segmentation masks. We fine-tune the current SOTA methods on\nPOV-Surgery and further show the generalizability when applying to real-life\ncases with surgical gloves and tools by extensive evaluations. The code and the\ndataset are publicly available at batfacewayne.github.io/POV_Surgery_io/.\n","authors":["Rui Wang","Sophokles Ktistakis","Siwei Zhang","Mirko Meboldt","Quentin Lohmeyer"],"pdf_url":"https://arxiv.org/pdf/2307.10387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10373v1","updated":"2023-07-19T18:00:03Z","published":"2023-07-19T18:00:03Z","title":"TokenFlow: Consistent Diffusion Features for Consistent Video Editing","summary":" The generative AI revolution has recently expanded to videos. Nevertheless,\ncurrent state-of-the-art video models are still lagging behind image models in\nterms of visual quality and user control over the generated content. In this\nwork, we present a framework that harnesses the power of a text-to-image\ndiffusion model for the task of text-driven video editing. Specifically, given\na source video and a target text-prompt, our method generates a high-quality\nvideo that adheres to the target text, while preserving the spatial layout and\nmotion of the input video. Our method is based on a key observation that\nconsistency in the edited video can be obtained by enforcing consistency in the\ndiffusion feature space. We achieve this by explicitly propagating diffusion\nfeatures based on inter-frame correspondences, readily available in the model.\nThus, our framework does not require any training or fine-tuning, and can work\nin conjunction with any off-the-shelf text-to-image editing method. We\ndemonstrate state-of-the-art editing results on a variety of real-world videos.\nWebpage: https://diffusion-tokenflow.github.io/\n","authors":["Michal Geyer","Omer Bar-Tal","Shai Bagon","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2307.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10350v1","updated":"2023-07-19T17:47:12Z","published":"2023-07-19T17:47:12Z","title":"Improving Multimodal Datasets with Image Captioning","summary":" Massive web datasets play a key role in the success of large vision-language\nmodels like CLIP and Flamingo. However, the raw web data is noisy, and existing\nfiltering methods to reduce noise often come at the expense of data diversity.\nOur work focuses on caption quality as one major source of noise, and studies\nhow generated captions can increase the utility of web-scraped datapoints with\nnondescript text. Through exploring different mixing strategies for raw and\ngenerated captions, we outperform the best filtering method proposed by the\nDataComp benchmark by 2% on ImageNet and 4% on average across 38 tasks, given a\ncandidate pool of 128M image-text pairs. Our best approach is also 2x better at\nFlickr and MS-COCO retrieval. We then analyze what makes synthetic captions an\neffective source of text supervision. 
In experimenting with different image\ncaptioning models, we also demonstrate that the performance of a model on\nstandard image captioning benchmarks (e.g., NoCaps CIDEr) is not a reliable\nindicator of the utility of the captions it generates for multimodal training.\nFinally, our experiments with using generated captions at DataComp's large\nscale (1.28B image-text pairs) offer insights into the limitations of synthetic\ntext, as well as the importance of image curation with increasing training data\nquantity.\n","authors":["Thao Nguyen","Samir Yitzhak Gadre","Gabriel Ilharco","Sewoong Oh","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2307.10350v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09989v1","updated":"2023-07-19T13:49:35Z","published":"2023-07-19T13:49:35Z","title":"UniMatch: A Unified User-Item Matching Framework for the Multi-purpose\n Merchant Marketing","summary":" When doing private domain marketing with cloud services, the merchants\nusually have to purchase different machine learning models for the multiple\nmarketing purposes, leading to a very high cost. We present a unified user-item\nmatching framework to simultaneously conduct item recommendation and user\ntargeting with just one model. We empirically demonstrate that the above\nconcurrent modeling is viable via modeling the user-item interaction matrix\nwith the multinomial distribution, and propose a bidirectional bias-corrected\nNCE loss for the implementation. The proposed loss function guides the model to\nlearn the user-item joint probability $p(u,i)$ instead of the conditional\nprobability $p(i|u)$ or $p(u|i)$ through correcting both the users and items'\nbiases caused by the in-batch negative sampling. In addition, our framework is\nmodel-agnostic enabling a flexible adaptation of different model architectures.\nExtensive experiments demonstrate that our framework results in significant\nperformance gains in comparison with the state-of-the-art methods, with greatly\nreduced cost on computing resources and daily maintenance.\n","authors":["Qifang Zhao","Tianyu Li","Meng Du","Yu Jiang","Qinghui Sun","Zhongyao Wang","Hong Liu","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09985v1","updated":"2023-07-19T13:44:32Z","published":"2023-07-19T13:44:32Z","title":"Our Model Achieves Excellent Performance on MovieLens: What Does it\n Mean?","summary":" A typical benchmark dataset for recommender system (RecSys) evaluation\nconsists of user-item interactions generated on a platform within a time\nperiod. The interaction generation mechanism partially explains why a user\ninteracts with (e.g.,like, purchase, rate) an item, and the context of when a\nparticular interaction happened. In this study, we conduct a meticulous\nanalysis on the MovieLens dataset and explain the potential impact on using the\ndataset for evaluating recommendation algorithms. We make a few main findings\nfrom our analysis. First, there are significant differences in user\ninteractions at the different stages when a user interacts with the MovieLens\nplatform. The early interactions largely define the user portrait which affect\nthe subsequent interactions. Second, user interactions are highly affected by\nthe candidate movies that are recommended by the platform's internal\nrecommendation algorithm(s). 
Removal of interactions that happen nearer to the\nlast few interactions of a user leads to increasing difficulty in learning user\npreference, thus deteriorating recommendation accuracy. Third, changing the\norder of user interactions makes it more difficult for sequential algorithms to\ncapture the progressive interaction process. Based on these findings, we\nfurther discuss the discrepancy between the interaction generation mechanism\nthat is employed by the MovieLens system and that of typical real world\nrecommendation scenarios. In summary, models that achieve excellent\nrecommendation accuracy on the MovieLens dataset may not demonstrate superior\nperformance in practice for at least two kinds of differences: (i) the\ndifferences in the contexts of user-item interaction generation, and (ii) the\ndifferences in user knowledge about the item collections.\n","authors":["Yu-chen Fan","Yitong Ji","Jie Zhang","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2307.09985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09834v1","updated":"2023-07-19T08:44:11Z","published":"2023-07-19T08:44:11Z","title":"Who Provides the Largest Megaphone? The Role of Google News in Promoting\n Russian State-Affiliated News Sources","summary":" The Internet has not only digitized but also democratized information access\nacross the globe. This gradual but path-breaking move to online information\npropagation has resulted in search engines playing an increasingly prominent\nrole in shaping access to human knowledge. When an Internet user enters a\nquery, the search engine sorts through the hundreds of billions of possible\nwebpages to determine what to show. Google dominates the search engine market,\nwith Google Search surpassing 80% market share globally every year of the last\ndecade. Only in Russia and China do Google competitors claim more market share,\nwith approximately 60% of Internet users in Russia preferring Yandex (compared\nto 40% in favor of Google) and more than 80% of China's Internet users\naccessing Baidu as of 2022. Notwithstanding this long-standing regional\nvariation in Internet search providers, there is limited research showing how\nthese providers compare in terms of propagating state-sponsored information.\nOur study fills this research gap by focusing on Russian cyberspace and\nexamining how Google and Yandex's search algorithms rank content from Russian\nstate-controlled media (hereon, RSM) outlets. This question is timely and of\npractical interest given widespread reports indicating that RSM outlets have\nactively engaged in promoting Kremlin propaganda in the lead-up to, and in the\naftermath of, the Russian invasion of Ukraine in February 2022.\n","authors":["Keeley Erhardt","Saurabh Khanna"],"pdf_url":"https://arxiv.org/pdf/2307.09834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09775v1","updated":"2023-07-19T06:31:58Z","published":"2023-07-19T06:31:58Z","title":"DisCover: Disentangled Music Representation Learning for Cover Song\n Identification","summary":" In the field of music information retrieval (MIR), cover song identification\n(CSI) is a challenging task that aims to identify cover versions of a query\nsong from a massive collection. Existing works still suffer from high\nintra-song variances and inter-song correlations, due to the entangled nature\nof version-specific and version-invariant factors in their modeling. 
In this\nwork, we set the goal of disentangling version-specific and version-invariant\nfactors, which could make it easier for the model to learn invariant music\nrepresentations for unseen query songs. We analyze the CSI task in a\ndisentanglement view with the causal graph technique, and identify the\nintra-version and inter-version effects biasing the invariant learning. To\nblock these effects, we propose the disentangled music representation learning\nframework (DisCover) for CSI. DisCover consists of two critical components: (1)\nKnowledge-guided Disentanglement Module (KDM) and (2) Gradient-based\nAdversarial Disentanglement Module (GADM), which block intra-version and\ninter-version biased effects, respectively. KDM minimizes the mutual\ninformation between the learned representations and version-variant factors\nthat are identified with prior domain knowledge. GADM identifies\nversion-variant factors by simulating the representation transitions between\nintra-song versions, and exploits adversarial distillation for effect blocking.\nExtensive comparisons with best-performing methods and in-depth analysis\ndemonstrate the effectiveness of DisCover and the necessity of\ndisentanglement for CSI.\n","authors":["Jiahao Xun","Shengyu Zhang","Yanting Yang","Jieming Zhu","Liqun Deng","Zhou Zhao","Zhenhua Dong","Ruiqi Li","Lichao Zhang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09751v1","updated":"2023-07-19T05:23:43Z","published":"2023-07-19T05:23:43Z","title":"Information Retrieval Meets Large Language Models: A Strategic Report\n from Chinese IR Community","summary":" The research field of Information Retrieval (IR) has evolved significantly,\nexpanding beyond traditional search to meet diverse user information needs.\nRecently, Large Language Models (LLMs) have demonstrated exceptional\ncapabilities in text understanding, generation, and knowledge inference,\nopening up exciting avenues for IR research. LLMs not only facilitate\ngenerative retrieval but also offer improved solutions for user understanding,\nmodel evaluation, and user-system interactions. More importantly, the\nsynergistic relationship among IR models, LLMs, and humans forms a new\ntechnical paradigm that is more powerful for information seeking. IR models\nprovide real-time and relevant information, LLMs contribute internal knowledge,\nand humans play a central role as demanders and evaluators of the reliability\nof information services. Nevertheless, significant challenges exist, including\ncomputational costs, credibility concerns, domain-specific limitations, and\nethical considerations. To thoroughly discuss the transformative impact of LLMs\non IR research, the Chinese IR community conducted a strategic workshop in\nApril 2023, yielding valuable insights. 
This paper provides a summary of the\nworkshop's outcomes, including the rethinking of IR's core values, the mutual\nenhancement of LLMs and IR, the proposal of a novel IR technical paradigm, and\nopen challenges.\n","authors":["Qingyao Ai","Ting Bai","Zhao Cao","Yi Chang","Jiawei Chen","Zhumin Chen","Zhiyong Cheng","Shoubin Dong","Zhicheng Dou","Fuli Feng","Shen Gao","Jiafeng Guo","Xiangnan He","Yanyan Lan","Chenliang Li","Yiqun Liu","Ziyu Lyu","Weizhi Ma","Jun Ma","Zhaochun Ren","Pengjie Ren","Zhiqiang Wang","Mingwen Wang","Jirong Wen","Le Wu","Xin Xin","Jun Xu","Dawei Yin","Peng Zhang","Fan Zhang","Weinan Zhang","Min Zhang","Xiaofei Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.09751v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2208.06265v2","updated":"2023-07-19T05:08:06Z","published":"2022-08-10T08:28:46Z","title":"Trustworthy Recommender Systems","summary":" Recommender systems (RSs) aim to help users to effectively retrieve items of\ntheir interests from a large catalogue. For a quite long period of time,\nresearchers and practitioners have been focusing on developing accurate RSs.\nRecent years have witnessed an increasing number of threats to RSs, coming from\nattacks, system and user generated noise, system bias. As a result, it has\nbecome clear that a strict focus on RS accuracy is limited and the research\nmust consider other important factors, e.g., trustworthiness. For end users, a\ntrustworthy RS (TRS) should not only be accurate, but also transparent,\nunbiased and fair as well as robust to noise or attacks. These observations\nactually led to a paradigm shift of the research on RSs: from accuracy-oriented\nRSs to TRSs. However, researchers lack a systematic overview and discussion of\nthe literature in this novel and fast developing field of TRSs. To this end, in\nthis paper, we provide an overview of TRSs, including a discussion of the\nmotivation and basic concepts of TRSs, a presentation of the challenges in\nbuilding TRSs, and a perspective on the future directions in this area. We also\nprovide a novel conceptual framework to support the construction of TRSs.\n","authors":["Shoujin Wang","Xiuzhen Zhang","Yan Wang","Huan Liu","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2208.06265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09688v1","updated":"2023-07-19T00:08:49Z","published":"2023-07-19T00:08:49Z","title":"Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for\n Recommendation and Text Generation","summary":" Modeling customer shopping intentions is a crucial task for e-commerce, as it\ndirectly impacts user experience and engagement. Thus, accurately understanding\ncustomer preferences is essential for providing personalized recommendations.\nSession-based recommendation, which utilizes customer session data to predict\ntheir next interaction, has become increasingly popular. However, existing\nsession datasets have limitations in terms of item attributes, user diversity,\nand dataset scale. As a result, they cannot comprehensively capture the\nspectrum of user behaviors and preferences. To bridge this gap, we present the\nAmazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It\nis the first multilingual dataset consisting of millions of user sessions from\nsix different locales, where the major languages of products are English,\nGerman, Japanese, French, Italian, and Spanish. 
Remarkably, the dataset can\nhelp us enhance personalization and understanding of user preferences, which\ncan benefit various existing tasks as well as enable new tasks. To test the\npotential of the dataset, we introduce three tasks in this work: (1)\nnext-product recommendation, (2) next-product recommendation with domain\nshifts, and (3) next-product title generation. With the above tasks, we\nbenchmark a range of algorithms on our proposed dataset, drawing new insights\nfor further research and practice. In addition, based on the proposed dataset\nand tasks, we hosted a competition in the KDD CUP 2023 and have attracted\nthousands of users and submissions. The winning solutions and the associated\nworkshop can be accessed at our website https://kddcup23.github.io/.\n","authors":["Wei Jin","Haitao Mao","Zheng Li","Haoming Jiang","Chen Luo","Hongzhi Wen","Haoyu Han","Hanqing Lu","Zhengyang Wang","Ruirui Li","Zhen Li","Monica Xiao Cheng","Rahul Goutam","Haiyang Zhang","Karthik Subbian","Suhang Wang","Yizhou Sun","Jiliang Tang","Bing Yin","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2307.09688v1.pdf","comment":"Dataset for KDD Cup 2023, https://kddcup23.github.io/"},{"id":"http://arxiv.org/abs/2205.11498v2","updated":"2023-07-19T23:05:57Z","published":"2022-05-23T17:53:44Z","title":"Injecting Domain Adaptation with Learning-to-hash for Effective and\n Efficient Zero-shot Dense Retrieval","summary":" Dense retrieval overcomes the lexical gap and has shown great success in\nad-hoc information retrieval (IR). Despite this success, dense retrievers are\nexpensive to serve across practical use cases. For use cases that require searching\nmillions of documents, the dense index becomes bulky and requires\nhigh memory usage for storing the index. More recently, learning-to-hash (LTH)\ntechniques, e.g., BPR and JPQ, produce binary document vectors, thereby\nreducing the memory requirement to efficiently store the dense index. LTH\ntechniques are supervised and finetune the retriever using a ranking loss. They\noutperform their counterparts, i.e., traditional out-of-the-box vector\ncompression techniques such as PCA or PQ. A missing piece from prior work is\nthat existing techniques have been evaluated only in-domain, i.e., on a single\ndataset such as MS MARCO. In our work, we evaluate LTH and vector compression\ntechniques for improving the downstream zero-shot retrieval accuracy of the\nTAS-B dense retriever while maintaining efficiency at inference. Our results\ndemonstrate that, unlike prior work, LTH strategies when applied naively can\nunderperform the zero-shot TAS-B dense retriever on average by up to 14%\nnDCG@10 on the BEIR benchmark. To address this limitation, we\npropose an easy yet effective solution of injecting domain adaptation with\nexisting supervised LTH techniques. We experiment with two well-known\nunsupervised domain adaptation techniques: GenQ and GPL. 
Our domain adaptation\ninjection technique can improve the downstream zero-shot retrieval\neffectiveness for both BPR and JPQ variants of the TAS-B model by on average\n11.5% and 8.2% nDCG@10 while both maintaining 32$\\times$ memory efficiency and\n14$\\times$ and 2$\\times$ speedup respectively in CPU retrieval latency on BEIR.\nAll our code, models, and data are publicly available at\nhttps://github.com/thakur-nandan/income.\n","authors":["Nandan Thakur","Nils Reimers","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2205.11498v2.pdf","comment":"Accepted at ReNeuIR 2023 Workshop"},{"id":"http://arxiv.org/abs/2307.10488v1","updated":"2023-07-19T22:48:02Z","published":"2023-07-19T22:48:02Z","title":"SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot\n Neural Sparse Retrieval","summary":" Traditionally, sparse retrieval systems relying on lexical representations to\nretrieve documents, such as BM25, dominated information retrieval tasks. With\nthe onset of pre-trained transformer models such as BERT, neural sparse\nretrieval has led to a new paradigm within retrieval. Despite the success,\nthere has been limited software supporting different sparse retrievers running\nin a unified, common environment. This hinders practitioners from fairly\ncomparing different sparse models and obtaining realistic evaluation results.\nAnother missing piece is that a majority of prior work evaluates sparse\nretrieval models on in-domain retrieval, i.e. on a single dataset: MS MARCO.\nHowever, practical retrieval systems require models that\ncan generalize well to unseen out-of-domain, i.e. zero-shot retrieval tasks. In\nthis work, we provide SPRINT, a unified Python toolkit based on Pyserini and\nLucene, supporting a common interface for evaluating neural sparse retrieval.\nThe toolkit currently includes five built-in models: uniCOIL, DeepImpact,\nSPARTA, TILDEv2 and SPLADEv2. Users can also easily add customized models by\ndefining their term weighting method. Using our toolkit, we establish strong\nand reproducible zero-shot sparse retrieval baselines across the\nwell-acknowledged benchmark, BEIR. Our results demonstrate that SPLADEv2\nachieves the best average score of 0.470 nDCG@10 on BEIR amongst all neural\nsparse retrievers. In this work, we further uncover the reasons behind its\nperformance gain. We show that SPLADEv2 produces sparse representations with a\nmajority of tokens outside of the original query and document which is often\ncrucial for its performance gains, a capability its other sparse\ncounterparts lack. We provide our SPRINT toolkit, models, and data used in our\nexperiments publicly here at https://github.com/thakur-nandan/sprint.\n","authors":["Nandan Thakur","Kexin Wang","Iryna Gurevych","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10488v1.pdf","comment":"Accepted at SIGIR 2023 (Resource Track)"},{"id":"http://arxiv.org/abs/2307.10479v1","updated":"2023-07-19T22:20:06Z","published":"2023-07-19T22:20:06Z","title":"Fast Approximate Nearest Neighbor Search with a Dynamic Exploration\n Graph using Continuous Refinement","summary":" For approximate nearest neighbor search, graph-based algorithms have been shown to\noffer the best trade-off between accuracy and search time. 
We propose the\nDynamic Exploration Graph (DEG) which significantly outperforms existing\nalgorithms in terms of search and exploration efficiency by combining two new\nideas: First, a single undirected even regular graph is incrementally built by\npartially replacing existing edges to integrate new vertices and to update old\nneighborhoods at the same time. Secondly, an edge optimization algorithm is\nused to continuously improve the quality of the graph. Combining this ongoing\nrefinement with the graph construction process leads to a well-organized graph\nstructure at all times, resulting in: (1) increased search efficiency, (2)\npredictable index size, (3) guaranteed connectivity and therefore reachability\nof all vertices, and (4) a dynamic graph structure. In addition we investigate\nhow well existing graph-based search systems can handle indexed queries where\nthe seed vertex of a search is the query itself. Such exploration tasks,\ndespite their good starting point, are not necessarily easy. High efficiency in\napproximate nearest neighbor search (ANNS) does not automatically imply good\nperformance in exploratory search. Extensive experiments show that our new\nDynamic Exploration Graph outperforms existing algorithms significantly for\nindexed and unindexed queries.\n","authors":["Nico Hezel","Kai Uwe Barthel","Konstantin Schall","Klaus Jung"],"pdf_url":"https://arxiv.org/pdf/2307.10479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10471v1","updated":"2023-07-19T21:45:07Z","published":"2023-07-19T21:45:07Z","title":"Classification of Visualization Types and Perspectives in Patents","summary":" Due to the swift growth of patent applications each year, information and\nmultimedia retrieval approaches that facilitate patent exploration and\nretrieval are of utmost importance. Different types of visualizations (e.g.,\ngraphs, technical drawings) and perspectives (e.g., side view, perspective) are\nused to visualize details of innovations in patents. The classification of\nthese images enables a more efficient search and allows for further analysis.\nSo far, datasets for image type classification miss some important\nvisualization types for patents. Furthermore, related work does not make use of\nrecent deep learning approaches including transformers. In this paper, we adopt\nstate-of-the-art deep learning methods for the classification of visualization\ntypes and perspectives in patent images. We extend the CLEF-IP dataset for\nimage type classification in patents to ten classes and provide manual ground\ntruth annotations. In addition, we derive a set of hierarchical classes from a\ndataset that provides weakly-labeled data for image perspectives. Experimental\nresults have demonstrated the feasibility of the proposed approaches. Source\ncode, models, and dataset will be made publicly available.\n","authors":["Junaid Ahmed Ghauri","Eric Müller-Budack","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2307.10471v1.pdf","comment":"Accepted in International Conference on Theory and Practice of\n Digital Libraries (TPDL) 2023 (They have the copyright to publish\n camera-ready version of this work)"},{"id":"http://arxiv.org/abs/2109.12509v3","updated":"2023-07-19T21:28:52Z","published":"2021-09-26T06:54:26Z","title":"Deep Exploration for Recommendation Systems","summary":" Modern recommendation systems ought to benefit by probing for and learning\nfrom delayed feedback. Research has tended to focus on learning from a user's\nresponse to a single recommendation. 
Such work, which leverages methods of\nsupervised and bandit learning, forgoes learning from the user's subsequent\nbehavior. Where past work has aimed to learn from subsequent behavior, there\nhas been a lack of effective methods for probing to elicit informative delayed\nfeedback. Effective exploration through probing for delayed feedback becomes\nparticularly challenging when rewards are sparse. To address this, we develop\ndeep exploration methods for recommendation systems. In particular, we\nformulate recommendation as a sequential decision problem and demonstrate\nbenefits of deep exploration over single-step exploration. Our experiments are\ncarried out with high-fidelity industrial-grade simulators and establish large\nimprovements over existing algorithms.\n","authors":["Zheqing Zhu","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2109.12509v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10323v1","updated":"2023-07-19T07:20:30Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00370v2","updated":"2023-07-19T06:55:04Z","published":"2023-07-01T15:44:53Z","title":"Improving Text Matching in E-Commerce Search with A Rationalizable,\n Intervenable and Fast Entity-Based Relevance Model","summary":" Discovering the intended items of user queries from a massive repository of\nitems is one of the main goals of an e-commerce search system. Relevance\nprediction is essential to the search system since it helps improve\nperformance. When online serving a relevance model, the model is required to\nperform fast and accurate inference. Currently, the widely used models such as\nBi-encoder and Cross-encoder have their limitations in accuracy or inference\nspeed respectively. In this work, we propose a novel model called the\nEntity-Based Relevance Model (EBRM). We identify the entities contained in an\nitem and decompose the QI (query-item) relevance problem into multiple QE\n(query-entity) relevance problems; we then aggregate their results to form the\nQI prediction using a soft logic formulation. The decomposition allows us to\nuse a Cross-encoder QE relevance module for high accuracy as well as cache QE\npredictions for fast online inference. 
Utilizing soft logic makes the\nprediction procedure interpretable and intervenable. We also show that\npretraining the QE module with auto-generated QE data from user logs can\nfurther improve the overall performance. The proposed method is evaluated on\nlabeled data from e-commerce websites. Empirical results show that it achieves\npromising improvements with computational efficiency.\n","authors":["Jiong Cai","Yong Jiang","Yue Zhang","Chengyue Jiang","Ke Yu","Jianhui Ji","Rong Xiao","Haihong Tang","Tao Wang","Zhongqiang Huang","Pengjun Xie","Fei Huang","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2307.00370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10314v1","updated":"2023-07-19T03:31:41Z","published":"2023-07-19T03:31:41Z","title":"Mood Classification of Bangla Songs Based on Lyrics","summary":" Music can evoke various emotions, and with the advancement of technology, it\nhas become more accessible to people. Bangla music, which portrays different\nhuman emotions, lacks sufficient research. The authors of this article aim to\nanalyze Bangla songs and classify their moods based on the lyrics. To achieve\nthis, this research compiled a dataset of 4000 Bangla song lyrics and their genres,\nand used Natural Language Processing and the BERT algorithm to analyze the\ndata. Among the 4000 songs, 1513 represent the sad mood, 1362\nthe romantic mood, 886 happiness, and the remaining 239 are classified as\nrelaxation. By embedding the lyrics of the songs, the authors have classified\nthe songs into four moods: Happy, Sad, Romantic, and Relaxed. This research is\ncrucial as it enables a multi-class classification of songs' moods, making the\nmusic more relatable to people's emotions. The article presents the automated\nresults of the four-mood classification accurately derived from the song lyrics.\n","authors":["Maliha Mahajebin","Mohammad Rifat Ahmmad Rashid","Nafees Mansoor"],"pdf_url":"https://arxiv.org/pdf/2307.10314v1.pdf","comment":"Presented at International Conference on Inventive Communication and\n Computational Technologies 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.10171v1","updated":"2023-07-19T17:57:27Z","published":"2023-07-19T17:57:27Z","title":"LightPath: Lightweight and Scalable Path Representation Learning","summary":" Movement paths are used widely in intelligent transportation and smart city\napplications. To serve such applications, path representation learning aims to\nprovide compact representations of paths that enable efficient and accurate\noperations when used for different downstream tasks such as path ranking and\ntravel cost estimation. In many cases, it is attractive that the path\nrepresentation learning is lightweight and scalable; in resource-limited\nenvironments and under green computing limitations, it is essential. Yet,\nexisting path representation learning studies focus on accuracy and pay at most\nsecondary attention to resource consumption and scalability.\n We propose a lightweight and scalable path representation learning framework,\ntermed LightPath, that aims to reduce resource consumption and achieve\nscalability without affecting accuracy, thus enabling broader applicability.\nMore specifically, we first propose a sparse auto-encoder that ensures that the\nframework achieves good scalability with respect to path length. Next, we\npropose a relational reasoning framework to enable faster training of more\nrobust sparse path encoders. 
We also propose global-local knowledge\ndistillation to further reduce the size and improve the performance of sparse\npath encoders. Finally, we report extensive experiments on two real-world\ndatasets to offer insight into the efficiency, scalability, and effectiveness\nof the proposed framework.\n","authors":["Sean Bin Yang","Jilin Hu","Chenjuan Guo","Bin Yang","Christian S. Jensen"],"pdf_url":"https://arxiv.org/pdf/2307.10171v1.pdf","comment":"This paper has been accepted by ACM SIGKDD-23"},{"id":"http://arxiv.org/abs/2212.07383v3","updated":"2023-07-19T17:56:01Z","published":"2022-12-14T18:08:42Z","title":"Sequential Kernelized Independence Testing","summary":" Independence testing is a classical statistical problem that has been\nextensively studied in the batch setting when one fixes the sample size before\ncollecting data. However, practitioners often prefer procedures that adapt to\nthe complexity of a problem at hand instead of setting sample size in advance.\nIdeally, such procedures should (a) stop earlier on easy tasks (and later on\nharder tasks), hence making better use of available resources, and (b)\ncontinuously monitor the data and efficiently incorporate statistical evidence\nafter collecting new data, while controlling the false alarm rate. Classical\nbatch tests are not tailored for streaming data: valid inference after data\npeeking requires correcting for multiple testing which results in low power.\nFollowing the principle of testing by betting, we design sequential kernelized\nindependence tests that overcome such shortcomings. We exemplify our broad\nframework using bets inspired by kernelized dependence measures, e.g., the\nHilbert-Schmidt independence criterion. Our test is also valid under\nnon-i.i.d., time-varying settings. We demonstrate the power of our approaches\non both simulated and real data.\n","authors":["Aleksandr Podkopaev","Patrick Blöbaum","Shiva Prasad Kasiviswanathan","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2212.07383v3.pdf","comment":"To appear at ICML 2023"},{"id":"http://arxiv.org/abs/2307.10169v1","updated":"2023-07-19T17:55:13Z","published":"2023-07-19T17:55:13Z","title":"Challenges and Applications of Large Language Models","summary":" Large Language Models (LLMs) went from non-existent to ubiquitous in the\nmachine learning discourse within a few years. Due to the fast pace of the\nfield, it is difficult to identify the remaining challenges and already\nfruitful application areas. In this paper, we aim to establish a systematic set\nof open problems and application successes so that ML researchers can\ncomprehend the field's current state more quickly and become productive.\n","authors":["Jean Kaddour","Joshua Harris","Maximilian Mozes","Herbie Bradley","Roberta Raileanu","Robert McHardy"],"pdf_url":"https://arxiv.org/pdf/2307.10169v1.pdf","comment":"72 pages. v01. Work in progress. Feedback and comments are highly\n appreciated!"},{"id":"http://arxiv.org/abs/2307.10167v1","updated":"2023-07-19T17:53:22Z","published":"2023-07-19T17:53:22Z","title":"VITS : Variational Inference Thomson Sampling for contextual bandits","summary":" In this paper, we introduce and analyze a variant of the Thompson sampling\n(TS) algorithm for contextual bandits. At each round, traditional TS requires\nsamples from the current posterior distribution, which is usually intractable.\nTo circumvent this issue, approximate inference techniques can be used and\nprovide samples with distribution close to the posteriors. 
However, current\napproximate techniques either yield poor estimation (Laplace approximation)\nor can be computationally expensive (MCMC methods, Ensemble sampling...). In\nthis paper, we propose a new algorithm, Variational Inference Thompson sampling\n(VITS), based on Gaussian Variational Inference. This scheme provides powerful\nposterior approximations which are easy to sample from, and is computationally\nefficient, making it an ideal choice for TS. In addition, we show that VITS\nachieves a sub-linear regret bound of the same order in the dimension and\nnumber of rounds as traditional TS for linear contextual bandits. Finally, we\ndemonstrate experimentally the effectiveness of VITS on both synthetic and real\nworld datasets.\n","authors":["Pierre Clavier","Tom Huix","Alain Durmus"],"pdf_url":"https://arxiv.org/pdf/2307.10167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10163v1","updated":"2023-07-19T17:44:54Z","published":"2023-07-19T17:44:54Z","title":"Rethinking Backdoor Attacks","summary":" In a backdoor attack, an adversary inserts maliciously constructed backdoor\nexamples into a training set to make the resulting model vulnerable to\nmanipulation. Defending against such attacks typically involves viewing these\ninserted examples as outliers in the training set and using techniques from\nrobust statistics to detect and remove them.\n In this work, we present a different approach to the backdoor attack problem.\nSpecifically, we show that without structural information about the training\ndata distribution, backdoor attacks are indistinguishable from\nnaturally-occurring features in the data--and thus impossible to \"detect\" in a\ngeneral sense. Then, guided by this observation, we revisit existing defenses\nagainst backdoor attacks and characterize the (often latent) assumptions they\nmake and on which they depend. Finally, we explore an alternative perspective\non backdoor attacks: one that assumes these attacks correspond to the strongest\nfeature in the training data. Under this assumption (which we make formal) we\ndevelop a new primitive for detecting backdoor attacks. Our primitive naturally\ngives rise to a detection algorithm that comes with theoretical guarantees and\nis effective in practice.\n","authors":["Alaa Khaddaj","Guillaume Leclerc","Aleksandar Makelov","Kristian Georgiev","Hadi Salman","Andrew Ilyas","Aleksander Madry"],"pdf_url":"https://arxiv.org/pdf/2307.10163v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.10160v1","updated":"2023-07-19T17:42:36Z","published":"2023-07-19T17:42:36Z","title":"Robust Driving Policy Learning with Guided Meta Reinforcement Learning","summary":" Although deep reinforcement learning (DRL) has shown promising results for\nautonomous navigation in interactive traffic scenarios, existing work typically\nadopts a fixed behavior policy to control social vehicles in the training\nenvironment. This may cause the learned driving policy to overfit the\nenvironment, making it difficult to interact well with vehicles with different,\nunseen behaviors. In this work, we introduce an efficient method to train\ndiverse driving policies for social vehicles as a single meta-policy. By\nrandomizing the interaction-based reward functions of social vehicles, we can\ngenerate diverse objectives and efficiently train the meta-policy through\nguiding policies that achieve specific objectives. 
We further propose a\ntraining strategy to enhance the robustness of the ego vehicle's driving policy\nusing the environment where social vehicles are controlled by the learned\nmeta-policy. Our method successfully learns an ego driving policy that\ngeneralizes well to unseen situations with out-of-distribution (OOD) social\nagents' behaviors in a challenging uncontrolled T-intersection scenario.\n","authors":["Kanghoon Lee","Jiachen Li","David Isele","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10160v1.pdf","comment":"ITSC 2023"},{"id":"http://arxiv.org/abs/2307.10155v1","updated":"2023-07-19T17:35:08Z","published":"2023-07-19T17:35:08Z","title":"Curvature-based Clustering on Graphs","summary":" Unsupervised node clustering (or community detection) is a classical graph\nlearning task. In this paper, we study algorithms, which exploit the geometry\nof the graph to identify densely connected substructures, which form clusters\nor communities. Our method implements discrete Ricci curvatures and their\nassociated geometric flows, under which the edge weights of the graph evolve to\nreveal its community structure. We consider several discrete curvature notions\nand analyze the utility of the resulting algorithms. In contrast to prior\nliterature, we study not only single-membership community detection, where each\nnode belongs to exactly one community, but also mixed-membership community\ndetection, where communities may overlap. For the latter, we argue that it is\nbeneficial to perform community detection on the line graph, i.e., the graph's\ndual. We provide both theoretical and empirical evidence for the utility of our\ncurvature-based clustering algorithms. In addition, we give several results on\nthe relationship between the curvature of a graph and that of its dual, which\nenable the efficient implementation of our proposed mixed-membership community\ndetection approach and which may be of independent interest for curvature-based\nnetwork analysis.\n","authors":["Yu Tian","Zachary Lubberts","Melanie Weber"],"pdf_url":"https://arxiv.org/pdf/2307.10155v1.pdf","comment":"65 pages, 19 figures"},{"id":"http://arxiv.org/abs/2307.04228v2","updated":"2023-07-19T17:24:29Z","published":"2023-07-09T16:44:37Z","title":"Efficient Bayesian travel-time tomography with geologically-complex\n priors using sensitivity-informed polynomial chaos expansion and deep\n generative networks","summary":" Monte Carlo Markov Chain (MCMC) methods commonly confront two fundamental\nchallenges: the accurate characterization of the prior distribution and the\nefficient evaluation of the likelihood. In the context of Bayesian studies on\ntomography, principal component analysis (PCA) can in some cases facilitate the\nstraightforward definition of the prior distribution, while simultaneously\nenabling the implementation of accurate surrogate models based on polynomial\nchaos expansion (PCE) to replace computationally intensive full-physics forward\nsolvers. When faced with scenarios where PCA does not offer a direct means of\neasily defining the prior distribution alternative methods like deep generative\nmodels (e.g., variational autoencoders (VAEs)), can be employed as viable\noptions. However, accurately producing a surrogate capable of capturing the\nintricate non-linear relationship between the latent parameters of a VAE and\nthe outputs of forward modeling presents a notable challenge. 
Indeed, while PCE\nmodels provide high accuracy when the input-output relationship can be\neffectively approximated by relatively low-degree multivariate polynomials,\nthis condition is typically unmet when utilizing latent variables derived from\ndeep generative models. In this contribution, we present a strategy that\ncombines the excellent reconstruction performance of VAEs in terms of prior\nrepresentation with the accuracy of PCA-PCE surrogate modeling in the context\nof Bayesian ground penetrating radar (GPR) travel-time tomography. Within the\nMCMC process, the parametrization of the VAE is leveraged for prior exploration\nand sample proposal. Concurrently, modeling is conducted using PCE, which\noperates on either globally or locally defined principal components of the VAE\nsamples under examination.\n","authors":["Giovanni Angelo Meles","Macarena Amaya","Shiran Levy","Stefano Marelli","Niklas Linde"],"pdf_url":"https://arxiv.org/pdf/2307.04228v2.pdf","comment":"25 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.10142v1","updated":"2023-07-19T17:12:28Z","published":"2023-07-19T17:12:28Z","title":"Benchmarking Potential Based Rewards for Learning Humanoid Locomotion","summary":" The main challenge in developing effective reinforcement learning (RL)\npipelines is often the design and tuning of the reward functions. A well-designed\nshaping reward can lead to significantly faster learning. Naively formulated\nrewards, however, can conflict with the desired behavior and result in\noverfitting or even erratic performance if not properly tuned. In theory, the\nbroad class of potential based reward shaping (PBRS) can help guide the\nlearning process without affecting the optimal policy. Although several studies\nhave explored the use of potential based reward shaping to accelerate learning\nconvergence, most have been limited to grid-worlds and low-dimensional systems,\nand RL in robotics has predominantly relied on standard forms of reward\nshaping. In this paper, we benchmark standard forms of shaping with PBRS for a\nhumanoid robot. We find that in this high-dimensional system, PBRS has only\nmarginal benefits in convergence speed. However, the PBRS reward terms are\nsignificantly more robust to scaling than typical reward shaping approaches,\nand thus easier to tune.\n","authors":["Se Hwan Jeon","Steve Heim","Charles Khazoom","Sangbae Kim"],"pdf_url":"https://arxiv.org/pdf/2307.10142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08169v2","updated":"2023-07-19T16:45:18Z","published":"2022-09-16T20:52:39Z","title":"Value Summation: A Novel Scoring Function for MPC-based Model-based\n Reinforcement Learning","summary":" This paper proposes a novel scoring function for the planning module of\nMPC-based reinforcement learning methods to address the inherent bias of using\nthe reward function to score trajectories. The proposed method enhances the\nlearning efficiency of existing MPC-based MBRL methods using the discounted sum\nof values. The method utilizes optimal trajectories to guide policy learning\nand updates its state-action value function based on real-world and augmented\nonboard data. The learning efficiency of the proposed method is evaluated in\nselected MuJoCo Gym environments as well as in learning locomotion skills for a\nsimulated model of the Cassie robot. 
The results demonstrate that the proposed\nmethod outperforms the current state-of-the-art algorithms in terms of learning\nefficiency and average reward return.\n","authors":["Mehran Raisi","Amirhossein Noohian","Luc Mccutcheon","Saber Fallah"],"pdf_url":"https://arxiv.org/pdf/2209.08169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09191v2","updated":"2023-07-19T16:24:31Z","published":"2023-07-17T13:17:26Z","title":"A benchmark of categorical encoders for binary classification","summary":" Categorical encoders transform categorical features into numerical\nrepresentations that are indispensable for a wide range of machine learning\nmodels. Existing encoder benchmark studies lack generalizability because of\ntheir limited choice of (1) encoders, (2) experimental factors, and (3)\ndatasets. Additionally, inconsistencies arise from the adoption of varying\naggregation strategies. This paper is the most comprehensive benchmark of\ncategorical encoders to date, including an extensive evaluation of 32\nconfigurations of encoders from diverse families, with 36 combinations of\nexperimental factors, and on 50 datasets. The study shows the profound\ninfluence of dataset selection, experimental factors, and aggregation\nstrategies on the benchmark's conclusions -- aspects disregarded in previous\nencoder benchmarks.\n","authors":["Federico Matteucci","Vadim Arzamasov","Klemens Boehm"],"pdf_url":"https://arxiv.org/pdf/2307.09191v2.pdf","comment":"Submitted to the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2103.03328v3","updated":"2023-07-19T16:19:53Z","published":"2021-03-04T20:58:22Z","title":"Evaluation of Complexity Measures for Deep Learning Generalization in\n Medical Image Analysis","summary":" The generalization performance of deep learning models for medical image\nanalysis often decreases on images collected with different devices for data\nacquisition, device settings, or patient population. A better understanding of\nthe generalization capacity on new images is crucial for clinicians'\ntrustworthiness in deep learning. Although significant research efforts have\nbeen recently directed toward establishing generalization bounds and complexity\nmeasures, still, there is often a significant discrepancy between the predicted\nand actual generalization performance. As well, related large empirical studies\nhave been primarily based on validation with general-purpose image datasets.\nThis paper presents an empirical study that investigates the correlation\nbetween 25 complexity measures and the generalization abilities of supervised\ndeep learning classifiers for breast ultrasound images. The results indicate\nthat PAC-Bayes flatness-based and path norm-based measures produce the most\nconsistent explanation for the combination of models and data. 
We also\ninvestigate the use of multi-task classification and segmentation approach for\nbreast images, and report that such learning approach acts as an implicit\nregularizer and is conducive toward improved generalization.\n","authors":["Aleksandar Vakanski","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2103.03328v3.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2306.13197v2","updated":"2023-07-19T16:19:24Z","published":"2023-06-22T20:42:50Z","title":"Pre or Post-Softmax Scores in Gradient-based Attribution Methods, What\n is Best?","summary":" Gradient based attribution methods for neural networks working as classifiers\nuse gradients of network scores. Here we discuss the practical differences\nbetween using gradients of pre-softmax scores versus post-softmax scores, and\ntheir respective advantages and disadvantages.\n","authors":["Miguel Lerma","Mirtha Lucas"],"pdf_url":"https://arxiv.org/pdf/2306.13197v2.pdf","comment":"8 pages, 2 figures, 2023 IEEE 13th International Conference on\n Pattern Recognition Systems (ICPRS)"},{"id":"http://arxiv.org/abs/2210.12547v2","updated":"2023-07-19T16:16:50Z","published":"2022-10-22T20:42:06Z","title":"SurCo: Learning Linear Surrogates For Combinatorial Nonlinear\n Optimization Problems","summary":" Optimization problems with nonlinear cost functions and combinatorial\nconstraints appear in many real-world applications but remain challenging to\nsolve efficiently compared to their linear counterparts. To bridge this gap, we\npropose $\\textbf{SurCo}$ that learns linear $\\underline{\\text{Sur}}$rogate\ncosts which can be used in existing $\\underline{\\text{Co}}$mbinatorial solvers\nto output good solutions to the original nonlinear combinatorial optimization\nproblem. The surrogate costs are learned end-to-end with nonlinear loss by\ndifferentiating through the linear surrogate solver, combining the flexibility\nof gradient-based methods with the structure of linear combinatorial\noptimization. We propose three $\\texttt{SurCo}$ variants:\n$\\texttt{SurCo}-\\texttt{zero}$ for individual nonlinear problems,\n$\\texttt{SurCo}-\\texttt{prior}$ for problem distributions, and\n$\\texttt{SurCo}-\\texttt{hybrid}$ to combine both distribution and\nproblem-specific information. We give theoretical intuition motivating\n$\\texttt{SurCo}$, and evaluate it empirically. Experiments show that\n$\\texttt{SurCo}$ finds better solutions faster than state-of-the-art and domain\nexpert approaches in real-world optimization problems such as embedding table\nsharding, inverse photonic design, and nonlinear route planning.\n","authors":["Aaron Ferber","Taoan Huang","Daochen Zha","Martin Schubert","Benoit Steiner","Bistra Dilkina","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2210.12547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05783v4","updated":"2023-07-19T16:14:31Z","published":"2023-02-11T21:07:30Z","title":"ConCerNet: A Contrastive Learning Based Framework for Automated\n Conservation Law Discovery and Trustworthy Dynamical System Prediction","summary":" Deep neural networks (DNN) have shown great capacity of modeling a dynamical\nsystem; nevertheless, they usually do not obey physics constraints such as\nconservation laws. This paper proposes a new learning framework named ConCerNet\nto improve the trustworthiness of the DNN based dynamics modeling to endow the\ninvariant properties. 
ConCerNet consists of two steps: (i) a contrastive\nlearning method to automatically capture the system invariants (i.e.\nconservation properties) along the trajectory observations; (ii) a neural\nprojection layer to guarantee that the learned dynamics models preserve the\nlearned invariants. We theoretically prove the functional relationship between\nthe learned latent representation and the unknown system invariant function.\nExperiments show that our method consistently outperforms the baseline neural\nnetworks in both coordinate error and conservation metrics by a large margin.\nWith neural network based parameterization and no dependence on prior\nknowledge, our method can be extended to complex and large-scale dynamics by\nleveraging an autoencoder.\n","authors":["Wang Zhang","Tsui-Wei Weng","Subhro Das","Alexandre Megretski","Luca Daniel","Lam M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2302.05783v4.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2307.10098v1","updated":"2023-07-19T16:13:13Z","published":"2023-07-19T16:13:13Z","title":"Gradient Sparsification For Masked Fine-Tuning of Transformers","summary":" Fine-tuning pretrained self-supervised language models is widely adopted for\ntransfer learning to downstream tasks. Fine-tuning can be achieved by freezing\ngradients of the pretrained network and only updating gradients of a newly\nadded classification layer, or by performing gradient updates on all\nparameters. Gradual unfreezing makes a trade-off between the two by gradually\nunfreezing gradients of whole layers during training. This has been an\neffective strategy to trade-off between storage and training speed with\ngeneralization performance. However, it is not clear whether gradually\nunfreezing layers throughout training is optimal, compared to sparse variants\nof gradual unfreezing which may improve fine-tuning performance. In this paper,\nwe propose to stochastically mask gradients to regularize pretrained language\nmodels for improving overall fine-tuned performance. We introduce GradDrop and\nvariants thereof, a class of gradient sparsification methods that mask\ngradients during the backward pass, acting as gradient noise. GradDrop is\nsparse and stochastic unlike gradual freezing. Extensive experiments on the\nmultilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive\nagainst methods that use additional translated data for intermediate\npretraining and outperforms standard fine-tuning and gradual unfreezing. A\npost-analysis shows how GradDrop improves performance with languages it was not\ntrained on, such as under-resourced languages.\n","authors":["James O' Neill","Sourav Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10098v1.pdf","comment":"Accepted to IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.10093v1","updated":"2023-07-19T16:00:29Z","published":"2023-07-19T16:00:29Z","title":"Revisiting invariances and introducing priors in Gromov-Wasserstein\n distances","summary":" Gromov-Wasserstein distance has found many applications in machine learning\ndue to its ability to compare measures across metric spaces and its invariance\nto isometric transformations. However, in certain applications, this invariance\nproperty can be too flexible, thus undesirable. Moreover, the\nGromov-Wasserstein distance solely considers pairwise sample similarities in\ninput datasets, disregarding the raw feature representations. 
We propose a new\noptimal transport-based distance, called Augmented Gromov-Wasserstein, that\nallows for some control over the level of rigidity to transformations. It also\nincorporates feature alignments, enabling us to better leverage prior knowledge\non the input data for improved performance. We present theoretical insights\ninto the proposed metric. We then demonstrate its usefulness for single-cell\nmulti-omic alignment tasks and a transfer learning scenario in machine\nlearning.\n","authors":["Pinar Demetci","Quang Huy Tran","Ievgen Redko","Ritambhara Singh"],"pdf_url":"https://arxiv.org/pdf/2307.10093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04838v2","updated":"2023-07-19T15:59:03Z","published":"2023-07-10T18:15:03Z","title":"CREPE: Learnable Prompting With CLIP Improves Visual Relationship\n Prediction","summary":" In this paper, we explore the potential of Vision-Language Models (VLMs),\nspecifically CLIP, in predicting visual object relationships, which involves\ninterpreting visual features from images into language-based relations. Current\nstate-of-the-art methods use complex graphical models that utilize language\ncues and visual features to address this challenge. We hypothesize that the\nstrong language priors in CLIP embeddings can simplify these graphical models\npaving for a simpler approach. We adopt the UVTransE relation prediction\nframework, which learns the relation as a translational embedding with subject,\nobject, and union box embeddings from a scene. We systematically explore the\ndesign of CLIP-based subject, object, and union-box representations within the\nUVTransE framework and propose CREPE (CLIP Representation Enhanced Predicate\nEstimation). CREPE utilizes text-based representations for all three bounding\nboxes and introduces a novel contrastive training strategy to automatically\ninfer the text prompt for union-box. Our approach achieves state-of-the-art\nperformance in predicate estimation, mR@5 27.79, and mR@20 31.95 on the Visual\nGenome benchmark, achieving a 15.3\\% gain in performance over recent\nstate-of-the-art at mR@20. This work demonstrates CLIP's effectiveness in\nobject relation prediction and encourages further research on VLMs in this\nchallenging domain.\n","authors":["Rakshith Subramanyam","T. S. Jayram","Rushil Anirudh","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2307.04838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10088v1","updated":"2023-07-19T15:57:24Z","published":"2023-07-19T15:57:24Z","title":"Android in the Wild: A Large-Scale Dataset for Android Device Control","summary":" There is a growing interest in device-control systems that can interpret\nhuman natural language instructions and execute them on a digital device by\ndirectly controlling its user interface. We present a dataset for\ndevice-control research, Android in the Wild (AITW), which is orders of\nmagnitude larger than current datasets. The dataset contains human\ndemonstrations of device interactions, including the screens and actions, and\ncorresponding natural language instructions. It consists of 715k episodes\nspanning 30k unique instructions, four versions of Android (v10-13),and eight\ndevice types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It\ncontains multi-step tasks that require semantic understanding of language and\nvisual context. This dataset poses a new challenge: actions available through\nthe user interface must be inferred from their visual appearance. 
And, instead\nof simple UI element-based actions, the action space consists of precise\ngestures (e.g., horizontal scrolls to operate carousel widgets). We organize\nour dataset to encourage robustness analysis of device-control systems, i.e.,\nhow well a system performs in the presence of new task descriptions, new\napplications, or new platform versions. We develop two agents and report\nperformance across the dataset. The dataset is available at\nhttps://github.com/google-research/google-research/tree/master/android_in_the_wild.\n","authors":["Christopher Rawles","Alice Li","Daniel Rodriguez","Oriana Riva","Timothy Lillicrap"],"pdf_url":"https://arxiv.org/pdf/2307.10088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10078v1","updated":"2023-07-19T15:51:25Z","published":"2023-07-19T15:51:25Z","title":"A Dual Formulation for Probabilistic Principal Component Analysis","summary":" In this paper, we characterize Probabilistic Principal Component Analysis in\nHilbert spaces and demonstrate how the optimal solution admits a representation\nin dual space. This allows us to develop a generative framework for kernel\nmethods. Furthermore, we show how it englobes Kernel Principal Component\nAnalysis and illustrate its working on a toy and a real dataset.\n","authors":["Henri De Plaen","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2307.10078v1.pdf","comment":"ICML 2023 Workshop on Duality for Modern Machine Learning (DP4ML). 14\n pages (8 main + 5 appendix), 4 figures and 4 tables"},{"id":"http://arxiv.org/abs/2212.00736v2","updated":"2023-07-19T15:43:40Z","published":"2022-12-01T18:29:48Z","title":"An exponentially-growing family of universal quantum circuits","summary":" Quantum machine learning has become an area of growing interest but has\ncertain theoretical and hardware-specific limitations. Notably, the problem of\nvanishing gradients, or barren plateaus, renders the training impossible for\ncircuits with high qubit counts, imposing a limit on the number of qubits that\ndata scientists can use for solving problems. Independently, angle-embedded\nsupervised quantum neural networks were shown to produce truncated Fourier\nseries with a degree directly dependent on two factors: the depth of the\nencoding and the number of parallel qubits the encoding applied to. The degree\nof the Fourier series limits the model expressivity. This work introduces two\nnew architectures whose Fourier degrees grow exponentially: the sequential and\nparallel exponential quantum machine learning architectures. This is done by\nefficiently using the available Hilbert space when encoding, increasing the\nexpressivity of the quantum encoding. Therefore, the exponential growth allows\nstaying at the low-qubit limit to create highly expressive circuits avoiding\nbarren plateaus. 
Practically, parallel exponential architecture was shown to\noutperform the existing linear architectures by reducing their final mean\nsquare error value by up to 44.7% in a one-dimensional test problem.\nFurthermore, the feasibility of this technique was also shown on a trapped ion\nquantum processing unit.\n","authors":["Mo Kordzanganeh","Pavel Sekatski","Markus Pflitsch","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2212.00736v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10062v1","updated":"2023-07-19T15:33:11Z","published":"2023-07-19T15:33:11Z","title":"Unsupervised Accuracy Estimation of Deep Visual Models using\n Domain-Adaptive Adversarial Perturbation without Source Samples","summary":" Deploying deep visual models can lead to performance drops due to the\ndiscrepancies between source and target distributions. Several approaches\nleverage labeled source data to estimate target domain accuracy, but accessing\nlabeled source data is often prohibitively difficult due to data\nconfidentiality or resource limitations on serving devices. Our work proposes a\nnew framework to estimate model accuracy on unlabeled target data without\naccess to source data. We investigate the feasibility of using pseudo-labels\nfor accuracy estimation and evolve this idea into adopting recent advances in\nsource-free domain adaptation algorithms. Our approach measures the\ndisagreement rate between the source hypothesis and the target pseudo-labeling\nfunction, adapted from the source hypothesis. We mitigate the impact of\nerroneous pseudo-labels that may arise due to a high ideal joint hypothesis\nrisk by employing adaptive adversarial perturbation on the input of the target\nmodel. Our proposed source-free framework effectively addresses the challenging\ndistribution shift scenarios and outperforms existing methods requiring source\ndata and labels for training.\n","authors":["JoonHo Lee","Jae Oh Woo","Hankyu Moon","Kwonho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10062v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10060v1","updated":"2023-07-19T15:30:06Z","published":"2023-07-19T15:30:06Z","title":"Accurate deep learning sub-grid scale models for large eddy simulations","summary":" We present two families of sub-grid scale (SGS) turbulence models developed\nfor large-eddy simulation (LES) purposes. Their development required the\nformulation of physics-informed robust and efficient Deep Learning (DL)\nalgorithms which, unlike state-of-the-art analytical modeling techniques can\nproduce high-order complex non-linear relations between inputs and outputs.\nExplicit filtering of data from direct simulations of the canonical channel\nflow at two friction Reynolds numbers $Re_\\tau\\approx 395$ and 590 provided\naccurate data for training and testing. The two sets of models use different\nnetwork architectures. One of the architectures uses tensor basis neural\nnetworks (TBNN) and embeds the simplified analytical model form of the general\neffective-viscosity hypothesis, thus incorporating the Galilean, rotational and\nreflectional invariances. The other architecture is that of a relatively simple\nnetwork, that is able to incorporate the Galilean invariance only. However,\nthis simpler architecture has better feature extraction capacity owing to its\nability to establish relations between and extract information from\ncross-components of the integrity basis tensors and the SGS stresses. 
Both sets\nof models are used to predict the SGS stresses for feature datasets generated\nwith different filter widths, and at different Reynolds numbers. It is shown\nthat due to the simpler model's better feature learning capabilities, it\noutperforms the invariance embedded model in statistical performance metrics.\nIn a priori tests, both sets of models provide similar levels of dissipation\nand backscatter. Based on the test results, both sets of models should be\nusable in a posteriori actual LESs.\n","authors":["Rikhi Bose","Arunabha M. Roy"],"pdf_url":"https://arxiv.org/pdf/2307.10060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10053v1","updated":"2023-07-19T15:26:18Z","published":"2023-07-19T15:26:18Z","title":"Convergence Guarantees for Stochastic Subgradient Methods in Nonsmooth\n Nonconvex Optimization","summary":" In this paper, we investigate the convergence properties of the stochastic\ngradient descent (SGD) method and its variants, especially in training neural\nnetworks built from nonsmooth activation functions. We develop a novel\nframework that assigns different timescales to stepsizes for updating the\nmomentum terms and variables, respectively. Under mild conditions, we prove the\nglobal convergence of our proposed framework in both single-timescale and\ntwo-timescale cases. We show that our proposed framework encompasses a wide\nrange of well-known SGD-type methods, including heavy-ball SGD, SignSGD, Lion,\nnormalized SGD and clipped SGD. Furthermore, when the objective function adopts\na finite-sum formulation, we prove the convergence properties for these\nSGD-type methods based on our proposed framework. In particular, we prove that\nthese SGD-type methods find the Clarke stationary points of the objective\nfunction with randomly chosen stepsizes and initial points under mild\nassumptions. Preliminary numerical experiments demonstrate the high efficiency\nof our analyzed SGD-type methods.\n","authors":["Nachuan Xiao","Xiaoyin Hu","Kim-Chuan Toh"],"pdf_url":"https://arxiv.org/pdf/2307.10053v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2303.15592v2","updated":"2023-07-19T15:16:21Z","published":"2023-03-27T20:49:42Z","title":"Uncovering Bias in Personal Informatics","summary":" Personal informatics (PI) systems, powered by smartphones and wearables,\nenable people to lead healthier lifestyles by providing meaningful and\nactionable insights that break down barriers between users and their health\ninformation. Today, such systems are used by billions of users for monitoring\nnot only physical activity and sleep but also vital signs and women's and heart\nhealth, among others. Despite their widespread usage, the processing of\nsensitive PI data may suffer from biases, which may entail practical and\nethical implications. In this work, we present the first comprehensive\nempirical and analytical study of bias in PI systems, including biases in raw\ndata and in the entire machine learning life cycle. We use the most detailed\nframework to date for exploring the different sources of bias and find that\nbiases exist both in the data generation and the model learning and\nimplementation streams. 
According to our results, the most affected minority\ngroups are users with health issues, such as diabetes, joint issues, and\nhypertension, and female users, whose data biases are propagated or even\namplified by learning models, while intersectional biases can also be observed.\n","authors":["Sofia Yfantidou","Pavlos Sermpezis","Athena Vakali","Ricardo Baeza-Yates"],"pdf_url":"https://arxiv.org/pdf/2303.15592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10026v1","updated":"2023-07-19T15:11:04Z","published":"2023-07-19T15:11:04Z","title":"Contextual Reliability: When Different Features Matter in Different\n Contexts","summary":" Deep neural networks often fail catastrophically by relying on spurious\ncorrelations. Most prior work assumes a clear dichotomy into spurious and\nreliable features; however, this is often unrealistic. For example, most of the\ntime we do not want an autonomous car to simply copy the speed of surrounding\ncars -- we don't want our car to run a red light if a neighboring car does so.\nHowever, we cannot simply enforce invariance to next-lane speed, since it could\nprovide valuable information about an unobservable pedestrian at a crosswalk.\nThus, universally ignoring features that are sometimes (but not always)\nreliable can lead to non-robust performance. We formalize a new setting called\ncontextual reliability which accounts for the fact that the \"right\" features to\nuse may vary depending on the context. We propose and analyze a two-stage\nframework called Explicit Non-spurious feature Prediction (ENP) which first\nidentifies the relevant features to use for a given context, then trains a\nmodel to rely exclusively on these features. Our work theoretically and\nempirically demonstrates the advantages of ENP over existing methods and\nprovides new benchmarks for contextual reliability.\n","authors":["Gaurav Ghosal","Amrith Setlur","Daniel S. Brown","Anca D. Dragan","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2307.10026v1.pdf","comment":"ICML 2023 Camera Ready Version"},{"id":"http://arxiv.org/abs/2307.10022v1","updated":"2023-07-19T15:05:55Z","published":"2023-07-19T15:05:55Z","title":"Europepolls: A Dataset of Country-Level Opinion Polling Data for the\n European Union and the UK","summary":" I propose an open dataset of country-level historical opinion polling data\nfor the European Union and the UK. The dataset aims to fill a gap in available\nopinion polling data for the European Union. Some existing datasets are\nrestricted to the past five years, limiting research opportunities. At the same\ntime, some larger proprietary datasets exist but are available only in a visual\npreprocessed time series format. Finally, while other large datasets for\nindividual countries might exist, these could be inaccessible due to language\nbarriers. The data was gathered from Wikipedia, and preprocessed using the\npandas library. Both the raw and the preprocessed data are in the .csv format.\nI hope that given the recent advances in LLMs and deep learning in general,\nthis large dataset will enable researchers to uncover complex interactions\nbetween multimodal data (news articles, economic indicators, social media) and\nvoting behavior. 
The raw data, the preprocessed data, and the preprocessing\nscripts are available on GitHub.\n","authors":["Konstantinos Pitas"],"pdf_url":"https://arxiv.org/pdf/2307.10022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15585v2","updated":"2023-07-19T15:00:06Z","published":"2023-03-27T20:28:26Z","title":"Beyond Accuracy: A Critical Review of Fairness in Machine Learning for\n Mobile and Wearable Computing","summary":" The field of mobile and wearable computing is undergoing a revolutionary\nintegration of machine learning. Devices can now diagnose diseases, predict\nheart irregularities, and unlock the full potential of human cognition.\nHowever, the underlying algorithms powering these predictions are not immune to\nbiases with respect to sensitive attributes (e.g., gender, race), leading to\ndiscriminatory outcomes. The goal of this work is to explore the extent to\nwhich the mobile and wearable computing community has adopted ways of reporting\ninformation about datasets and models to surface and, eventually, counter\nbiases. Our systematic review of papers published in the Proceedings of the ACM\nInteractive, Mobile, Wearable and Ubiquitous Technologies (IMWUT) journal from\n2018-2022 indicates that, while there has been progress made on algorithmic\nfairness, there is still ample room for growth. Our findings show that only a\nsmall portion (5%) of published papers adheres to modern fairness reporting,\nwhile the overwhelming majority thereof focuses on accuracy or error metrics.\nTo generalize these results across venues of similar scope, we analyzed recent\nproceedings of ACM MobiCom, MobiSys, and SenSys, IEEE Pervasive, and IEEE\nTransactions on Mobile Computing, and found no deviation from our\nprimary result. In light of these findings, our work provides practical\nguidelines for the design and development of mobile and wearable technologies\nthat not only strive for accuracy but also fairness.\n","authors":["Sofia Yfantidou","Marios Constantinides","Dimitris Spathis","Athena Vakali","Daniele Quercia","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2303.15585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.09753v3","updated":"2023-07-19T14:55:17Z","published":"2022-02-20T07:42:00Z","title":"Finite-Time Analysis of Natural Actor-Critic for POMDPs","summary":" We consider the reinforcement learning problem for partially observed Markov\ndecision processes (POMDPs) with large or even countably infinite state spaces,\nwhere the controller has access to only noisy observations of the underlying\ncontrolled Markov chain. We consider a natural actor-critic method that employs\na finite internal memory for policy parameterization, and a multi-step temporal\ndifference learning algorithm for policy evaluation. We establish, to the best\nof our knowledge, the first non-asymptotic global convergence of actor-critic\nmethods for partially observed systems under function approximation. In\nparticular, in addition to the function approximation and statistical errors\nthat also arise in MDPs, we explicitly characterize the error due to the use of\nfinite-state controllers. This additional error is stated in terms of the total\nvariation distance between the traditional belief state in POMDPs and the\nposterior distribution of the hidden state when using a finite-state\ncontroller. Further, we show that this error can be made small in the case of\nsliding-block controllers by using larger block sizes.\n","authors":["Semih Cayci","Niao He","R. 
Srikant"],"pdf_url":"https://arxiv.org/pdf/2202.09753v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06385v2","updated":"2023-07-19T14:51:37Z","published":"2023-07-12T18:13:58Z","title":"Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event\n Localization","summary":" Audio-Visual Event Localization (AVEL) is the task of temporally localizing\nand classifying \\emph{audio-visual events}, i.e., events simultaneously visible\nand audible in a video. In this paper, we solve AVEL in a weakly-supervised\nsetting, where only video-level event labels (their presence/absence, but not\ntheir locations in time) are available as supervision for training. Our idea is\nto use a base model to estimate labels on the training data at a finer temporal\nresolution than at the video level and re-train the model with these labels.\nI.e., we determine the subset of labels for each \\emph{slice} of frames in a\ntraining video by (i) replacing the frames outside the slice with those from a\nsecond video having no overlap in video-level labels, and (ii) feeding this\nsynthetic video into the base model to extract labels for just the slice in\nquestion. To handle the out-of-distribution nature of our synthetic videos, we\npropose an auxiliary objective for the base model that induces more reliable\npredictions of the localized event labels as desired. Our three-stage pipeline\noutperforms several existing AVEL methods with no architectural changes and\nimproves performance on a related weakly-supervised task as well.\n","authors":["Kalyan Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2307.06385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.07677v4","updated":"2023-07-19T14:45:15Z","published":"2021-06-14T18:01:08Z","title":"Planning to Fairly Allocate: Probabilistic Fairness in the Restless\n Bandit Setting","summary":" Restless and collapsing bandits are often used to model budget-constrained\nresource allocation in settings where arms have action-dependent transition\nprobabilities, such as the allocation of health interventions among patients.\nHowever, state-of-the-art Whittle-index-based approaches to this planning\nproblem either do not consider fairness among arms, or incentivize fairness\nwithout guaranteeing it. We thus introduce ProbFair, a probabilistically fair\npolicy that maximizes total expected reward and satisfies the budget constraint\nwhile ensuring a strictly positive lower bound on the probability of being\npulled at each timestep. We evaluate our algorithm on a real-world application,\nwhere interventions support continuous positive airway pressure (CPAP) therapy\nadherence among patients, as well as on a broader class of synthetic transition\nmatrices. We find that ProbFair preserves utility while providing fairness\nguarantees.\n","authors":["Christine Herlihy","Aviva Prins","Aravind Srinivasan","John P. Dickerson"],"pdf_url":"https://arxiv.org/pdf/2106.07677v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11103v2","updated":"2023-07-19T14:42:10Z","published":"2023-03-20T13:40:11Z","title":"Sionna RT: Differentiable Ray Tracing for Radio Propagation Modeling","summary":" Sionna is a GPU-accelerated open-source library for link-level simulations\nbased on TensorFlow. Since release v0.14 it integrates a differentiable ray\ntracer (RT) for the simulation of radio wave propagation. 
This unique feature\nallows for the computation of gradients of the channel impulse response and\nother related quantities with respect to many system and environment\nparameters, such as material properties, antenna patterns, array geometries, as\nwell as transmitter and receiver orientations and positions. In this paper, we\noutline the key components of Sionna RT and showcase example applications such\nas learning radio materials and optimizing transmitter orientations by gradient\ndescent. While classic ray tracing is a crucial tool for 6G research topics\nlike reconfigurable intelligent surfaces, integrated sensing and\ncommunications, as well as user localization, differentiable ray tracing is a\nkey enabler for many novel and exciting research directions, for example,\ndigital twins.\n","authors":["Jakob Hoydis","Fayçal Aït Aoudia","Sebastian Cammerer","Merlin Nimier-David","Nikolaus Binder","Guillermo Marcus","Alexander Keller"],"pdf_url":"https://arxiv.org/pdf/2303.11103v2.pdf","comment":"5 pages, 5 figures, update reflects new features of Sionna RT\n introduced in release v0.15"},{"id":"http://arxiv.org/abs/2208.07734v6","updated":"2023-07-19T14:39:54Z","published":"2022-08-16T13:09:25Z","title":"Data Augmentation is a Hyperparameter: Cherry-picked Self-Supervision\n for Unsupervised Anomaly Detection is Creating the Illusion of Success","summary":" Self-supervised learning (SSL) has emerged as a promising alternative to\ncreate supervisory signals to real-world problems, avoiding the extensive cost\nof manual labeling. SSL is particularly attractive for unsupervised tasks such\nas anomaly detection (AD), where labeled anomalies are rare or often\nnonexistent. A large catalog of augmentation functions has been used for\nSSL-based AD (SSAD) on image data, and recent works have reported that the type\nof augmentation has a significant impact on accuracy. Motivated by those, this\nwork sets out to put image-based SSAD under a larger lens and investigate the\nrole of data augmentation in SSAD. Through extensive experiments on 3 different\ndetector models and across 420 AD tasks, we provide comprehensive numerical and\nvisual evidences that the alignment between data augmentation and\nanomaly-generating mechanism is the key to the success of SSAD, and in the lack\nthereof, SSL may even impair accuracy. To the best of our knowledge, this is\nthe first meta-analysis on the role of data augmentation in SSAD.\n","authors":["Jaemin Yoo","Tiancheng Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2208.07734v6.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). 
Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10403v2","updated":"2023-07-19T14:23:17Z","published":"2022-05-20T18:44:06Z","title":"Tackling Provably Hard Representative Selection via Graph Neural\n Networks","summary":" Representative Selection (RS) is the problem of finding a small subset of\nexemplars from a dataset that is representative of the dataset. In this paper,\nwe study RS for attributed graphs, and focus on finding representative nodes\nthat optimize the accuracy of a model trained on the selected representatives.\nTheoretically, we establish a new hardness result for RS (in the absence of a\ngraph structure) by proving that a particular, highly practical variant of it\n(RS for Learning) is hard to approximate in polynomial time within any\nreasonable factor, which implies a significant potential gap between the\noptimum solution of widely-used surrogate functions and the actual accuracy of\nthe model. We then study the setting where a (homophilous) graph structure is\navailable, or can be constructed, between the data points. We show that with an\nappropriate modeling approach, the presence of such a structure can turn a hard\nRS (for learning) problem into one that can be effectively solved. To this end,\nwe develop RS-GNN, a representation learning-based RS model based on Graph\nNeural Networks. Empirically, we demonstrate the effectiveness of RS-GNN on\nproblems with predefined graph structures as well as problems with graphs\ninduced from node feature similarities, by showing that RS-GNN achieves\nsignificant improvements over established baselines on a suite of eight\nbenchmarks.\n","authors":["Mehran Kazemi","Anton Tsitsulin","Hossein Esfandiari","MohammadHossein Bateni","Deepak Ramachandran","Bryan Perozzi","Vahab Mirrokni"],"pdf_url":"https://arxiv.org/pdf/2205.10403v2.pdf","comment":"Accepted at the Transactions of Machine Learning Research (TMLR)\n Journal"},{"id":"http://arxiv.org/abs/2307.08913v2","updated":"2023-07-19T14:18:00Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. 
Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approaches. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.15851v2","updated":"2023-07-19T14:16:22Z","published":"2023-05-25T08:43:11Z","title":"On sampling determinantal and Pfaffian point processes on a quantum\n computer","summary":" DPPs were introduced by Macchi as a model in quantum optics in the 1970s. Since\nthen, they have been widely used as models and subsampling tools in statistics\nand computer science. Most applications require sampling from a DPP, and given\ntheir quantum origin, it is natural to wonder whether sampling a DPP on a\nquantum computer is easier than on a classical one. We focus here on DPPs over\na finite state space, which are distributions over the subsets of\n$\{1,\dots,N\}$ parametrized by an $N\times N$ Hermitian kernel matrix. Vanilla\nsampling consists in two steps, of respective costs $\mathcal{O}(N^3)$ and\n$\mathcal{O}(Nr^2)$ operations on a classical computer, where $r$ is the rank\nof the kernel matrix. A large first part of the current paper consists in\nexplaining why the state-of-the-art in quantum simulation of fermionic systems\nalready yields quantum DPP sampling algorithms. We then modify existing quantum\ncircuits, and discuss their insertion in a full DPP sampling pipeline that\nstarts from practical kernel specifications. The bottom line is that, with $P$\n(classical) parallel processors, we can divide the preprocessing cost by $P$\nand build a quantum circuit with $\mathcal{O}(Nr)$ gates that sample a given\nDPP, with depth varying from $\mathcal{O}(N)$ to $\mathcal{O}(r\log N)$\ndepending on qubit-communication constraints on the target machine. We also\nconnect existing work on the simulation of superconductors to Pfaffian point\nprocesses, which generalize DPPs and would be a natural addition to the machine\nlearner's toolbox. Finally, the circuits are empirically validated on a\nclassical simulator and on 5-qubit machines.\n","authors":["Rémi Bardenet","Michaël Fanuel","Alexandre Feller"],"pdf_url":"https://arxiv.org/pdf/2305.15851v2.pdf","comment":"48 pages, 8 figures. 
Additional results about parity of cardinality\n of PfPP samples"},{"id":"http://arxiv.org/abs/2307.09994v1","updated":"2023-07-19T13:58:01Z","published":"2023-07-19T13:58:01Z","title":"Impact of Disentanglement on Pruning Neural Networks","summary":" Deploying deep learning neural networks on edge devices, to accomplish task\nspecific objectives in the real-world, requires a reduction in their memory\nfootprint, power consumption, and latency. This can be realized via efficient\nmodel compression. Disentangled latent representations produced by variational\nautoencoder (VAE) networks are a promising approach for achieving model\ncompression because they mainly retain task-specific information, discarding\nuseless information for the task at hand. We make use of the Beta-VAE framework\ncombined with a standard criterion for pruning to investigate the impact of\nforcing the network to learn disentangled representations on the pruning\nprocess for the task of classification. In particular, we perform experiments\non MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose\na path forward for future works.\n","authors":["Carl Shneider","Peyman Rostami","Anis Kacem","Nilotpal Sinha","Abd El Rahman Shabayek","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2307.09994v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2307.08347v2","updated":"2023-07-19T13:55:32Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\\%. Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09989v1","updated":"2023-07-19T13:49:35Z","published":"2023-07-19T13:49:35Z","title":"UniMatch: A Unified User-Item Matching Framework for the Multi-purpose\n Merchant Marketing","summary":" When doing private domain marketing with cloud services, the merchants\nusually have to purchase different machine learning models for the multiple\nmarketing purposes, leading to a very high cost. We present a unified user-item\nmatching framework to simultaneously conduct item recommendation and user\ntargeting with just one model. 
We empirically demonstrate that the above\nconcurrent modeling is viable via modeling the user-item interaction matrix\nwith the multinomial distribution, and propose a bidirectional bias-corrected\nNCE loss for the implementation. The proposed loss function guides the model to\nlearn the user-item joint probability $p(u,i)$ instead of the conditional\nprobability $p(i|u)$ or $p(u|i)$ through correcting both the users and items'\nbiases caused by the in-batch negative sampling. In addition, our framework is\nmodel-agnostic enabling a flexible adaptation of different model architectures.\nExtensive experiments demonstrate that our framework results in significant\nperformance gains in comparison with the state-of-the-art methods, with greatly\nreduced cost on computing resources and daily maintenance.\n","authors":["Qifang Zhao","Tianyu Li","Meng Du","Yu Jiang","Qinghui Sun","Zhongyao Wang","Hong Liu","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09988v1","updated":"2023-07-19T13:49:12Z","published":"2023-07-19T13:49:12Z","title":"TinyTrain: Deep Neural Network Training at the Extreme Edge","summary":" On-device training is essential for user personalisation and privacy. With\nthe pervasiveness of IoT devices and microcontroller units (MCU), this task\nbecomes more challenging due to the constrained memory and compute resources,\nand the limited availability of labelled user data. Nonetheless, prior works\nneglect the data scarcity issue, require excessively long training time (e.g. a\nfew hours), or induce substantial accuracy loss ($\\geq$10\\%). We propose\nTinyTrain, an on-device training approach that drastically reduces training\ntime by selectively updating parts of the model and explicitly coping with data\nscarcity. TinyTrain introduces a task-adaptive sparse-update method that\ndynamically selects the layer/channel based on a multi-objective criterion that\njointly captures user data, the memory, and the compute capabilities of the\ntarget device, leading to high accuracy on unseen tasks with reduced\ncomputation and memory footprint. TinyTrain outperforms vanilla fine-tuning of\nthe entire network by 3.6-5.0\\% in accuracy, while reducing the backward-pass\nmemory and computation cost by up to 2,286$\\times$ and 7.68$\\times$,\nrespectively. Targeting broadly used real-world edge devices, TinyTrain\nachieves 9.5$\\times$ faster and 3.5$\\times$ more energy-efficient training over\nstatus-quo approaches, and 2.8$\\times$ smaller memory footprint than SOTA\napproaches, while remaining within the 1 MB memory envelope of MCU-grade\nplatforms.\n","authors":["Young D. Kwon","Rui Li","Stylianos I. Venieris","Jagmohan Chauhan","Nicholas D. Lane","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2307.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14518v2","updated":"2023-07-19T13:48:46Z","published":"2023-02-28T12:13:57Z","title":"Generalization Error Bounds for Noisy, Iterative Algorithms via Maximal\n Leakage","summary":" We adopt an information-theoretic framework to analyze the generalization\nbehavior of the class of iterative, noisy learning algorithms. This class is\nparticularly suitable for study under information-theoretic metrics as the\nalgorithms are inherently randomized, and it includes commonly used algorithms\nsuch as Stochastic Gradient Langevin Dynamics (SGLD). 
Herein, we use the\nmaximal leakage (equivalently, the Sibson mutual information of order infinity)\nmetric, as it is simple to analyze, and it implies both bounds on the\nprobability of having a large generalization error and on its expected value.\nWe show that, if the update function (e.g., gradient) is bounded in $L_2$-norm\nand the additive noise is isotropic Gaussian noise, then one can obtain an\nupper-bound on maximal leakage in semi-closed form. Furthermore, we demonstrate\nhow the assumptions on the update function affect the optimal (in the sense of\nminimizing the induced maximal leakage) choice of the noise. Finally, we\ncompute explicit tight upper bounds on the induced maximal leakage for other\nscenarios of interest.\n","authors":["Ibrahim Issa","Amedeo Roberto Esposito","Michael Gastpar"],"pdf_url":"https://arxiv.org/pdf/2302.14518v2.pdf","comment":"Updated to fix an error in Theorem 4 (asymptotic analysis)"},{"id":"http://arxiv.org/abs/2210.14037v2","updated":"2023-07-19T13:43:07Z","published":"2022-10-25T14:13:53Z","title":"Revisiting Softmax for Uncertainty Approximation in Text Classification","summary":" Uncertainty approximation in text classification is an important area with\napplications in domain adaptation and interpretability. One of the most widely\nused uncertainty approximation methods is Monte Carlo (MC) Dropout, which is\ncomputationally expensive as it requires multiple forward passes through the\nmodel. A cheaper alternative is to simply use the softmax based on a single\nforward pass without dropout to estimate model uncertainty. However, prior work\nhas indicated that these predictions tend to be overconfident. In this paper,\nwe perform a thorough empirical analysis of these methods on five datasets with\ntwo base neural architectures in order to identify the trade-offs between the\ntwo. We compare both softmax and an efficient version of MC Dropout on their\nuncertainty approximations and downstream text classification performance,\nwhile weighing their runtime (cost) against performance (benefit). We find\nthat, while MC dropout produces the best uncertainty approximations, using a\nsimple softmax leads to competitive and in some cases better uncertainty\nestimation for text classification at a much lower computational cost,\nsuggesting that softmax can in fact be a sufficient uncertainty estimate when\ncomputational resources are a concern.\n","authors":["Andreas Nugaard Holm","Dustin Wright","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2210.14037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09977v1","updated":"2023-07-19T13:33:43Z","published":"2023-07-19T13:33:43Z","title":"Learner Referral for Cost-Effective Federated Learning Over Hierarchical\n IoT Networks","summary":" The paradigm of federated learning (FL) to address data privacy concerns by\nlocally training parameters on resource-constrained clients in a distributed\nmanner has garnered significant attention. Nonetheless, FL is not applicable\nwhen not all clients within the coverage of the FL server are registered with\nthe FL network. To bridge this gap, this paper proposes joint learner referral\naided federated client selection (LRef-FedCS), along with communications and\ncomputing resource scheduling, and local model accuracy optimization (LMAO)\nmethods. These methods are designed to minimize the cost incurred by the\nworst-case participant and ensure the long-term fairness of FL in hierarchical\nInternet of Things (HieIoT) networks. 
Utilizing the Lyapunov optimization\ntechnique, we reformulate the original problem into a stepwise joint\noptimization problem (JOP). Subsequently, to tackle the mixed-integer\nnon-convex JOP, we separatively and iteratively address LRef-FedCS and LMAO\nthrough the centralized method and self-adaptive global best harmony search\n(SGHS) algorithm, respectively. To enhance scalability, we further propose a\ndistributed LRef-FedCS approach based on a matching game to replace the\ncentralized method described above. Numerical simulations and experimental\nresults on the MNIST/CIFAR-10 datasets demonstrate that our proposed LRef-FedCS\napproach could achieve a good balance between pursuing high global accuracy and\nreducing cost.\n","authors":["Yulan Gao","Ziqiang Ye","Yue Xiao","Wei Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.09977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03587v2","updated":"2023-07-19T13:23:29Z","published":"2023-07-07T13:29:07Z","title":"BOF-UCB: A Bayesian-Optimistic Frequentist Algorithm for Non-Stationary\n Contextual Bandits","summary":" We propose a novel Bayesian-Optimistic Frequentist Upper Confidence Bound\n(BOF-UCB) algorithm for stochastic contextual linear bandits in non-stationary\nenvironments. This unique combination of Bayesian and frequentist principles\nenhances adaptability and performance in dynamic settings. The BOF-UCB\nalgorithm utilizes sequential Bayesian updates to infer the posterior\ndistribution of the unknown regression parameter, and subsequently employs a\nfrequentist approach to compute the Upper Confidence Bound (UCB) by maximizing\nthe expected reward over the posterior distribution. We provide theoretical\nguarantees of BOF-UCB's performance and demonstrate its effectiveness in\nbalancing exploration and exploitation on synthetic datasets and classical\ncontrol tasks in a reinforcement learning setting. Our results show that\nBOF-UCB outperforms existing methods, making it a promising solution for\nsequential decision-making in non-stationary environments.\n","authors":["Nicklas Werge","Abdullah Akgül","Melih Kandemir"],"pdf_url":"https://arxiv.org/pdf/2307.03587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09946v2","updated":"2023-07-19T13:15:08Z","published":"2023-05-17T04:56:11Z","title":"AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for\n Survival Outcome Prediction from PET/CT Images","summary":" Survival prediction is a major concern for cancer management. Deep survival\nmodels based on deep learning have been widely adopted to perform end-to-end\nsurvival prediction from medical images. Recent deep survival models achieved\npromising performance by jointly performing tumor segmentation with survival\nprediction, where the models were guided to extract tumor-related information\nthrough Multi-Task Learning (MTL). However, these deep survival models have\ndifficulties in exploring out-of-tumor prognostic information. In addition,\nexisting deep survival models are unable to effectively leverage multi-modality\nimages. Empirically-designed fusion strategies were commonly adopted to fuse\nmulti-modality information via task-specific manually-designed networks, thus\nlimiting the adaptability to different scenarios. In this study, we propose an\nAdaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival\nprediction from PET/CT images. 
Instead of adopting MTL, we propose a novel\nSegmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained\nfor tumor segmentation and survival prediction sequentially in two stages. This\nstrategy enables the AdaMSS to focus on tumor regions in the first stage and\ngradually expand its focus to include other prognosis-related regions in the\nsecond stage. We also propose a data-driven strategy to fuse multi-modality\ninformation, which realizes adaptive optimization of fusion strategies based on\ntraining data during training. With the SSL and data-driven fusion strategies,\nour AdaMSS is designed as an adaptive model that can self-adapt its focus\nregions and fusion strategy for different training stages. Extensive\nexperiments with two large clinical datasets show that our AdaMSS outperforms\nstate-of-the-art survival prediction methods.\n","authors":["Mingyuan Meng","Bingxin Gu","Michael Fulham","Shaoli Song","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2305.09946v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2307.09964v1","updated":"2023-07-19T13:14:47Z","published":"2023-07-19T13:14:47Z","title":"Towards green AI-based software systems: an architecture-centric\n approach (GAISSA)","summary":" Nowadays, AI-based systems have achieved outstanding results and have\noutperformed humans in different domains. However, the processes of training AI\nmodels and inferring from them require high computational resources, which pose\na significant challenge in the current energy efficiency societal demand. To\ncope with this challenge, this research project paper describes the main\nvision, goals, and expected outcomes of the GAISSA project. The GAISSA project\naims at providing data scientists and software engineers tool-supported,\narchitecture-centric methods for the modelling and development of green\nAI-based systems. Although the project is in an initial stage, we describe the\ncurrent research results, which illustrate the potential to achieve GAISSA\nobjectives.\n","authors":["Silverio Martínez-Fernández","Xavier Franch","Francisco Durán"],"pdf_url":"https://arxiv.org/pdf/2307.09964v1.pdf","comment":"Accepted for publication as full paper - 2023 49th Euromicro\n Conference Series on Software Engineering and Advanced Applications (SEAA)"},{"id":"http://arxiv.org/abs/2210.06226v2","updated":"2023-07-19T13:08:21Z","published":"2022-10-12T14:15:39Z","title":"Alpha-divergence Variational Inference Meets Importance Weighted\n Auto-Encoders: Methodology and Asymptotics","summary":" Several algorithms involving the Variational R\\'enyi (VR) bound have been\nproposed to minimize an alpha-divergence between a target posterior\ndistribution and a variational distribution. Despite promising empirical\nresults, those algorithms resort to biased stochastic gradient descent\nprocedures and thus lack theoretical guarantees. In this paper, we formalize\nand study the VR-IWAE bound, a generalization of the Importance Weighted\nAuto-Encoder (IWAE) bound. We show that the VR-IWAE bound enjoys several\ndesirable properties and notably leads to the same stochastic gradient descent\nprocedure as the VR bound in the reparameterized case, but this time by relying\non unbiased gradient estimators. We then provide two complementary theoretical\nanalyses of the VR-IWAE bound and thus of the standard IWAE bound. Those\nanalyses shed light on the benefits or lack thereof of these bounds. 
Lastly, we\nillustrate our theoretical claims over toy and real-data examples.\n","authors":["Kamélia Daudel","Joe Benton","Yuyang Shi","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2210.06226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.05887v4","updated":"2023-07-19T12:58:18Z","published":"2020-01-16T15:24:26Z","title":"MixPath: A Unified Approach for One-shot Neural Architecture Search","summary":" Blending multiple convolutional kernels is proved advantageous in neural\narchitecture design. However, current two-stage neural architecture search\nmethods are mainly limited to single-path search spaces. How to efficiently\nsearch models of multi-path structures remains a difficult problem. In this\npaper, we are motivated to train a one-shot multi-path supernet to accurately\nevaluate the candidate architectures. Specifically, we discover that in the\nstudied search spaces, feature vectors summed from multiple paths are nearly\nmultiples of those from a single path. Such disparity perturbs the supernet\ntraining and its ranking ability. Therefore, we propose a novel mechanism\ncalled Shadow Batch Normalization (SBN) to regularize the disparate feature\nstatistics. Extensive experiments prove that SBNs are capable of stabilizing\nthe optimization and improving ranking performance. We call our unified\nmulti-path one-shot approach as MixPath, which generates a series of models\nthat achieve state-of-the-art results on ImageNet.\n","authors":["Xiangxiang Chu","Shun Lu","Xudong Li","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2001.05887v4.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09955v1","updated":"2023-07-19T12:51:28Z","published":"2023-07-19T12:51:28Z","title":"XSkill: Cross Embodiment Skill Discovery","summary":" Human demonstration videos are a widely available data source for robot\nlearning and an intuitive user interface for expressing desired behavior.\nHowever, directly extracting reusable robot manipulation skills from\nunstructured human videos is challenging due to the big embodiment difference\nand unobserved action parameters. To bridge this embodiment gap, this paper\nintroduces XSkill, an imitation learning framework that 1) discovers a\ncross-embodiment representation called skill prototypes purely from unlabeled\nhuman and robot manipulation videos, 2) transfers the skill representation to\nrobot actions using conditional diffusion policy, and finally, 3) composes the\nlearned skill to accomplish unseen tasks specified by a human prompt video. Our\nexperiments in simulation and real-world environments show that the discovered\nskill prototypes facilitate both skill transfer and composition for unseen\ntasks, resulting in a more general and scalable imitation learning framework.\nThe performance of XSkill is best understood from the anonymous website:\nhttps://xskillcorl.github.io.\n","authors":["Mengda Xu","Zhenjia Xu","Cheng Chi","Manuela Veloso","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2307.09955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09943v1","updated":"2023-07-19T12:35:16Z","published":"2023-07-19T12:35:16Z","title":"Impatient Bandits: Optimizing for the Long-Term Without Delay","summary":" Recommender systems are a ubiquitous feature of online platforms.\nIncreasingly, they are explicitly tasked with increasing users' long-term\nsatisfaction. In this context, we study a content exploration task, which we\nformalize as a multi-armed bandit problem with delayed rewards. 
We observe that\nthere is an apparent trade-off in choosing the learning signal: Waiting for the\nfull reward to become available might take several weeks, hurting the rate at\nwhich learning happens, whereas measuring short-term proxy rewards reflects the\nactual long-term goal only imperfectly. We address this challenge in two steps.\nFirst, we develop a predictive model of delayed rewards that incorporates all\ninformation obtained to date. Full observations as well as partial (short or\nmedium-term) outcomes are combined through a Bayesian filter to obtain a\nprobabilistic belief. Second, we devise a bandit algorithm that takes advantage\nof this new predictive model. The algorithm quickly learns to identify content\naligned with long-term success by carefully balancing exploration and\nexploitation. We apply our approach to a podcast recommendation problem, where\nwe seek to identify shows that users engage with repeatedly over two months. We\nempirically validate that our approach results in substantially better\nperformance compared to approaches that either optimize for short-term proxies,\nor wait for the long-term outcome to be fully realized.\n","authors":["Thomas McDonald","Lucas Maystre","Mounia Lalmas","Daniel Russo","Kamil Ciosek"],"pdf_url":"https://arxiv.org/pdf/2307.09943v1.pdf","comment":"Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery\n and Data Mining (KDD '23)"},{"id":"http://arxiv.org/abs/2307.09942v1","updated":"2023-07-19T12:35:09Z","published":"2023-07-19T12:35:09Z","title":"TREEMENT: Interpretable Patient-Trial Matching via Personalized Dynamic\n Tree-Based Memory Network","summary":" Clinical trials are critical for drug development but often suffer from\nexpensive and inefficient patient recruitment. In recent years, machine\nlearning models have been proposed for speeding up patient recruitment via\nautomatically matching patients with clinical trials based on longitudinal\npatient electronic health records (EHR) data and eligibility criteria of\nclinical trials. However, they either depend on trial-specific expert rules\nthat cannot expand to other trials or perform matching at a very general level\nwith a black-box model where the lack of interpretability makes the model\nresults difficult to be adopted.\n To provide accurate and interpretable patient trial matching, we introduce a\npersonalized dynamic tree-based memory network model named TREEMENT. It\nutilizes hierarchical clinical ontologies to expand the personalized patient\nrepresentation learned from sequential EHR data, and then uses an attentional\nbeam-search query learned from eligibility criteria embedding to offer a\ngranular level of alignment for improved performance and interpretability. We\nevaluated TREEMENT against existing models on real-world datasets and\ndemonstrated that TREEMENT outperforms the best baseline by 7% in terms of\nerror reduction in criteria-level matching and achieves state-of-the-art\nresults in its trial-level matching ability. Furthermore, we also show TREEMENT\ncan offer good interpretability to make the model results easier for adoption.\n","authors":["Brandon Theodorou","Cao Xiao","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2307.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02486v2","updated":"2023-07-19T12:25:35Z","published":"2023-07-05T17:59:38Z","title":"LongNet: Scaling Transformers to 1,000,000,000 Tokens","summary":" Scaling sequence length has become a critical demand in the era of large\nlanguage models. 
However, existing methods struggle with either computational\ncomplexity or model expressivity, rendering the maximum sequence length\nrestricted. To address this issue, we introduce LongNet, a Transformer variant\nthat can scale sequence length to more than 1 billion tokens, without\nsacrificing the performance on shorter sequences. Specifically, we propose\ndilated attention, which expands the attentive field exponentially as the\ndistance grows. LongNet has significant advantages: 1) it has a linear\ncomputation complexity and a logarithm dependency between any two tokens in a\nsequence; 2) it can be served as a distributed trainer for extremely long\nsequences; 3) its dilated attention is a drop-in replacement for standard\nattention, which can be seamlessly integrated with the existing\nTransformer-based optimization. Experimental results demonstrate that LongNet\nyields strong performance on both long-sequence modeling and general language\ntasks. Our work opens up new possibilities for modeling very long sequences,\ne.g., treating a whole corpus or even the entire Internet as a sequence.\n","authors":["Jiayu Ding","Shuming Ma","Li Dong","Xingxing Zhang","Shaohan Huang","Wenhui Wang","Nanning Zheng","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.02486v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2302.07265v2","updated":"2023-07-19T12:18:34Z","published":"2023-02-14T18:59:02Z","title":"The Meta-Evaluation Problem in Explainable AI: Identifying Reliable\n Estimators with MetaQuantus","summary":" One of the unsolved challenges in the field of Explainable AI (XAI) is\ndetermining how to most reliably estimate the quality of an explanation method\nin the absence of ground truth explanation labels. Resolving this issue is of\nutmost importance as the evaluation outcomes generated by competing evaluation\nmethods (or ''quality estimators''), which aim at measuring the same property\nof an explanation method, frequently present conflicting rankings. Such\ndisagreements can be challenging for practitioners to interpret, thereby\ncomplicating their ability to select the best-performing explanation method. We\naddress this problem through a meta-evaluation of different quality estimators\nin XAI, which we define as ''the process of evaluating the evaluation method''.\nOur novel framework, MetaQuantus, analyses two complementary performance\ncharacteristics of a quality estimator: its resilience to noise and reactivity\nto randomness, thus circumventing the need for ground truth labels. We\ndemonstrate the effectiveness of our framework through a series of experiments,\ntargeting various open questions in XAI such as the selection and\nhyperparameter optimisation of quality estimators. Our work is released under\nan open-source license (https://github.com/annahedstroem/MetaQuantus) to serve\nas a development tool for XAI- and Machine Learning (ML) practitioners to\nverify and benchmark newly constructed quality estimators in a given\nexplainability context. With this work, we provide the community with clear and\ntheoretically-grounded guidance for identifying reliable evaluation methods,\nthus facilitating reproducibility in the field.\n","authors":["Anna Hedström","Philine Bommer","Kristoffer K. Wickstrøm","Wojciech Samek","Sebastian Lapuschkin","Marina M. -C. 
Höhne"],"pdf_url":"https://arxiv.org/pdf/2302.07265v2.pdf","comment":"35 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.09933v1","updated":"2023-07-19T12:15:06Z","published":"2023-07-19T12:15:06Z","title":"Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to\n Harness Spurious Features","summary":" To avoid failures on out-of-distribution data, recent works have sought to\nextract features that have a stable or invariant relationship with the label\nacross domains, discarding the \"spurious\" or unstable features whose\nrelationship with the label changes across domains. However, unstable features\noften carry complementary information about the label that could boost\nperformance if used correctly in the test domain. Our main contribution is to\nshow that it is possible to learn how to use these unstable features in the\ntest domain without labels. In particular, we prove that pseudo-labels based on\nstable features provide sufficient guidance for doing so, provided that stable\nand unstable features are conditionally independent given the label. Based on\nthis theoretical insight, we propose Stable Feature Boosting (SFB), an\nalgorithm for: (i) learning a predictor that separates stable and\nconditionally-independent unstable features; and (ii) using the stable-feature\npredictions to adapt the unstable-feature predictions in the test domain.\nTheoretically, we prove that SFB can learn an asymptotically-optimal predictor\nwithout test-domain labels. Empirically, we demonstrate the effectiveness of\nSFB on real and synthetic data.\n","authors":["Cian Eastwood","Shashank Singh","Andrei Liviu Nicolicioiu","Marin Vlastelica","Julius von Kügelgen","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2307.09933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09931v1","updated":"2023-07-19T12:12:17Z","published":"2023-07-19T12:12:17Z","title":"DISA: DIfferentiable Similarity Approximation for Universal Multimodal\n Registration","summary":" Multimodal image registration is a challenging but essential step for\nnumerous image-guided procedures. Most registration algorithms rely on the\ncomputation of complex, frequently non-differentiable similarity metrics to\ndeal with the appearance discrepancy of anatomical structures between imaging\nmodalities. Recent Machine Learning based approaches are limited to specific\nanatomy-modality combinations and do not generalize to new settings. We propose\na generic framework for creating expressive cross-modal descriptors that enable\nfast deformable global registration. We achieve this by approximating existing\nmetrics with a dot-product in the feature space of a small convolutional neural\nnetwork (CNN) which is inherently differentiable can be trained without\nregistered data. Our method is several orders of magnitude faster than local\npatch-based metrics and can be directly applied in clinical settings by\nreplacing the similarity measure with the proposed one. Experiments on three\ndifferent datasets demonstrate that our approach generalizes well beyond the\ntraining data, yielding a broad capture range even on unseen anatomies and\nmodality pairs, without the need for specialized retraining. We make our\ntraining code and data publicly available.\n","authors":["Matteo Ronchetti","Wolfgang Wein","Nassir Navab","Oliver Zettinig","Raphael Prevost"],"pdf_url":"https://arxiv.org/pdf/2307.09931v1.pdf","comment":"This preprint was submitted to MICCAI 2023. 
The Version of Record of\n this contribution will be published in Springer LNCS"},{"id":"http://arxiv.org/abs/2307.04639v2","updated":"2023-07-19T12:08:51Z","published":"2023-07-10T15:35:31Z","title":"Multimodal brain age estimation using interpretable adaptive\n population-graph learning","summary":" Brain age estimation is clinically important as it can provide valuable\ninformation in the context of neurodegenerative diseases such as Alzheimer's.\nPopulation graphs, which include multimodal imaging information of the subjects\nalong with the relationships among the population, have been used in literature\nalong with Graph Convolutional Networks (GCNs) and have proved beneficial for a\nvariety of medical imaging tasks. A population graph is usually static and\nconstructed manually using non-imaging information. However, graph construction\nis not a trivial task and might significantly affect the performance of the\nGCN, which is inherently very sensitive to the graph structure. In this work,\nwe propose a framework that learns a population graph structure optimized for\nthe downstream task. An attention mechanism assigns weights to a set of imaging\nand non-imaging features (phenotypes), which are then used for edge extraction.\nThe resulting graph is used to train the GCN. The entire pipeline can be\ntrained end-to-end. Additionally, by visualizing the attention weights that\nwere the most important for the graph construction, we increase the\ninterpretability of the graph. We use the UK Biobank, which provides a large\nvariety of neuroimaging and non-imaging phenotypes, to evaluate our method on\nbrain age regression and classification. The proposed method outperforms\ncompeting static graph approaches and other state-of-the-art adaptive methods.\nWe further show that the assigned attention scores indicate that there are both\nimaging and non-imaging phenotypes that are informative for brain age\nestimation and are in agreement with the relevant literature.\n","authors":["Kyriaki-Margarita Bintsi","Vasileios Baltatzis","Rolandos Alexandros Potamias","Alexander Hammers","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.04639v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09916v1","updated":"2023-07-19T11:40:15Z","published":"2023-07-19T11:40:15Z","title":"TimeTuner: Diagnosing Time Representations for Time-Series Forecasting\n with Counterfactual Explanations","summary":" Deep learning (DL) approaches are being increasingly used for time-series\nforecasting, with many efforts devoted to designing complex DL models. Recent\nstudies have shown that the DL success is often attributed to effective data\nrepresentations, fostering the fields of feature engineering and representation\nlearning. However, automated approaches for feature learning are typically\nlimited with respect to incorporating prior knowledge, identifying interactions\namong variables, and choosing evaluation metrics to ensure that the models are\nreliable. To improve on these limitations, this paper contributes a novel\nvisual analytics framework, namely TimeTuner, designed to help analysts\nunderstand how model behaviors are associated with localized correlations,\nstationarity, and granularity of time-series representations. The system mainly\nconsists of the following two-stage technique: We first leverage counterfactual\nexplanations to connect the relationships among time-series representations,\nmultivariate features and model predictions. 
Next, we design multiple\ncoordinated views including a partition-based correlation matrix and juxtaposed\nbivariate stripes, and provide a set of interactions that allow users to step\ninto the transformation selection process, navigate through the feature space,\nand reason the model performance. We instantiate TimeTuner with two\ntransformation methods of smoothing and sampling, and demonstrate its\napplicability on real-world time-series forecasting of univariate sunspots and\nmultivariate air pollutants. Feedback from domain experts indicates that our\nsystem can help characterize time-series representations and guide the feature\nengineering processes.\n","authors":["Jianing Hao","Qing Shi","Yilin Ye","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2307.09916v1.pdf","comment":"11 pages, 9 figures, this paper has been accepted by VIS2024"},{"id":"http://arxiv.org/abs/2307.09912v1","updated":"2023-07-19T11:32:24Z","published":"2023-07-19T11:32:24Z","title":"Deep projection networks for learning time-homogeneous dynamical systems","summary":" We consider the general class of time-homogeneous dynamical systems, both\ndiscrete and continuous, and study the problem of learning a meaningful\nrepresentation of the state from observed data. This is instrumental for the\ntask of learning a forward transfer operator of the system, that in turn can be\nused for forecasting future states or observables. The representation,\ntypically parametrized via a neural network, is associated with a projection\noperator and is learned by optimizing an objective function akin to that of\ncanonical correlation analysis (CCA). However, unlike CCA, our objective avoids\nmatrix inversions and therefore is generally more stable and applicable to\nchallenging scenarios. Our objective is a tight relaxation of CCA and we\nfurther enhance it by proposing two regularization schemes, one encouraging the\northogonality of the components of the representation while the other\nexploiting Chapman-Kolmogorov's equation. We apply our method to challenging\ndiscrete dynamical systems, discussing improvements over previous methods, as\nwell as to continuous dynamical systems.\n","authors":["Vladimir R. Kostic","Pietro Novelli","Riccardo Grazzi","Karim Lounici","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2307.09912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06698v2","updated":"2023-07-19T11:23:07Z","published":"2023-07-13T11:54:32Z","title":"IntelliGraphs: Datasets for Benchmarking Knowledge Graph Generation","summary":" Knowledge Graph Embedding (KGE) models are used to learn continuous\nrepresentations of entities and relations. A key task in the literature is\npredicting missing links between entities. However, Knowledge Graphs are not\njust sets of links but also have semantics underlying their structure.\nSemantics is crucial in several downstream tasks, such as query answering or\nreasoning. We introduce the subgraph inference task, where a model has to\ngenerate likely and semantically valid subgraphs. We propose IntelliGraphs, a\nset of five new Knowledge Graph datasets. The IntelliGraphs datasets contain\nsubgraphs with semantics expressed in logical rules for evaluating subgraph\ninference. We also present the dataset generator that produced the synthetic\ndatasets. We designed four novel baseline models, which include three models\nbased on traditional KGEs. We evaluate their expressiveness and show that these\nmodels cannot capture the semantics. 
We believe this benchmark will encourage\nthe development of machine learning models that emphasize semantic\nunderstanding.\n","authors":["Thiviyan Thanapalasingam","Emile van Krieken","Peter Bloem","Paul Groth"],"pdf_url":"https://arxiv.org/pdf/2307.06698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09211v3","updated":"2023-07-19T10:52:30Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09896v1","updated":"2023-07-19T10:50:36Z","published":"2023-07-19T10:50:36Z","title":"Repeated Observations for Classification","summary":" We study the problem nonparametric classification with repeated observations.\nLet $\\bX$ be the $d$ dimensional feature vector and let $Y$ denote the label\ntaking values in $\\{1,\\dots ,M\\}$. In contrast to usual setup with large sample\nsize $n$ and relatively low dimension $d$, this paper deals with the situation,\nwhen instead of observing a single feature vector $\\bX$ we are given $t$\nrepeated feature vectors $\\bV_1,\\dots ,\\bV_t $. Some simple classification\nrules are presented such that the conditional error probabilities have\nexponential convergence rate of convergence as $t\\to\\infty$. 
In the analysis,\nwe investigate particular models like robust detection by nominal densities,\nprototype classification, linear transformation, linear classification,\nscaling.\n","authors":["Hüseyin Afşer","László Györfi","Harro Walk"],"pdf_url":"https://arxiv.org/pdf/2307.09896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09883v1","updated":"2023-07-19T10:27:34Z","published":"2023-07-19T10:27:34Z","title":"Symmetric Equilibrium Learning of VAEs","summary":" We view variational autoencoders (VAE) as decoder-encoder pairs, which map\ndistributions in the data space to distributions in the latent space and vice\nversa. The standard learning approach for VAEs, i.e. maximisation of the\nevidence lower bound (ELBO), has an obvious asymmetry in that respect.\nMoreover, it requires a closed form a-priori latent distribution. This limits\nthe applicability of VAEs in more complex scenarios, such as general\nsemi-supervised learning and employing complex generative models as priors. We\npropose a Nash equilibrium learning approach that relaxes these restrictions\nand allows learning VAEs in situations where both the data and the latent\ndistributions are accessible only by sampling. The flexibility and simplicity\nof this approach allows its application to a wide range of learning scenarios\nand downstream tasks. We show experimentally that the models learned by this\nmethod are comparable to those obtained by ELBO learning and demonstrate its\napplicability for tasks that are not accessible by standard VAE learning.\n","authors":["Boris Flach","Dmitrij Schlesinger","Alexander Shekhovtsov"],"pdf_url":"https://arxiv.org/pdf/2307.09883v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.09882v1","updated":"2023-07-19T10:26:29Z","published":"2023-07-19T10:26:29Z","title":"Adversarial Likelihood Estimation with One-way Flows","summary":" Generative Adversarial Networks (GANs) can produce high-quality samples, but\ndo not provide an estimate of the probability density around the samples.\nHowever, it has been noted that maximizing the log-likelihood within an\nenergy-based setting can lead to an adversarial framework where the\ndiscriminator provides unnormalized density (often called energy). We further\ndevelop this perspective, incorporate importance sampling, and show that 1)\nWasserstein GAN performs a biased estimate of the partition function, and we\npropose instead to use an unbiased estimator; 2) when optimizing for\nlikelihood, one must maximize generator entropy. This is hypothesized to\nprovide a better mode coverage. Different from previous works, we explicitly\ncompute the density of the generated samples. This is the key enabler to\ndesigning an unbiased estimator of the partition function and computation of\nthe generator entropy term. The generator density is obtained via a new type of\nflow network, called one-way flow network, that is less constrained in terms of\narchitecture, as it does not require to have a tractable inverse function. Our\nexperimental results show that we converge faster, produce comparable sample\nquality to GANs with similar architecture, successfully avoid over-fitting to\ncommonly used datasets and produce smooth low-dimensional latent\nrepresentations of the training data.\n","authors":["Omri Ben-Dov","Pravir Singh Gupta","Victoria Abrevaya","Michael J. 
Black","Partha Ghosh"],"pdf_url":"https://arxiv.org/pdf/2307.09882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09866v1","updated":"2023-07-19T09:53:56Z","published":"2023-07-19T09:53:56Z","title":"Detecting Vulnerable Nodes in Urban Infrastructure Interdependent\n Network","summary":" Understanding and characterizing the vulnerability of urban infrastructures,\nwhich refers to the engineering facilities essential for the regular running of\ncities and that exist naturally in the form of networks, is of great value to\nus. Potential applications include protecting fragile facilities and designing\nrobust topologies, etc. Due to the strong correlation between different\ntopological characteristics and infrastructure vulnerability and their\ncomplicated evolution mechanisms, some heuristic and machine-assisted analysis\nfall short in addressing such a scenario. In this paper, we model the\ninterdependent network as a heterogeneous graph and propose a system based on\ngraph neural network with reinforcement learning, which can be trained on\nreal-world data, to characterize the vulnerability of the city system\naccurately. The presented system leverages deep learning techniques to\nunderstand and analyze the heterogeneous graph, which enables us to capture the\nrisk of cascade failure and discover vulnerable infrastructures of cities.\nExtensive experiments with various requests demonstrate not only the expressive\npower of our system but also transferring ability and necessity of the specific\ncomponents.\n","authors":["Jinzhu Mao","Liu Cao","Chen Gao","Huandong Wang","Hangyu Fan","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2307.09866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09862v1","updated":"2023-07-19T09:45:41Z","published":"2023-07-19T09:45:41Z","title":"Towards a population-informed approach to the definition of data-driven\n models for structural dynamics","summary":" Machine learning has affected the way in which many phenomena for various\ndomains are modelled, one of these domains being that of structural dynamics.\nHowever, because machine-learning algorithms are problem-specific, they often\nfail to perform efficiently in cases of data scarcity. To deal with such\nissues, combination of physics-based approaches and machine learning algorithms\nhave been developed. Although such methods are effective, they also require the\nanalyser's understanding of the underlying physics of the problem. The current\nwork is aimed at motivating the use of models which learn such relationships\nfrom a population of phenomena, whose underlying physics are similar. The\ndevelopment of such models is motivated by the way that physics-based models,\nand more specifically finite element models, work. Such models are considered\ntransferrable, explainable and trustworthy, attributes which are not trivially\nimposed or achieved for machine-learning models. For this reason,\nmachine-learning approaches are less trusted by industry and often considered\nmore difficult to form validated models. To achieve such data-driven models, a\npopulation-based scheme is followed here and two different machine-learning\nalgorithms from the meta-learning domain are used. The two algorithms are the\nmodel-agnostic meta-learning (MAML) algorithm and the conditional neural\nprocesses (CNP) model. The algorithms seem to perform as intended and\noutperform a traditional machine-learning algorithm at approximating the\nquantities of interest. 
Moreover, they exhibit behaviour similar to traditional\nmachine learning algorithms (e.g. neural networks or Gaussian processes),\nconcerning their performance as a function of the available structures in the\ntraining population.\n","authors":["G. Tsialiamanis","N. Dervilis","D. J. Wagg","K. Worden"],"pdf_url":"https://arxiv.org/pdf/2307.09862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v2","updated":"2023-07-19T09:23:43Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v2.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 12 figures, 13 tables"},{"id":"http://arxiv.org/abs/2307.09458v2","updated":"2023-07-19T09:22:02Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. 
In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Neel Nanda","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10404v4","updated":"2023-07-19T09:17:09Z","published":"2023-06-17T18:16:51Z","title":"The RL Perceptron: Generalisation Dynamics of Policy Learning in High\n Dimensions","summary":" Reinforcement learning (RL) algorithms have proven transformative in a range\nof domains. To tackle real-world domains, these systems often use neural\nnetworks to learn policies directly from pixels or other high-dimensional\nsensory input. By contrast, much theory of RL has focused on discrete state\nspaces or worst-case analysis, and fundamental questions remain about the\ndynamics of policy learning in high-dimensional settings. Here, we propose a\nsolvable high-dimensional model of RL that can capture a variety of learning\nprotocols, and derive its typical dynamics as a set of closed-form ordinary\ndifferential equations (ODEs). We derive optimal schedules for the learning\nrates and task difficulty - analogous to annealing schemes and curricula during\ntraining in RL - and show that the model exhibits rich behaviour, including\ndelayed learning under sparse rewards; a variety of learning regimes depending\non reward baselines; and a speed-accuracy trade-off driven by reward\nstringency. Experiments on variants of the Procgen game \"Bossfight\" and Arcade\nLearning Environment game \"Pong\" also show such a speed-accuracy trade-off in\npractice. 
Together, these results take a step towards closing the gap between\ntheory and practice in high-dimensional RL.\n","authors":["Nishil Patel","Sebastian Lee","Stefano Sarao Mannelli","Sebastian Goldt","Adrew Saxe"],"pdf_url":"https://arxiv.org/pdf/2306.10404v4.pdf","comment":"10 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2305.07898v2","updated":"2023-07-19T09:15:20Z","published":"2023-05-13T11:42:40Z","title":"Network-GIANT: Fully distributed Newton-type optimization via harmonic\n Hessian consensus","summary":" This paper considers the problem of distributed multi-agent learning, where\nthe global aim is to minimize a sum of local objective (empirical loss)\nfunctions through local optimization and information exchange between\nneighbouring nodes. We introduce a Newton-type fully distributed optimization\nalgorithm, Network-GIANT, which is based on GIANT, a Federated learning\nalgorithm that relies on a centralized parameter server. The Network-GIANT\nalgorithm is designed via a combination of gradient-tracking and a Newton-type\niterative algorithm at each node with consensus based averaging of local\ngradient and Newton updates. We prove that our algorithm guarantees semi-global\nand exponential convergence to the exact solution over the network assuming\nstrongly convex and smooth loss functions. We provide empirical evidence of the\nsuperior convergence performance of Network-GIANT over other state-of-art\ndistributed learning algorithms such as Network-DANE and Newton-Raphson\nConsensus.\n","authors":["Alessio Maritan","Ganesh Sharma","Luca Schenato","Subhrakanti Dey"],"pdf_url":"https://arxiv.org/pdf/2305.07898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09844v1","updated":"2023-07-19T09:03:41Z","published":"2023-07-19T09:03:41Z","title":"Reinforcement Learning for Credit Index Option Hedging","summary":" In this paper, we focus on finding the optimal hedging strategy of a credit\nindex option using reinforcement learning. We take a practical approach, where\nthe focus is on realism i.e. discrete time, transaction costs; even testing our\npolicy on real market data. We apply a state of the art algorithm, the Trust\nRegion Volatility Optimization (TRVO) algorithm and show that the derived\nhedging strategy outperforms the practitioner's Black & Scholes delta hedge.\n","authors":["Francesco Mandelli","Marco Pinciroli","Michele Trapletti","Edoardo Vittori"],"pdf_url":"https://arxiv.org/pdf/2307.09844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v2","updated":"2023-07-19T08:55:01Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. 
Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v2.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09836v1","updated":"2023-07-19T08:47:41Z","published":"2023-07-19T08:47:41Z","title":"Near-Linear Time Projection onto the $\\ell_{1,\\infty}$ Ball; Application\n to Sparse Autoencoders","summary":" Looking for sparsity is nowadays crucial to speed up the training of\nlarge-scale neural networks. Projections onto the $\\ell_{1,2}$ and\n$\\ell_{1,\\infty}$ are among the most efficient techniques to sparsify and\nreduce the overall cost of neural networks. In this paper, we introduce a new\nprojection algorithm for the $\\ell_{1,\\infty}$ norm ball. The worst-case time\ncomplexity of this algorithm is $\\mathcal{O}\\big(nm+J\\log(nm)\\big)$ for a\nmatrix in $\\mathbb{R}^{n\\times m}$. $J$ is a term that tends to 0 when the\nsparsity is high, and to $nm$ when the sparsity is low. Its implementation is\neasy and it is guaranteed to converge to the exact solution in a finite time.\nMoreover, we propose to incorporate the $\\ell_{1,\\infty}$ ball projection while\ntraining an autoencoder to enforce feature selection and sparsity of the\nweights. Sparsification appears in the encoder to primarily do feature\nselection due to our application in biology, where only a very small part\n($<2\\%$) of the data is relevant. We show that both in the biological case and\nin the general case of sparsity that our method is the fastest.\n","authors":["Guillaume Perez","Laurent Condat","Michel Barlaud"],"pdf_url":"https://arxiv.org/pdf/2307.09836v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.09835v1","updated":"2023-07-19T08:46:47Z","published":"2023-07-19T08:46:47Z","title":"Deep Operator Network Approximation Rates for Lipschitz Operators","summary":" We establish universality and expression rate bounds for a class of neural\nDeep Operator Networks (DON) emulating Lipschitz (or H\\\"older) continuous maps\n$\\mathcal G:\\mathcal X\\to\\mathcal Y$ between (subsets of) separable Hilbert\nspaces $\\mathcal X$, $\\mathcal Y$. 
The DON architecture considered uses linear\nencoders $\\mathcal E$ and decoders $\\mathcal D$ via (biorthogonal) Riesz bases\nof $\\mathcal X$, $\\mathcal Y$, and an approximator network of an\ninfinite-dimensional, parametric coordinate map that is Lipschitz continuous on\nthe sequence space $\\ell^2(\\mathbb N)$. Unlike previous works ([Herrmann,\nSchwab and Zech: Neural and Spectral operator surrogates: construction and\nexpression rate bounds, SAM Report, 2022], [Marcati and Schwab: Exponential\nConvergence of Deep Operator Networks for Elliptic Partial Differential\nEquations, SAM Report, 2022]), which required for example $\\mathcal G$ to be\nholomorphic, the present expression rate results require mere Lipschitz (or\nH\\\"older) continuity of $\\mathcal G$. Key in the proof of the present\nexpression rate bounds is the use of either super-expressive activations (e.g.\n[Yarotski: Elementary superexpressive activations, Int. Conf. on ML, 2021],\n[Shen, Yang and Zhang: Neural network approximation: Three hidden layers are\nenough, Neural Networks, 2021], and the references there) which are inspired by\nthe Kolmogorov superposition theorem, or of nonstandard NN architectures with\nstandard (ReLU) activations as recently proposed in [Zhang, Shen and Yang:\nNeural Network Architecture Beyond Width and Depth, Adv. in Neural Inf. Proc.\nSys., 2022]. We illustrate the abstract results by approximation rate bounds\nfor emulation of a) solution operators for parametric elliptic variational\ninequalities, and b) Lipschitz maps of Hilbert-Schmidt operators.\n","authors":["Christoph Schwab","Andreas Stein","Jakob Zech"],"pdf_url":"https://arxiv.org/pdf/2307.09835v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2307.09829v1","updated":"2023-07-19T08:34:25Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. 
We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2307.09823v1","updated":"2023-07-19T08:21:01Z","published":"2023-07-19T08:21:01Z","title":"Multi-modal Learning based Prediction for Disease","summary":" Non alcoholic fatty liver disease (NAFLD) is the most common cause of chronic\nliver disease, which can be predicted accurately to prevent advanced fibrosis\nand cirrhosis. While, a liver biopsy, the gold standard for NAFLD diagnosis, is\ninvasive, expensive, and prone to sampling errors. Therefore, non-invasive\nstudies are extremely promising, yet they are still in their infancy due to the\nlack of comprehensive research data and intelligent methods for multi-modal\ndata. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a\ncomprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD\nprediction method (DeepFLD). The dataset includes over 6000 participants\nphysical examinations, laboratory and imaging studies, extensive\nquestionnaires, and facial images of partial participants, which is\ncomprehensive and valuable for clinical studies. From the dataset, we\nquantitatively analyze and select clinical metadata that most contribute to\nNAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network\nmodel designed to predict NAFLD using multi-modal input, including metadata and\nfacial images, outperforms the approach that only uses metadata. Satisfactory\nperformance is also verified on other unseen datasets. Inspiringly, DeepFLD can\nachieve competitive results using only facial images as input rather than\nmetadata, paving the way for a more robust and simpler non-invasive NAFLD\ndiagnosis.\n","authors":["Yaran Chen","Xueyu Chen","Yu Han","Haoran Li","Dongbin Zhao","Jingzhong Li","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09818v1","updated":"2023-07-19T08:06:37Z","published":"2023-07-19T08:06:37Z","title":"Deep unrolling Shrinkage Network for Dynamic MR imaging","summary":" Deep unrolling networks that utilize sparsity priors have achieved great\nsuccess in dynamic magnetic resonance (MR) imaging. The convolutional neural\nnetwork (CNN) is usually utilized to extract the transformed domain, and then\nthe soft thresholding (ST) operator is applied to the CNN-transformed data to\nenforce the sparsity priors. However, the ST operator is usually constrained to\nbe the same across all channels of the CNN-transformed data. In this paper, we\npropose a novel operator, called soft thresholding with channel attention\n(AST), that learns the threshold for each channel. In particular, we put\nforward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the\nalternating direction method of multipliers (ADMM) for optimizing the\ntransformed $l_1$ norm dynamic MR reconstruction model. Experimental results on\nan open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net\noutperforms the state-of-the-art methods. 
The source code is available at\n\\url{https://github.com/yhao-z/DUS-Net}.\n","authors":["Yinghao Zhang","Xiaodi Li","Weihang Li","Yue Hu"],"pdf_url":"https://arxiv.org/pdf/2307.09818v1.pdf","comment":"5 pages,3 figures,2 tables"},{"id":"http://arxiv.org/abs/2307.09816v1","updated":"2023-07-19T08:05:46Z","published":"2023-07-19T08:05:46Z","title":"Manifold Learning with Sparse Regularised Optimal Transport","summary":" Manifold learning is a central task in modern statistics and data science.\nMany datasets (cells, documents, images, molecules) can be represented as point\nclouds embedded in a high dimensional ambient space, however the degrees of\nfreedom intrinsic to the data are usually far fewer than the number of ambient\ndimensions. The task of detecting a latent manifold along which the data are\nembedded is a prerequisite for a wide family of downstream analyses. Real-world\ndatasets are subject to noisy observations and sampling, so that distilling\ninformation about the underlying manifold is a major challenge. We propose a\nmethod for manifold learning that utilises a symmetric version of optimal\ntransport with a quadratic regularisation that constructs a sparse and adaptive\naffinity matrix, that can be interpreted as a generalisation of the\nbistochastic kernel normalisation. We prove that the resulting kernel is\nconsistent with a Laplace-type operator in the continuous limit, establish\nrobustness to heteroskedastic noise and exhibit these results in simulations.\nWe identify a highly efficient computational scheme for computing this optimal\ntransport for discrete data and demonstrate that it outperforms competing\nmethods in a set of examples.\n","authors":["Stephen Zhang","Gilles Mordant","Tetsuya Matsumoto","Geoffrey Schiebinger"],"pdf_url":"https://arxiv.org/pdf/2307.09816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09810v1","updated":"2023-07-19T07:58:21Z","published":"2023-07-19T07:58:21Z","title":"GenKL: An Iterative Framework for Resolving Label Ambiguity and Label\n Non-conformity in Web Images Via a New Generalized KL Divergence","summary":" Web image datasets curated online inherently contain ambiguous\nin-distribution (ID) instances and out-of-distribution (OOD) instances, which\nwe collectively call non-conforming (NC) instances. In many recent approaches\nfor mitigating the negative effects of NC instances, the core implicit\nassumption is that the NC instances can be found via entropy maximization. For\n\"entropy\" to be well-defined, we are interpreting the output prediction vector\nof an instance as the parameter vector of a multinomial random variable, with\nrespect to some trained model with a softmax output layer. Hence, entropy\nmaximization is based on the idealized assumption that NC instances have\npredictions that are \"almost\" uniformly distributed. However, in real-world web\nimage datasets, there are numerous NC instances whose predictions are far from\nbeing uniformly distributed. To tackle the limitation of entropy maximization,\nwe propose $(\\alpha, \\beta)$-generalized KL divergence,\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$, which can be used to identify\nsignificantly more NC instances. 
Theoretical properties of\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$ are proven, and we also show\nempirically that a simple use of $\\mathcal{D}_{\\text{KL}}^{\\alpha,\n\\beta}(p\\|q)$ outperforms all baselines on the NC instance identification task.\nBuilding upon $(\\alpha,\\beta)$-generalized KL divergence, we also introduce a\nnew iterative training framework, GenKL, that identifies and relabels NC\ninstances. When evaluated on three web image datasets, Clothing1M,\nFood101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art\nclassification accuracies: $81.34\\%$, $85.73\\%$ and $78.99\\%$/$92.54\\%$\n(top-1/top-5), respectively.\n","authors":["Xia Huang","Kai Fong Ernest Chong"],"pdf_url":"https://arxiv.org/pdf/2307.09810v1.pdf","comment":"Published (with open access) at International Journal of Computer\n Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at:\n https://github.com/codetopaper/GenKL"},{"id":"http://arxiv.org/abs/2307.09801v1","updated":"2023-07-19T07:40:51Z","published":"2023-07-19T07:40:51Z","title":"Graph Federated Learning Based on the Decentralized Framework","summary":" Graph learning has a wide range of applications in many scenarios, which\nrequire more need for data privacy. Federated learning is an emerging\ndistributed machine learning approach that leverages data from individual\ndevices or data centers to improve the accuracy and generalization of the\nmodel, while also protecting the privacy of user data. Graph-federated learning\nis mainly based on the classical federated learning framework i.e., the\nClient-Server framework. However, the Client-Server framework faces problems\nsuch as a single point of failure of the central server and poor scalability of\nnetwork topology. First, we introduce the decentralized framework to\ngraph-federated learning. Second, determine the confidence among nodes based on\nthe similarity of data among nodes, subsequently, the gradient information is\nthen aggregated by linear weighting based on confidence. Finally, the proposed\nmethod is compared with FedAvg, Fedprox, GCFL, and GCFL+ to verify the\neffectiveness of the proposed method. Experiments demonstrate that the proposed\nmethod outperforms other methods.\n","authors":["Peilin Liu","Yanni Tang","Mingyue Zhang","Wu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09801v1.pdf","comment":"12 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.09797v1","updated":"2023-07-19T07:31:37Z","published":"2023-07-19T07:31:37Z","title":"Probabilistic Forecasting with Coherent Aggregation","summary":" Obtaining accurate probabilistic forecasts while respecting hierarchical\ninformation is an important operational challenge in many applications, perhaps\nmost obviously in energy management, supply chain planning, and resource\nallocation. The basic challenge, especially for multivariate forecasting, is\nthat forecasts are often required to be coherent with respect to the\nhierarchical structure. In this paper, we propose a new model which leverages a\nfactor model structure to produce coherent forecasts by construction. 
This is a\nconsequence of a simple (exchangeability) observation: permuting\n\\textit{}base-level series in the hierarchy does not change their aggregates.\nOur model uses a convolutional neural network to produce parameters for the\nfactors, their loadings and base-level distributions; it produces samples which\ncan be differentiated with respect to the model's parameters; and it can\ntherefore optimize for any sample-based loss function, including the Continuous\nRanked Probability Score and quantile losses. We can choose arbitrary\ncontinuous distributions for the factor and the base-level distributions. We\ncompare our method to two previous methods which can be optimized end-to-end,\nwhile enforcing coherent aggregation. Our model achieves significant\nimprovements: between $11.8-41.4\\%$ on three hierarchical forecasting datasets.\nWe also analyze the influence of parameters in our model with respect to\nbase-level distribution and number of factors.\n","authors":["Geoffrey Négiar","Ruijun Ma","O. Nangba Meetei","Mengfei Cao","Michael W. Mahoney"],"pdf_url":"https://arxiv.org/pdf/2307.09797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05086v3","updated":"2023-07-19T07:31:35Z","published":"2023-02-10T07:08:13Z","title":"Making Substitute Models More Bayesian Can Enhance Transferability of\n Adversarial Examples","summary":" The transferability of adversarial examples across deep neural networks\n(DNNs) is the crux of many black-box attacks. Many prior efforts have been\ndevoted to improving the transferability via increasing the diversity in inputs\nof some substitute models. In this paper, by contrast, we opt for the diversity\nin substitute models and advocate to attack a Bayesian model for achieving\ndesirable transferability. Deriving from the Bayesian formulation, we develop a\nprincipled strategy for possible finetuning, which can be combined with many\noff-the-shelf Gaussian posterior approximations over DNN parameters. Extensive\nexperiments have been conducted to verify the effectiveness of our method, on\ncommon benchmark datasets, and the results demonstrate that our method\noutperforms recent state-of-the-arts by large margins (roughly 19% absolute\nincrease in average attack success rate on ImageNet), and, by combining with\nthese recent methods, further performance gain can be obtained. Our code:\nhttps://github.com/qizhangli/MoreBayesian-attack.\n","authors":["Qizhang Li","Yiwen Guo","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2302.05086v3.pdf","comment":"Accepted by ICLR 2023, fix typos"},{"id":"http://arxiv.org/abs/2307.09796v1","updated":"2023-07-19T07:30:01Z","published":"2023-07-19T07:30:01Z","title":"Forecasting Early with Meta Learning","summary":" In the early observation period of a time series, there might be only a few\nhistoric observations available to learn a model. However, in cases where an\nexisting prior set of datasets is available, Meta learning methods can be\napplicable. In this paper, we devise a Meta learning method that exploits\nsamples from additional datasets and learns to augment time series through\nadversarial learning as an auxiliary task for the target dataset. Our model\n(FEML), is equipped with a shared Convolutional backbone that learns features\nfor varying length inputs from different datasets and has dataset specific\nheads to forecast for different output lengths. 
We show that FEML can meta\nlearn across datasets and by additionally learning on adversarial generated\nsamples as auxiliary samples for the target dataset, it can improve the\nforecasting performance compared to single task learning, and various solutions\nadapted from Joint learning, Multi-task learning and classic forecasting\nbaselines.\n","authors":["Shayan Jawed","Kiran Madhusudhanan","Vijaya Krishna Yalavarthi","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2307.09796v1.pdf","comment":"IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.09795v1","updated":"2023-07-19T07:29:14Z","published":"2023-07-19T07:29:14Z","title":"From West to East: Who can understand the music of the others better?","summary":" Recent developments in MIR have led to several benchmark deep learning models\nwhose embeddings can be used for a variety of downstream tasks. At the same\ntime, the vast majority of these models have been trained on Western pop/rock\nmusic and related styles. This leads to research questions on whether these\nmodels can be used to learn representations for different music cultures and\nstyles, or whether we can build similar music audio embedding models trained on\ndata from different cultures or styles. To that end, we leverage transfer\nlearning methods to derive insights about the similarities between the\ndifferent music cultures to which the data belongs to. We use two Western music\ndatasets, two traditional/folk datasets coming from eastern Mediterranean\ncultures, and two datasets belonging to Indian art music. Three deep audio\nembedding models are trained and transferred across domains, including two\nCNN-based and a Transformer-based architecture, to perform auto-tagging for\neach target domain dataset. Experimental results show that competitive\nperformance is achieved in all domains via transfer learning, while the best\nsource dataset varies for each music culture. The implementation and the\ntrained models are both provided in a public repository.\n","authors":["Charilaos Papaioannou","Emmanouil Benetos","Alexandros Potamianos"],"pdf_url":"https://arxiv.org/pdf/2307.09795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09792v1","updated":"2023-07-19T07:17:06Z","published":"2023-07-19T07:17:06Z","title":"A Note on Hardness of Computing Recursive Teaching Dimension","summary":" In this short note, we show that the problem of computing the recursive\nteaching dimension (RTD) for a concept class (given explicitly as input)\nrequires $n^{\\Omega(\\log n)}$-time, assuming the exponential time hypothesis\n(ETH). This matches the running time $n^{O(\\log n)}$ of the brute-force\nalgorithm for the problem.\n","authors":["Pasin Manurangsi"],"pdf_url":"https://arxiv.org/pdf/2307.09792v1.pdf","comment":"To appear in IPL"},{"id":"http://arxiv.org/abs/2307.09782v1","updated":"2023-07-19T06:58:03Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. 
Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09781v1","updated":"2023-07-19T06:56:07Z","published":"2023-07-19T06:56:07Z","title":"Text2Layer: Layered Image Generation using Latent Diffusion Model","summary":" Layer compositing is one of the most popular image editing workflows among\nboth amateurs and professionals. Motivated by the success of diffusion models,\nwe explore layer compositing from a layered image generation perspective.\nInstead of generating an image, we propose to generate background, foreground,\nlayer mask, and the composed image simultaneously. To achieve layered image\ngeneration, we train an autoencoder that is able to reconstruct layered images\nand train diffusion models on the latent representation. One benefit of the\nproposed problem is to enable better compositing workflows in addition to the\nhigh-quality image output. Another benefit is producing higher-quality layer\nmasks compared to masks produced by a separate step of image segmentation.\nExperimental results show that the proposed method is able to generate\nhigh-quality layered images and initiates a benchmark for future work.\n","authors":["Xinyang Zhang","Wentian Zhao","Xin Lu","Jeff Chien"],"pdf_url":"https://arxiv.org/pdf/2307.09781v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2212.01692v4","updated":"2023-07-19T06:48:35Z","published":"2022-12-03T21:14:32Z","title":"Can In-context Learners Learn a Reasoning Concept from Demonstrations?","summary":" Language models exhibit an emergent ability to learn a new task from a small\nnumber of input-output demonstrations. However, recent work shows that\nin-context learners largely rely on their pre-trained knowledge, such as the\nsentiment of the labels, instead of learning new associations from the input.\nWe argue that the commonly-used few-shot evaluation using a random selection of\nin-context demonstrations can not disentangle models' reliance on such biases,\nas most of the randomly-selected demonstrations do not present relations\ninformative for prediction beyond exposing the task's input-output\ndistribution.\n Therefore, to evaluate models' in-context learning ability independent of\nmodels' memory, we introduce a Concept-sharing few-shot learning method\nchoosing the demonstrations that share an underlying concept with the predicted\nsample. 
We extract a set of such concepts from available human explanations and\nmeasure how much models can benefit from presenting these concepts in few-shot\ndemonstrations.\n We find that most of the recent in-context learners can not consistently\nbenefit from the demonstrated concepts, irrespective of the model size.\nHowever, we note that T0 models are more sensitive to exhibited concepts,\nbenefiting from concept-sharing demonstrations in 7 out of 8 evaluation\nscenarios.\n","authors":["Michal Štefánik","Marek Kadlčík"],"pdf_url":"https://arxiv.org/pdf/2212.01692v4.pdf","comment":"Awarded Best Paper at ACL 2023 Natural Language Reasoning and\n Structured Explanations (NLRSE) workshop"},{"id":"http://arxiv.org/abs/2307.09779v1","updated":"2023-07-19T06:48:33Z","published":"2023-07-19T06:48:33Z","title":"Beyond Single-Feature Importance with ICECREAM","summary":" Which set of features was responsible for a certain output of a machine\nlearning model? Which components caused the failure of a cloud computing\napplication? These are just two examples of questions we are addressing in this\nwork by Identifying Coalition-based Explanations for Common and Rare Events in\nAny Model (ICECREAM). Specifically, we propose an information-theoretic\nquantitative measure for the influence of a coalition of variables on the\ndistribution of a target variable. This allows us to identify which set of\nfactors is essential to obtain a certain outcome, as opposed to\nwell-established explainability and causal contribution analysis methods which\ncan assign contributions only to individual factors and rank them by their\nimportance. In experiments with synthetic and real-world data, we show that\nICECREAM outperforms state-of-the-art methods for explainability and root cause\nanalysis, and achieves impressive accuracy in both tasks.\n","authors":["Michael Oesterle","Patrick Blöbaum","Atalanti A. Mastakouri","Elke Kirschbaum"],"pdf_url":"https://arxiv.org/pdf/2307.09779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03638v4","updated":"2023-07-19T06:43:10Z","published":"2022-06-08T01:50:08Z","title":"Alternately Optimized Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have greatly advanced the semi-supervised node\nclassification task on graphs. The majority of existing GNNs are trained in an\nend-to-end manner that can be viewed as tackling a bi-level optimization\nproblem. This process is often inefficient in computation and memory usage. In\nthis work, we propose a new optimization framework for semi-supervised learning\non graphs. The proposed framework can be conveniently solved by the alternating\noptimization algorithms, resulting in significantly improved efficiency.\nExtensive experiments demonstrate that the proposed method can achieve\ncomparable or better performance with state-of-the-art baselines while it has\nsignificantly better computation and memory efficiency.\n","authors":["Haoyu Han","Xiaorui Liu","Haitao Mao","MohamadAli Torkamani","Feng Shi","Victor Lee","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2206.03638v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09771v1","updated":"2023-07-19T06:17:16Z","published":"2023-07-19T06:17:16Z","title":"A Novel Spatial-Temporal Variational Quantum Circuit to Enable Deep\n Learning on NISQ Devices","summary":" Quantum computing presents a promising approach for machine learning with its\ncapability for extremely parallel computation in high-dimension through\nsuperposition and entanglement. 
Despite its potential, existing quantum\nlearning algorithms, such as Variational Quantum Circuits(VQCs), face\nchallenges in handling more complex datasets, particularly those that are not\nlinearly separable. What's more, it encounters the deployability issue, making\nthe learning models suffer a drastic accuracy drop after deploying them to the\nactual quantum devices. To overcome these limitations, this paper proposes a\nnovel spatial-temporal design, namely ST-VQC, to integrate non-linearity in\nquantum learning and improve the robustness of the learning model to noise.\nSpecifically, ST-VQC can extract spatial features via a novel block-based\nencoding quantum sub-circuit coupled with a layer-wise computation quantum\nsub-circuit to enable temporal-wise deep learning. Additionally, a SWAP-Free\nphysical circuit design is devised to improve robustness. These designs bring a\nnumber of hyperparameters. After a systematic analysis of the design space for\neach design component, an automated optimization framework is proposed to\ngenerate the ST-VQC quantum circuit. The proposed ST-VQC has been evaluated on\ntwo IBM quantum processors, ibm_cairo with 27 qubits and ibmq_lima with 7\nqubits to assess its effectiveness. The results of the evaluation on the\nstandard dataset for binary classification show that ST-VQC can achieve over\n30% accuracy improvement compared with existing VQCs on actual quantum\ncomputers. Moreover, on a non-linear synthetic dataset, the ST-VQC outperforms\na linear classifier by 27.9%, while the linear classifier using classical\ncomputing outperforms the existing VQC by 15.58%.\n","authors":["Jinyang Li","Zhepeng Wang","Zhirui Hu","Prasanna Date","Ang Li","Weiwen Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.09771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11434v2","updated":"2023-07-19T06:17:10Z","published":"2022-03-22T03:13:39Z","title":"Non-linear Embeddings in Hilbert Simplex Geometry","summary":" A key technique of machine learning and computer vision is to embed discrete\nweighted graphs into continuous spaces for further downstream processing.\nEmbedding discrete hierarchical structures in hyperbolic geometry has proven\nvery successful since it was shown that any weighted tree can be embedded in\nthat geometry with arbitrary low distortion. Various optimization methods for\nhyperbolic embeddings based on common models of hyperbolic geometry have been\nstudied. In this paper, we consider Hilbert geometry for the standard simplex\nwhich is isometric to a vector space equipped with the variation polytope norm.\nWe study the representation power of this Hilbert simplex geometry by embedding\ndistance matrices of graphs. Our findings demonstrate that Hilbert simplex\ngeometry is competitive to alternative geometries such as the Poincar\\'e\nhyperbolic ball or the Euclidean geometry for embedding tasks while being fast\nand numerically robust.\n","authors":["Frank Nielsen","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2203.11434v2.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.09768v1","updated":"2023-07-19T06:05:33Z","published":"2023-07-19T06:05:33Z","title":"How Curvature Enhance the Adaptation Power of Framelet GCNs","summary":" Graph neural network (GNN) has been demonstrated powerful in modeling\ngraph-structured data. 
However, despite many successful cases of applying GNNs\nto various graph classification and prediction tasks, whether the graph\ngeometrical information has been fully exploited to enhance the learning\nperformance of GNNs is not yet well understood. This paper introduces a new\napproach to enhance GNN by discrete graph Ricci curvature. Specifically, the\ngraph Ricci curvature defined on the edges of a graph measures how difficult\nthe information transits on one edge from one node to another based on their\nneighborhoods. Motivated by the geometric analogy of Ricci curvature in the\ngraph setting, we prove that by inserting the curvature information with\ndifferent carefully designed transformation function $\\zeta$, several known\ncomputational issues in GNN such as over-smoothing can be alleviated in our\nproposed model. Furthermore, we verified that edges with very positive Ricci\ncurvature (i.e., $\\kappa_{i,j} \\approx 1$) are preferred to be dropped to\nenhance model's adaption to heterophily graph and one curvature based graph\nedge drop algorithm is proposed. Comprehensive experiments show that our\ncurvature-based GNN model outperforms the state-of-the-art baselines in both\nhomophily and heterophily graph datasets, indicating the effectiveness of\ninvolving graph geometric information in GNNs.\n","authors":["Dai Shi","Yi Guo","Zhiqi Shao","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2307.09768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14048v2","updated":"2023-07-19T06:02:38Z","published":"2023-06-24T20:11:14Z","title":"H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large\n Language Models","summary":" Large Language Models (LLMs), despite their recent impressive\naccomplishments, are notably cost-prohibitive to deploy, particularly for\napplications involving long-content generation, such as dialogue systems and\nstory writing. Often, a large amount of transient state information, referred\nto as the KV cache, is stored in GPU memory in addition to model parameters,\nscaling linearly with the sequence length and batch size. In this paper, we\nintroduce a novel approach for implementing the KV cache which significantly\nreduces its memory footprint. Our approach is based on the noteworthy\nobservation that a small portion of tokens contributes most of the value when\ncomputing attention scores. We call these tokens Heavy Hitters (H$_2$). Through\na comprehensive investigation, we find that (i) the emergence of H$_2$ is\nnatural and strongly correlates with the frequent co-occurrence of tokens in\nthe text, and (ii) removing them results in significant performance\ndegradation. Based on these insights, we propose Heavy Hitter Oracle (H$_2$O),\na KV cache eviction policy that dynamically retains a balance of recent and\nH$_2$ tokens. We formulate the KV cache eviction as a dynamic submodular\nproblem and prove (under mild assumptions) a theoretical guarantee for our\nnovel eviction algorithm which could help guide future work. We validate the\naccuracy of our algorithm with OPT, LLaMA, and GPT-NeoX across a wide range of\ntasks. Our implementation of H$_2$O with 20% heavy hitters improves the\nthroughput over three leading inference systems DeepSpeed Zero-Inference,\nHugging Face Accelerate, and FlexGen by up to 29$\\times$, 29$\\times$, and\n3$\\times$ on OPT-6.7B and OPT-30B. With the same batch size, H2O can reduce the\nlatency by up to 1.9$\\times$. 
The code is available at\nhttps://github.com/FMInference/H2O.\n","authors":["Zhenyu Zhang","Ying Sheng","Tianyi Zhou","Tianlong Chen","Lianmin Zheng","Ruisi Cai","Zhao Song","Yuandong Tian","Christopher Ré","Clark Barrett","Zhangyang Wang","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09767v1","updated":"2023-07-19T05:58:21Z","published":"2023-07-19T05:58:21Z","title":"Sig-Splines: universal approximation and convex calibration of time\n series generative models","summary":" We propose a novel generative model for multivariate discrete-time time\nseries data. Drawing inspiration from the construction of neural spline flows,\nour algorithm incorporates linear transformations and the signature transform\nas a seamless substitution for traditional neural networks. This approach\nenables us to achieve not only the universality property inherent in neural\nnetworks but also introduces convexity in the model's parameters.\n","authors":["Magnus Wiese","Phillip Murray","Ralf Korn"],"pdf_url":"https://arxiv.org/pdf/2307.09767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08621v2","updated":"2023-07-19T05:56:42Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02918v3","updated":"2023-07-19T05:51:00Z","published":"2023-03-06T06:28:20Z","title":"Graph Positional Encoding via Random Feature Propagation","summary":" Two main families of node feature augmentation schemes have been explored for\nenhancing GNNs: random features and spectral positional encoding. Surprisingly,\nhowever, there is still no clear understanding of the relation between these\ntwo augmentation schemes. Here we propose a novel family of positional encoding\nschemes which draws a link between the above two approaches and improves over\nboth. The new approach, named Random Feature Propagation (RFP), is inspired by\nthe power iteration method and its generalizations. 
It concatenates several\nintermediate steps of an iterative algorithm for computing the dominant\neigenvectors of a propagation matrix, starting from random node features.\nNotably, these propagation steps are based on graph-dependent propagation\noperators that can be either predefined or learned. We explore the theoretical\nand empirical benefits of RFP. First, we provide theoretical justifications for\nusing random features, for incorporating early propagation steps, and for using\nmultiple random initializations. Then, we empirically demonstrate that RFP\nsignificantly outperforms both spectral PE and random features in multiple node\nclassification and graph classification benchmarks.\n","authors":["Moshe Eliasof","Fabrizio Frasca","Beatrice Bevilacqua","Eran Treister","Gal Chechik","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2303.02918v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.09762v1","updated":"2023-07-19T05:45:05Z","published":"2023-07-19T05:45:05Z","title":"Reinforcing POD based model reduction techniques in reaction-diffusion\n complex networks using stochastic filtering and pattern recognition","summary":" Complex networks are used to model many real-world systems. However, the\ndimensionality of these systems can make them challenging to analyze.\nDimensionality reduction techniques like POD can be used in such cases.\nHowever, these models are susceptible to perturbations in the input data. We\npropose an algorithmic framework that combines techniques from pattern\nrecognition (PR) and stochastic filtering theory to enhance the output of such\nmodels. The results of our study show that our method can improve the accuracy\nof the surrogate model under perturbed inputs. Deep Neural Networks (DNNs) are\nsusceptible to adversarial attacks. However, recent research has revealed that\nneural Ordinary Differential Equations (ODEs) exhibit robustness in specific\napplications. We benchmark our algorithmic framework with a Neural ODE-based\napproach as a reference.\n","authors":["Abhishek Ajayakumar","Soumyendu Raha"],"pdf_url":"https://arxiv.org/pdf/2307.09762v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.04603v3","updated":"2023-07-19T05:43:44Z","published":"2023-07-07T09:01:42Z","title":"Solvent: A Framework for Protein Folding","summary":" Consistency and reliability are crucial for conducting AI research. Many\nfamous research fields, such as object detection, have been compared and\nvalidated with solid benchmark frameworks. After AlphaFold2, the protein\nfolding task has entered a new phase, and many methods are proposed based on\nthe components of AlphaFold2. A unified research framework for protein folding\nshould provide implementations and benchmarks to consistently and fairly\ncompare various approaches. To achieve this, we present Solvent, a protein\nfolding framework that supports significant components of state-of-the-art\nmodels through an off-the-shelf interface. Solvent contains different models\nimplemented in a unified codebase and supports training and evaluation for\ndefined models on the same dataset. We benchmark well-known algorithms and\ntheir components and provide experiments that give helpful insights into the\nprotein structure modeling field. We hope that Solvent will increase the\nreliability and consistency of proposed models and improve efficiency in both\nspeed and cost, accelerating research on protein folding modeling. 
The code is available at\nhttps://github.com/kakaobrain/solvent, and the project will continue to be\ndeveloped.\n","authors":["Jaemyung Lee","Kyeongtak Han","Jaehoon Kim","Hasun Yu","Youhan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.04603v3.pdf","comment":"preprint, 8pages"},{"id":"http://arxiv.org/abs/2307.09759v1","updated":"2023-07-19T05:41:40Z","published":"2023-07-19T05:41:40Z","title":"Constructing Extreme Learning Machines with zero Spectral Bias","summary":" The phenomena of Spectral Bias, where the higher frequency components of a\nfunction being learnt in a feedforward Artificial Neural Network (ANN) are seen\nto converge more slowly than the lower frequencies, is observed ubiquitously\nacross ANNs. This has created technology challenges in fields where resolution\nof higher frequencies is crucial, like in Physics Informed Neural Networks\n(PINNs). Extreme Learning Machines (ELMs) that obviate an iterative solution\nprocess which provides the theoretical basis of Spectral Bias (SB), should in\nprinciple be free of the same. This work verifies the reliability of this\nassumption, and shows that it is incorrect. However, the structure of ELMs\nmakes them naturally amenable to implementation of variants of Fourier Feature\nEmbeddings, which have been shown to mitigate SB in ANNs. This approach is\nimplemented and verified to completely eliminate SB, thus bringing into\nfeasibility the application of ELMs for practical problems like PINNs where\nresolution of higher frequencies is essential.\n","authors":["Kaumudi Joshi","Vukka Snigdha","Arya Kumar Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.09759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12239v2","updated":"2023-07-19T05:32:04Z","published":"2023-05-20T17:13:06Z","title":"Off-Policy Average Reward Actor-Critic with Deterministic Policy Search","summary":" The average reward criterion is relatively less studied as most existing\nworks in the Reinforcement Learning literature consider the discounted reward\ncriterion. There are few recent works that present on-policy average reward\nactor-critic algorithms, but average reward off-policy actor-critic is\nrelatively less explored. In this work, we present both on-policy and\noff-policy deterministic policy gradient theorems for the average reward\nperformance criterion. Using these theorems, we also present an Average Reward\nOff-Policy Deep Deterministic Policy Gradient (ARO-DDPG) Algorithm. We first\nshow asymptotic convergence analysis using the ODE-based method. Subsequently,\nwe provide a finite time analysis of the resulting stochastic approximation\nscheme with linear function approximator and obtain an $\\epsilon$-optimal\nstationary policy with a sample complexity of $\\Omega(\\epsilon^{-2.5})$. We\ncompare the average reward performance of our proposed ARO-DDPG algorithm and\nobserve better empirical performance compared to state-of-the-art on-policy\naverage reward actor-critic algorithms over MuJoCo-based environments.\n","authors":["Naman Saxena","Subhojyoti Khastigir","Shishir Kolathaya","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2305.12239v2.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2208.06265v2","updated":"2023-07-19T05:08:06Z","published":"2022-08-10T08:28:46Z","title":"Trustworthy Recommender Systems","summary":" Recommender systems (RSs) aim to help users to effectively retrieve items of\ntheir interests from a large catalogue. 
For a quite long period of time,\nresearchers and practitioners have been focusing on developing accurate RSs.\nRecent years have witnessed an increasing number of threats to RSs, coming from\nattacks, system and user generated noise, system bias. As a result, it has\nbecome clear that a strict focus on RS accuracy is limited and the research\nmust consider other important factors, e.g., trustworthiness. For end users, a\ntrustworthy RS (TRS) should not only be accurate, but also transparent,\nunbiased and fair as well as robust to noise or attacks. These observations\nactually led to a paradigm shift of the research on RSs: from accuracy-oriented\nRSs to TRSs. However, researchers lack a systematic overview and discussion of\nthe literature in this novel and fast developing field of TRSs. To this end, in\nthis paper, we provide an overview of TRSs, including a discussion of the\nmotivation and basic concepts of TRSs, a presentation of the challenges in\nbuilding TRSs, and a perspective on the future directions in this area. We also\nprovide a novel conceptual framework to support the construction of TRSs.\n","authors":["Shoujin Wang","Xiuzhen Zhang","Yan Wang","Huan Liu","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2208.06265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01646v2","updated":"2023-07-19T04:59:35Z","published":"2023-07-04T10:58:42Z","title":"SwinGNN: Rethinking Permutation Invariance in Diffusion Models for Graph\n Generation","summary":" Diffusion models based on permutation-equivariant networks can learn\npermutation-invariant distributions for graph data. However, in comparison to\ntheir non-invariant counterparts, we have found that these invariant models\nencounter greater learning challenges since 1) their effective target\ndistributions exhibit more modes; 2) their optimal one-step denoising scores\nare the score functions of Gaussian mixtures with more components. Motivated by\nthis analysis, we propose a non-invariant diffusion model, called\n$\\textit{SwinGNN}$, which employs an efficient edge-to-edge 2-WL message\npassing network and utilizes shifted window based self-attention inspired by\nSwinTransformers. Further, through systematic ablations, we identify several\ncritical training and sampling techniques that significantly improve the sample\nquality of graph generation. At last, we introduce a simple post-processing\ntrick, $\\textit{i.e.}$, randomly permuting the generated graphs, which provably\nconverts any graph generative model to a permutation-invariant one. Extensive\nexperiments on synthetic and real-world protein and molecule datasets show that\nour SwinGNN achieves state-of-the-art performances. Our code is released at\nhttps://github.com/qiyan98/SwinGNN.\n","authors":["Qi Yan","Zhengyang Liang","Yang Song","Renjie Liao","Lele Wang"],"pdf_url":"https://arxiv.org/pdf/2307.01646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03597v4","updated":"2023-07-19T04:52:33Z","published":"2022-06-07T21:30:58Z","title":"Meta-Learning Parameterized Skills","summary":" We propose a novel parameterized skill-learning algorithm that aims to learn\ntransferable parameterized skills and synthesize them into a new action space\nthat supports efficient learning in long-horizon tasks. We propose to leverage\noff-policy Meta-RL combined with a trajectory-centric smoothness term to learn\na set of parameterized skills. 
Our agent can use these learned skills to\nconstruct a three-level hierarchical framework that models a\nTemporally-extended Parameterized Action Markov Decision Process. We\nempirically demonstrate that the proposed algorithms enable an agent to solve a\nset of difficult long-horizon (obstacle-course and robot manipulation) tasks.\n","authors":["Haotian Fu","Shangqun Yu","Saket Tiwari","Michael Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2206.03597v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09742v1","updated":"2023-07-19T04:07:33Z","published":"2023-07-19T04:07:33Z","title":"Improved Distribution Matching for Dataset Condensation","summary":" Dataset Condensation aims to condense a large dataset into a smaller one\nwhile maintaining its ability to train a well-performing model, thus reducing\nthe storage cost and training effort in deep learning applications. However,\nconventional dataset condensation methods are optimization-oriented and\ncondense the dataset by performing gradient or parameter matching during model\noptimization, which is computationally intensive even on small datasets and\nmodels. In this paper, we propose a novel dataset condensation method based on\ndistribution matching, which is more efficient and promising. Specifically, we\nidentify two important shortcomings of naive distribution matching (i.e.,\nimbalanced feature numbers and unvalidated embeddings for distance computation)\nand address them with three novel techniques (i.e., partitioning and expansion\naugmentation, efficient and enriched model sampling, and class-aware\ndistribution regularization). Our simple yet effective method outperforms most\nprevious optimization-oriented methods with much fewer computational resources,\nthereby scaling data condensation to larger datasets and models. Extensive\nexperiments demonstrate the effectiveness of our method. Codes are available at\nhttps://github.com/uitrbn/IDM\n","authors":["Ganlong Zhao","Guanbin Li","Yipeng Qin","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09742v1.pdf","comment":"CVPR2023"},{"id":"http://arxiv.org/abs/2302.11665v2","updated":"2023-07-19T04:03:11Z","published":"2023-02-22T21:41:34Z","title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep\n Learning Serving","summary":" Model parallelism is conventionally viewed as a method to scale a single\nlarge deep learning model beyond the memory limits of a single device. In this\npaper, we demonstrate that model parallelism can be additionally used for the\nstatistical multiplexing of multiple devices when serving multiple models, even\nwhen a single model can fit into a single device. Our work reveals a\nfundamental trade-off between the overhead introduced by model parallelism and\nthe opportunity to exploit statistical multiplexing to reduce serving latency\nin the presence of bursty workloads. We explore the new trade-off space and\npresent a novel serving system, AlpaServe, that determines an efficient\nstrategy for placing and parallelizing collections of large deep learning\nmodels across a distributed cluster. Evaluation results on production workloads\nshow that AlpaServe can process requests at up to 10x higher rates or 6x more\nburstiness while staying within latency constraints for more than 99% of\nrequests.\n","authors":["Zhuohan Li","Lianmin Zheng","Yinmin Zhong","Vincent Liu","Ying Sheng","Xin Jin","Yanping Huang","Zhifeng Chen","Hao Zhang","Joseph E. 
Gonzalez","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2302.11665v2.pdf","comment":"OSDI 2023"},{"id":"http://arxiv.org/abs/2305.16165v2","updated":"2023-07-19T02:42:46Z","published":"2023-05-11T21:20:29Z","title":"A Conceptual Model for End-to-End Causal Discovery in Knowledge Tracing","summary":" In this paper, we take a preliminary step towards solving the problem of\ncausal discovery in knowledge tracing, i.e., finding the underlying causal\nrelationship among different skills from real-world student response data. This\nproblem is important since it can potentially help us understand the causal\nrelationship between different skills without extensive A/B testing, which can\npotentially help educators to design better curricula according to skill\nprerequisite information. Specifically, we propose a conceptual solution, a\nnovel causal gated recurrent unit (GRU) module in a modified deep knowledge\ntracing model, which uses i) a learnable permutation matrix for causal ordering\namong skills and ii) an optionally learnable lower-triangular matrix for causal\nstructure among skills. We also detail how to learn the model parameters in an\nend-to-end, differentiable way. Our solution placed among the top entries in\nTask 3 of the NeurIPS 2022 Challenge on Causal Insights for Learning Paths in\nEducation. We detail preliminary experiments as evaluated on the challenge's\npublic leaderboard since the ground truth causal structure has not been\npublicly released, making detailed local evaluation impossible.\n","authors":["Nischal Ashok Kumar","Wanyong Feng","Jaewook Lee","Hunter McNichols","Aritra Ghosh","Andrew Lan"],"pdf_url":"https://arxiv.org/pdf/2305.16165v2.pdf","comment":"16th International Conference on Educational Data Mining (EDM 2023)"},{"id":"http://arxiv.org/abs/2305.00909v4","updated":"2023-07-19T02:41:58Z","published":"2023-04-28T01:47:09Z","title":"Outline, Then Details: Syntactically Guided Coarse-To-Fine Code\n Generation","summary":" For a complicated algorithm, its implementation by a human programmer usually\nstarts with outlining a rough control flow followed by iterative enrichments,\neventually yielding carefully generated syntactic structures and variables in a\nhierarchy. However, state-of-the-art large language models generate codes in a\nsingle pass, without intermediate warm-ups to reflect the structured thought\nprocess of \"outline-then-detail\". Inspired by the recent success of\nchain-of-thought prompting, we propose ChainCoder, a program synthesis language\nmodel that generates Python code progressively, i.e. from coarse to fine in\nmultiple passes. We first decompose source code into layout frame components\nand accessory components via abstract syntax tree parsing to construct a\nhierarchical representation. We then reform our prediction target into a\nmulti-pass objective, each pass generates a subsequence, which is concatenated\nin the hierarchy. Finally, a tailored transformer architecture is leveraged to\njointly encode the natural language descriptions and syntactically aligned I/O\ndata samples. Extensive evaluations show that ChainCoder outperforms\nstate-of-the-arts, demonstrating that our progressive generation eases the\nreasoning procedure and guides the language model to generate higher-quality\nsolutions. 
Our codes are available at:\nhttps://github.com/VITA-Group/ChainCoder.\n","authors":["Wenqing Zheng","S P Sharan","Ajay Kumar Jaiswal","Kevin Wang","Yihan Xi","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2305.00909v4.pdf","comment":"Accepted in ICML 2023"},{"id":"http://arxiv.org/abs/2307.09706v1","updated":"2023-07-19T01:37:31Z","published":"2023-07-19T01:37:31Z","title":"RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap","summary":" Taxonomies are an essential knowledge representation, yet most studies on\nautomatic taxonomy construction (ATC) resort to manual evaluation to score\nproposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just\nas important as taxonomy construction. We propose RaTE, an automatic label-free\ntaxonomy scoring procedure, which relies on a large pre-trained language model.\nWe apply our evaluation procedure to three state-of-the-art ATC algorithms with\nwhich we built seven taxonomies from the Yelp domain, and show that 1) RaTE\ncorrelates well with human judgments and 2) artificially degrading a taxonomy\nleads to decreasing RaTE score.\n","authors":["Tianjian Gao","Phillipe Langlais"],"pdf_url":"https://arxiv.org/pdf/2307.09706v1.pdf","comment":"15th International Conference on Computational Semantics (IWCS),\n Association for Computational Linguistics (ACL)"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. 
Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09702v1","updated":"2023-07-19T01:14:49Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for LLMs","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09692v1","updated":"2023-07-19T00:31:58Z","published":"2023-07-19T00:31:58Z","title":"STRAPPER: Preference-based Reinforcement Learning via Self-training\n Augmentation and Peer Regularization","summary":" Preference-based reinforcement learning (PbRL) promises to learn a complex\nreward function with binary human preference. However, such human-in-the-loop\nformulation requires considerable human effort to assign preference labels to\nsegment pairs, hindering its large-scale applications. Recent approaches have\ntried to reuse unlabeled segments, which implicitly elucidates the distribution\nof segments and thereby alleviates the human effort. Consistency\nregularization is further considered to improve the performance of\nsemi-supervised learning. However, we notice that, unlike general\nclassification tasks, in PbRL there exists a unique phenomenon that we define\nas the similarity trap in this paper. Intuitively, humans can have diametrically\nopposite preferences for similar segment pairs, and such similarity may cause\nconsistency regularization to fail in PbRL. Due to the existence of the\nsimilarity trap, such consistency regularization improperly enhances the\nconsistency possibility of the model's predictions between segment pairs, and\nthus reduces the confidence in reward learning, since the augmented\ndistribution does not match the original one in PbRL. To overcome this issue,\nwe present a self-training method along with our proposed peer regularization,\nwhich penalizes the reward model for memorizing uninformative labels and\nacquires confident predictions. Empirically, we demonstrate that our approach\nis capable of learning a variety of locomotion and robotic manipulation\nbehaviors well using different semi-supervised alternatives and peer\nregularization.\n","authors":["Yachen Kang","Li He","Jinxin Liu","Zifeng Zhuang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09691v1","updated":"2023-07-19T00:27:49Z","published":"2023-07-19T00:27:49Z","title":"Joint Service Caching, Communication and Computing Resource Allocation\n in Collaborative MEC Systems: A DRL-based Two-timescale Approach","summary":" Meeting the strict Quality of Service (QoS) requirements of terminals has\nimposed a significant challenge on Multiaccess Edge Computing (MEC) systems,\ndue to the limited multidimensional resources. 
To address this challenge, we\npropose a collaborative MEC framework that facilitates resource sharing between\nthe edge servers, with the aim of maximizing the long-term QoS and reducing the\ncache switching cost through joint optimization of service caching,\ncollaborative offloading, and computation and communication resource\nallocation. The dual timescale feature and temporal recurrence relationship\nbetween service caching and other resource allocation make solving the problem\neven more challenging. To solve it, we propose a deep reinforcement learning\n(DRL)-based dual timescale scheme, called DGL-DDPG, which is composed of a\nshort-term genetic algorithm (GA) and a long short-term memory network-based\ndeep deterministic policy gradient (LSTM-DDPG). In doing so, we reformulate the\noptimization problem as a Markov decision process (MDP) where the\nsmall-timescale resource allocation decisions generated by an improved GA are\ntaken as the states and input into a centralized LSTM-DDPG agent to generate\nthe service caching decision for the large timescale. Simulation results\ndemonstrate that our proposed algorithm outperforms the baseline algorithms in\nterms of the average QoS and cache switching cost.\n","authors":["Qianqian Liu","Haixia Zhang","Xin Zhang","Dongfeng Yuan"],"pdf_url":"https://arxiv.org/pdf/2307.09691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09688v1","updated":"2023-07-19T00:08:49Z","published":"2023-07-19T00:08:49Z","title":"Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for\n Recommendation and Text Generation","summary":" Modeling customer shopping intentions is a crucial task for e-commerce, as it\ndirectly impacts user experience and engagement. Thus, accurately understanding\ncustomer preferences is essential for providing personalized recommendations.\nSession-based recommendation, which utilizes customer session data to predict\ntheir next interaction, has become increasingly popular. However, existing\nsession datasets have limitations in terms of item attributes, user diversity,\nand dataset scale. As a result, they cannot comprehensively capture the\nspectrum of user behaviors and preferences. To bridge this gap, we present the\nAmazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It\nis the first multilingual dataset consisting of millions of user sessions from\nsix different locales, where the major languages of products are English,\nGerman, Japanese, French, Italian, and Spanish. Remarkably, the dataset can\nhelp us enhance personalization and understanding of user preferences, which\ncan benefit various existing tasks as well as enable new tasks. To test the\npotential of the dataset, we introduce three tasks in this work: (1)\nnext-product recommendation, (2) next-product recommendation with domain\nshifts, and (3) next-product title generation. With the above tasks, we\nbenchmark a range of algorithms on our proposed dataset, drawing new insights\nfor further research and practice. In addition, based on the proposed dataset\nand tasks, we hosted a competition in the KDD CUP 2023 and have attracted\nthousands of users and submissions. 
The winning solutions and the associated\nworkshop can be accessed at our website https://kddcup23.github.io/.\n","authors":["Wei Jin","Haitao Mao","Zheng Li","Haoming Jiang","Chen Luo","Hongzhi Wen","Haoyu Han","Hanqing Lu","Zhengyang Wang","Ruirui Li","Zhen Li","Monica Xiao Cheng","Rahul Goutam","Haiyang Zhang","Karthik Subbian","Suhang Wang","Yizhou Sun","Jiliang Tang","Bing Yin","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2307.09688v1.pdf","comment":"Dataset for KDD Cup 2023, https://kddcup23.github.io/"},{"id":"http://arxiv.org/abs/2210.01834v2","updated":"2023-07-19T23:50:47Z","published":"2022-10-04T18:06:29Z","title":"Invariant Aggregator for Defending against Federated Backdoor Attacks","summary":" Federated learning is gaining popularity as it enables training high-utility\nmodels across several clients without directly sharing their private data. As a\ndownside, the federated setting makes the model vulnerable to various\nadversarial attacks in the presence of malicious clients. Despite the\ntheoretical and empirical success in defending against attacks that aim to\ndegrade models' utility, defense against backdoor attacks that increase model\naccuracy on backdoor samples exclusively without hurting the utility on other\nsamples remains challenging. To this end, we first analyze the vulnerability of\nfederated learning to backdoor attacks over a flat loss landscape which is\ncommon for well-designed neural networks such as Resnet [He et al., 2015] but\nis often overlooked by previous works. Over a flat loss landscape, misleading\nfederated learning models to exclusively benefit malicious clients with\nbackdoor samples do not require a significant difference between malicious and\nbenign client-wise updates, making existing defenses insufficient. In contrast,\nwe propose an invariant aggregator that redirects the aggregated update to\ninvariant directions that are generally useful via selectively masking out the\ngradient elements that favor few and possibly malicious clients regardless of\nthe difference magnitude. Theoretical results suggest that our approach\nprovably mitigates backdoor attacks over both flat and sharp loss landscapes.\nEmpirical results on three datasets with different modalities and varying\nnumbers of clients further demonstrate that our approach mitigates a broad\nclass of backdoor attacks with a negligible cost on the model utility.\n","authors":["Xiaoyang Wang","Dimitrios Dimitriadis","Sanmi Koyejo","Shruti Tople"],"pdf_url":"https://arxiv.org/pdf/2210.01834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12877v2","updated":"2023-07-19T23:38:55Z","published":"2022-07-26T13:12:22Z","title":"Representing Random Utility Choice Models with Neural Networks","summary":" Motivated by the successes of deep learning, we propose a class of neural\nnetwork-based discrete choice models, called RUMnets, inspired by the random\nutility maximization (RUM) framework. This model formulates the agents' random\nutility function using a sample average approximation. We show that RUMnets\nsharply approximate the class of RUM discrete choice models: any model derived\nfrom random utility maximization has choice probabilities that can be\napproximated arbitrarily closely by a RUMnet. Reciprocally, any RUMnet is\nconsistent with the RUM principle. 
We derive an upper bound on the\ngeneralization error of RUMnets fitted on choice data, and gain theoretical\ninsights on their ability to predict choices on new, unseen data depending on\ncritical parameters of the dataset and architecture. By leveraging open-source\nlibraries for neural networks, we find that RUMnets are competitive against\nseveral choice modeling and machine learning methods in terms of predictive\naccuracy on two real-world datasets.\n","authors":["Ali Aouad","Antoine Désir"],"pdf_url":"https://arxiv.org/pdf/2207.12877v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.00419v3","updated":"2023-07-19T16:00:08Z","published":"2022-06-18T00:26:52Z","title":"Self-Supervised Learning for Videos: A Survey","summary":" The remarkable success of deep learning in various domains relies on the\navailability of large-scale annotated datasets. However, obtaining annotations\nis expensive and requires great effort, which is especially challenging for\nvideos. Moreover, the use of human-generated annotations leads to models with\nbiased learning and poor domain generalization and robustness. As an\nalternative, self-supervised learning provides a way for representation\nlearning which does not require annotations and has shown promise in both image\nand video domains. Different from the image domain, learning video\nrepresentations are more challenging due to the temporal dimension, bringing in\nmotion and other environmental dynamics. This also provides opportunities for\nvideo-exclusive ideas that advance self-supervised learning in the video and\nmultimodal domain. In this survey, we provide a review of existing approaches\non self-supervised learning focusing on the video domain. We summarize these\nmethods into four different categories based on their learning objectives: 1)\npretext tasks, 2) generative learning, 3) contrastive learning, and 4)\ncross-modal agreement. We further introduce the commonly used datasets,\ndownstream evaluation tasks, insights into the limitations of existing works,\nand the potential future directions in this area.\n","authors":["Madeline C. Schiappa","Yogesh S. Rawat","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2207.00419v3.pdf","comment":"ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. 
Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09936v1","updated":"2023-07-19T12:21:39Z","published":"2023-07-19T12:21:39Z","title":"AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point\n Clouds of Deformable Objects","summary":" This paper focuses on motion prediction for point cloud sequences in the\nchallenging case of deformable 3D objects, such as human body motion. First, we\ninvestigate the challenges caused by deformable shapes and complex motions\npresent in this type of representation, with the ultimate goal of understanding\nthe technical limitations of state-of-the-art models. From this understanding,\nwe propose an improved architecture for point cloud prediction of deformable 3D\nobjects. Specifically, to handle deformable shapes, we propose a graph-based\napproach that learns and exploits the spatial structure of point clouds to\nextract more representative features. Then we propose a module able to combine\nthe learned features in an adaptative manner according to the point cloud\nmovements. The proposed adaptative module controls the composition of local and\nglobal motions for each point, enabling the network to model complex motions in\ndeformable 3D objects more effectively. We tested the proposed method on the\nfollowing datasets: MNIST moving digits, the Mixamo human bodies motions, JPEG\nand CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate that\nour method outperforms the current baseline methods given its improved ability\nto model complex movements as well as preserve point cloud shape. Furthermore,\nwe demonstrate the generalizability of the proposed framework for dynamic\nfeature learning, by testing the framework for action recognition on the\nMSRAction3D dataset and achieving results on-par with state-of-the-art methods\n","authors":["Pedro Gomes","Silvia Rossi","Laura Toni"],"pdf_url":"https://arxiv.org/pdf/2307.09936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09915v1","updated":"2023-07-19T11:35:21Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is confronted with both cross-lingual and\ncross-modal challenges for multimedia analysis. The crucial issue in this task\nis to model the global and local matching between the image and different\nlanguages. Existing cross-modal embedding methods based on Transformer\narchitecture oversight the local matching between the image region and\nmonolingual words, not to mention in the face of a variety of differentiated\nlanguages. 
Due to the heterogeneous property of the cross-modal and\ncross-lingual task, we utilize the heterogeneous network to establish\ncross-domain relationships and the local correspondences between the image and\ndifferent languages. In this paper, we propose an Embedded Heterogeneous\nAttention Transformer (EHAT) to build reasoning paths bridging cross-domain for\ncross-lingual image captioning and integrate into transformer. The proposed\nEHAT consists of a Masked Heterogeneous Cross-attention (MHCA), Heterogeneous\nAttention Reasoning Network (HARN) and Heterogeneous Co-attention (HCA). HARN\nas the core network, models and infers cross-domain relationship anchored by\nvision bounding box representation features to connect two languages word\nfeatures and learn the heterogeneous maps. MHCA and HCA implement cross-domain\nintegration in the encoder through the special heterogeneous attention and\nenable single model to generate two language captioning. We test on MSCOCO\ndataset to generate English and Chinese, which are most widely used and have\nobvious difference between their language families. Our experiments show that\nour method even achieve better than advanced monolingual methods.\n","authors":["Zijie Song","Zhenzhen Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2307.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09821v1","updated":"2023-07-19T08:16:34Z","published":"2023-07-19T08:16:34Z","title":"Hierarchical Semantic Perceptual Listener Head Video Generation: A\n High-performance Pipeline","summary":" In dyadic speaker-listener interactions, the listener's head reactions along\nwith the speaker's head movements, constitute an important non-verbal semantic\nexpression together. The listener Head generation task aims to synthesize\nresponsive listener's head videos based on audios of the speaker and reference\nimages of the listener. Compared to the Talking-head generation, it is more\nchallenging to capture the correlation clues from the speaker's audio and\nvisual information. Following the ViCo baseline scheme, we propose a\nhigh-performance solution by enhancing the hierarchical semantic extraction\ncapability of the audio encoder module and improving the decoder part, renderer\nand post-processing modules. Our solution gets the first place on the official\nleaderboard for the track of listening head generation. This paper is a\ntechnical report of ViCo@2023 Conversational Head Generation Challenge in ACM\nMultimedia 2023 conference.\n","authors":["Zhigang Chang","Weitai Hu","Qing Yang","Shibao Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09821v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.07848v5","updated":"2023-07-19T04:56:33Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning based cross-modality pretraining methods have recently\nexhibited impressive success in diverse fields. In this paper, we propose\nGEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio\npretraining (CLAP) method for speech emotion recognition. Specifically, a novel\nemotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised\npre-trained models. 
Second, considering the importance of gender attribute in\nspeech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and\nmulti-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to\nintegrate the emotion and gender information of speech signals, forming more\nreasonable objectives. Extensive experiments on IEMOCAP show that our proposed\ntwo GEmo-CLAP models consistently outperform the baseline Emo-CLAP with\ndifferent pre-trained models, while also achieving the best recognition\nperformance compared with recent state-of-the-art methods. Noticeably, the\nproposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\\% and WAR of\n82.06\\%.\n","authors":["Yu Pan","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2306.07848v5.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2307.09729v1","updated":"2023-07-19T02:33:42Z","published":"2023-07-19T02:33:42Z","title":"NTIRE 2023 Quality Assessment of Video Enhancement Challenge","summary":" This paper reports on the NTIRE 2023 Quality Assessment of Video Enhancement\nChallenge, which will be held in conjunction with the New Trends in Image\nRestoration and Enhancement Workshop (NTIRE) at CVPR 2023. This challenge is to\naddress a major challenge in the field of video processing, namely, video\nquality assessment (VQA) for enhanced videos. The challenge uses the VQA\nDataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211\nenhanced videos, including 600 videos with color, brightness, and contrast\nenhancements, 310 videos with deblurring, and 301 deshaked videos. The\nchallenge has a total of 167 registered participants. 61 participating teams\nsubmitted their prediction results during the development phase, with a total\nof 3168 submissions. A total of 176 submissions were submitted by 37\nparticipating teams during the final testing phase. Finally, 19 participating\nteams submitted their models and fact sheets, and detailed the methods they\nused. 
Some methods have achieved better results than baseline methods, and the\nwinning methods have demonstrated superior prediction performance.\n","authors":["Xiaohong Liu","Xiongkuo Min","Wei Sun","Yulun Zhang","Kai Zhang","Radu Timofte","Guangtao Zhai","Yixuan Gao","Yuqin Cao","Tengchuan Kou","Yunlong Dong","Ziheng Jia","Yilin Li","Wei Wu","Shuming Hu","Sibin Deng","Pengxiang Xiao","Ying Chen","Kai Li","Kai Zhao","Kun Yuan","Ming Sun","Heng Cong","Hao Wang","Lingzhi Fu","Yusheng Zhang","Rongyu Zhang","Hang Shi","Qihang Xu","Longan Xiao","Zhiliang Ma","Mirko Agarla","Luigi Celona","Claudio Rota","Raimondo Schettini","Zhiwei Huang","Yanan Li","Xiaotao Wang","Lei Lei","Hongye Liu","Wei Hong","Ironhead Chuang","Allen Lin","Drake Guan","Iris Chen","Kae Lou","Willy Huang","Yachun Tasi","Yvonne Kao","Haotian Fan","Fangyuan Kong","Shiqi Zhou","Hao Liu","Yu Lai","Shanshan Chen","Wenqi Wang","Haoning Wu","Chaofeng Chen","Chunzheng Zhu","Zekun Guo","Shiling Zhao","Haibing Yin","Hongkui Wang","Hanene Brachemi Meftah","Sid Ahmed Fezza","Wassim Hamidouche","Olivier Déforges","Tengfei Shi","Azadeh Mansouri","Hossein Motamednia","Amir Hossein Bakhtiari","Ahmad Mahmoudi Aznaveh"],"pdf_url":"https://arxiv.org/pdf/2307.09729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10346v1","updated":"2023-07-19T17:10:56Z","published":"2023-07-19T17:10:56Z","title":"Estudio de la Experiencia de Usuario mediante un Sistema de Dashboards\n de Análisis de Aprendizaje Multimodal","summary":" In the article, we present a Web-based System called M2LADS, which supports\nthe integration and visualization of multimodal data recorded in user\nexperiences (UX) in a Learning Analytics (LA) system in the form of Web-based\nDashboards. Based on the edBB platform, the multimodal data gathered contains\nbiometric and behavioral signals including electroencephalogram data to measure\nlearners' cognitive attention, heart rate for affective measures and visual\nattention from the video recordings. Additionally, learners' static background\ndata and their learning performance measures are tracked using LOGGE tool.\nM2LADS provides opportunities to capture learners' holistic experience during\ntheir interactions with the learning analytic system in order to improve the\nsystem and the user experience of the learners.\n --\n En este art\\'iculo, presentamos M2LADS, un sistema que permite la\nintegraci\\'on y visualizaci\\'on de datos multimodales en forma de Dashboards\nWeb. Estos datos provienen de sesiones de experiencia de usuario en un sistema\nde Learning Analytics (LA) llevadas a cabo por estudiantes de MOOCs. Los datos\nmultimodales incluyen se\\~nales biom\\'etricas y de comportamiento monitorizados\npor la plataforma edBB, como electroencefalogramas (EEG) de 5 canales,\nfrecuencia card\\'iaca, atenci\\'on visual, videos en el espectro visible y NIR,\nentre otros. Adem\\'as, se incluyen datos de interacci\\'on de los estudiantes\ncon el sistema de LA a trav\\'es de la herramienta LOGGE. Toda esta\ninformaci\\'on proporciona una comprensi\\'on completa de la experiencia del\nusuario al utilizar el sistema de LA, lo que ha permitido tanto mejorar el\nsistema LA como la experiencia de aprendizaje de los estudiantes de MOOCs.\n","authors":["Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2307.10346v1.pdf","comment":"Accepted in \"XXIII CONGRESO INTERNACIONAL DE INTERACCI\\'ON\n PERSONA-ORDENADOR 2023\". Article in Spanish language. 
The abstract in English\n and Spanish. There is an extended abstract of 2 pages in English"}]},"2023-07-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.11088v1","updated":"2023-07-20T17:59:41Z","published":"2023-07-20T17:59:41Z","title":"L-Eval: Instituting Standardized Evaluation for Long Context Language\n Models","summary":" Recently, there has been growing interest in extending the context length of\ninstruction-following models in order to effectively process single-turn long\ninput (e.g. summarizing a paper) and conversations with more extensive\nhistories. While proprietary models such as GPT-4 and Claude have demonstrated\nconsiderable advancements in handling tens of thousands of tokens of context,\nopen-sourced models are still in the early stages of experimentation. It also\nremains unclear whether developing these long context models can offer\nsubstantial gains on practical downstream tasks over retrieval-based methods or\nmodels simply trained on chunked contexts. To address this challenge, we\npropose to institute standardized evaluation for long context language models.\nConcretely, we develop L-Eval which contains 411 long documents and over 2,000\nquery-response pairs manually annotated and checked by the authors encompassing\nareas such as law, finance, school lectures, lengthy conversations, news,\nlong-form novels, and meetings. L-Eval also adopts diverse evaluation methods\nand instruction styles, enabling a more reliable assessment of Long Context\nLanguage Models (LCLMs). Our findings indicate that while open-source models\ntypically lag behind their commercial counterparts, they still exhibit\nimpressive performance. LLaMA2 achieves the best results (win 45\\% vs\nturbo-16k) on open-ended tasks with only 4k context length and ChatGLM2\nachieves the best results on closed-ended tasks with 8k input tokens. We\nrelease our new evaluation suite, code, and all generation results including\npredictions from all open-sourced LCLMs, GPT4-32k, Cluade-100k at\n{\\url{https://github.com/OpenLMLab/LEval}}.\n","authors":["Chenxin An","Shansan Gong","Ming Zhong","Mukai Li","Jun Zhang","Lingpeng Kong","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.11088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10172v2","updated":"2023-07-20T17:59:35Z","published":"2023-07-19T17:57:53Z","title":"DialogStudio: Towards Richest and Most Diverse Unified Dataset\n Collection for Conversational AI","summary":" Despite advancements in conversational AI, language models encounter\nchallenges to handle diverse conversational tasks, and existing dialogue\ndataset collections often lack diversity and comprehensiveness. To tackle these\nissues, we introduce DialogStudio: the largest and most diverse collection of\ndialogue datasets, unified under a consistent format while preserving their\noriginal information. Our collection encompasses data from open-domain\ndialogues, task-oriented dialogues, natural language understanding,\nconversational recommendation, dialogue summarization, and knowledge-grounded\ndialogues, making it an incredibly rich and diverse resource for dialogue\nresearch and model training. To further enhance the utility of DialogStudio, we\nidentify the licenses for each dataset and design domain-aware prompts for\nselected dialogues to facilitate instruction-aware fine-tuning. 
Furthermore, we\ndevelop conversational AI models using the dataset collection, and our\nexperiments in both zero-shot and few-shot learning scenarios demonstrate the\nsuperiority of DialogStudio. To improve transparency and support dataset and\ntask-based research, as well as language model pre-training, all datasets,\nlicenses, codes, and models associated with DialogStudio are made publicly\naccessible at https://github.com/salesforce/DialogStudio\n","authors":["Jianguo Zhang","Kun Qian","Zhiwei Liu","Shelby Heinecke","Rui Meng","Ye Liu","Zhou Yu","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.10172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13867v2","updated":"2023-07-20T17:59:14Z","published":"2023-01-31T18:59:03Z","title":"Mathematical Capabilities of ChatGPT","summary":" We investigate the mathematical capabilities of two iterations of ChatGPT\n(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on\npublicly available datasets, as well as hand-crafted ones, using a novel\nmethodology. In contrast to formal mathematics, where large databases of formal\nproofs are available (e.g., the Lean Mathematical Library), current datasets of\nnatural-language mathematics, used to benchmark language models, either cover\nonly elementary mathematics or are very small. We address this by publicly\nreleasing two new datasets: GHOSTS and miniGHOSTS. These are the first\nnatural-language datasets curated by working researchers in mathematics that\n(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of\nthe mathematical capabilities of language models, and (3) distinguish multiple\ndimensions of mathematical reasoning. These datasets also test whether ChatGPT\nand GPT-4 can be helpful assistants to professional mathematicians by emulating\nuse cases that arise in the daily professional activities of mathematicians. We\nbenchmark the models on a range of fine-grained performance metrics. For\nadvanced mathematics, this is the most detailed evaluation effort to date. We\nfind that ChatGPT can be used most successfully as a mathematical assistant for\nquerying facts, acting as a mathematical search engine and knowledge base\ninterface. GPT-4 can additionally be used for undergraduate-level mathematics\nbut fails on graduate-level difficulty. Contrary to many positive reports in\nthe media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of\nselection bias), their overall mathematical performance is well below the level\nof a graduate student. Hence, if your goal is to use ChatGPT to pass a\ngraduate-level math exam, you would be better off copying from your average\npeer!\n","authors":["Simon Frieder","Luca Pinchetti","Alexis Chevalier","Ryan-Rhys Griffiths","Tommaso Salvatori","Thomas Lukasiewicz","Philipp Christian Petersen","Julius Berner"],"pdf_url":"https://arxiv.org/pdf/2301.13867v2.pdf","comment":"Added further evaluations on another ChatGPT version and on GPT-4.\n The GHOSTS and miniGHOSTS datasets are available at\n https://github.com/xyfrieder/science-GHOSTS"},{"id":"http://arxiv.org/abs/2304.07880v3","updated":"2023-07-20T17:34:39Z","published":"2023-04-16T20:11:19Z","title":"Sabiá: Portuguese Large Language Models","summary":" As the capabilities of language models continue to advance, it is conceivable\nthat \"one-size-fits-all\" model will remain as the main paradigm. 
For instance,\ngiven the vast number of languages worldwide, many of which are low-resource,\nthe prevalent practice is to pretrain a single model on multiple languages. In\nthis paper, we add to the growing body of evidence that challenges this\npractice, demonstrating that monolingual pretraining on the target language\nsignificantly improves models already extensively trained on diverse corpora.\nMore specifically, we further pretrain GPT-J and LLaMA models on Portuguese\ntexts using 3% or less of their original pretraining budget. Few-shot\nevaluations on Poeta, a suite of 14 Portuguese datasets, reveal that our models\noutperform English-centric and multilingual counterparts by a significant\nmargin. Our best model, Sabi\\'a-65B, performs on par with GPT-3.5-turbo. By\nevaluating on datasets originally conceived in the target language as well as\ntranslated ones, we study the contributions of language-specific pretraining in\nterms of 1) capturing linguistic nuances and structures inherent to the target\nlanguage, and 2) enriching the model's knowledge about a domain or culture. Our\nresults indicate that the majority of the benefits stem from the\ndomain-specific knowledge acquired through monolingual pretraining.\n","authors":["Ramon Pires","Hugo Abonizio","Thales Sales Almeida","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2304.07880v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11031v1","updated":"2023-07-20T17:07:28Z","published":"2023-07-20T17:07:28Z","title":"Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot\n Classification","summary":" Recent work has shown that language models' (LMs) prompt-based learning\ncapabilities make them well suited for automating data labeling in domains\nwhere manual annotation is expensive. The challenge is that while writing an\ninitial prompt is cheap, improving a prompt is costly -- practitioners often\nrequire significant labeled data in order to evaluate the impact of prompt\nmodifications. Our work asks whether it is possible to improve prompt-based\nlearning without additional labeled data. We approach this problem by\nattempting to modify the predictions of a prompt, rather than the prompt\nitself. Our intuition is that accurate predictions should also be consistent:\nsamples which are similar under some feature representation should receive the\nsame prompt prediction. We propose Embroid, a method which computes multiple\nrepresentations of a dataset under different embedding functions, and uses the\nconsistency between the LM predictions for neighboring samples to identify\nmispredictions. Embroid then uses these neighborhoods to create additional\npredictions for each sample, and combines these predictions with a simple\nlatent variable graphical model in order to generate a final corrected\nprediction. In addition to providing a theoretical analysis of Embroid, we\nconduct a rigorous empirical evaluation across six different LMs and up to 95\ndifferent tasks. We find that (1) Embroid substantially improves performance\nover original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also\nrealizes improvements for more sophisticated prompting strategies (e.g.,\nchain-of-thought), and (3) can be specialized to domains like law through the\nembedding functions.\n","authors":["Neel Guha","Mayee F. 
Chen","Kush Bhatia","Azalia Mirhoseini","Frederic Sala","Christopher Ré"],"pdf_url":"https://arxiv.org/pdf/2307.11031v1.pdf","comment":"38 pages, 22 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.10811v1","updated":"2023-07-20T16:55:25Z","published":"2023-07-20T16:55:25Z","title":"\"It Felt Like Having a Second Mind\": Investigating Human-AI\n Co-creativity in Prewriting with Large Language Models","summary":" Prewriting is the process of discovering and developing ideas before a first\ndraft, which requires divergent thinking and often implies unstructured\nstrategies such as diagramming, outlining, free-writing, etc. Although large\nlanguage models (LLMs) have been demonstrated to be useful for a variety of\ntasks including creative writing, little is known about how users would\ncollaborate with LLMs to support prewriting. The preferred collaborative role\nand initiative of LLMs during such a creativity process is also unclear. To\ninvestigate human-LLM collaboration patterns and dynamics during prewriting, we\nconducted a three-session qualitative study with 15 participants in two\ncreative tasks: story writing and slogan writing. The findings indicated that\nduring collaborative prewriting, there appears to be a three-stage iterative\nHuman-AI Co-creativity process that includes Ideation, Illumination, and\nImplementation stages. This collaborative process champions the human in a\ndominant role, in addition to mixed and shifting levels of initiative that\nexist between humans and LLMs. This research also reports on collaboration\nbreakdowns that occur during this process, user perceptions of using existing\nLLMs during Human-AI Co-creativity, and discusses design implications to\nsupport this co-creativity process.\n","authors":["Qian Wan","Siying Hu","Yu Zhang","Piaohong Wang","Bo Wen","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10811v1.pdf","comment":"Under review at CSCW after a Major Revision"},{"id":"http://arxiv.org/abs/2307.11019v1","updated":"2023-07-20T16:46:10Z","published":"2023-07-20T16:46:10Z","title":"Investigating the Factual Knowledge Boundary of Large Language Models\n with Retrieval Augmentation","summary":" Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require\na substantial amount of factual knowledge and often rely on external\ninformation for assistance. Recently, large language models (LLMs) (e.g.,\nChatGPT), have demonstrated impressive prowess in solving a wide range of tasks\nwith world knowledge, including knowledge-intensive tasks. However, it remains\nunclear how well LLMs are able to perceive their factual knowledge boundaries,\nparticularly how they behave when incorporating retrieval augmentation. In this\nstudy, we present an initial analysis of the factual knowledge boundaries of\nLLMs and how retrieval augmentation affects LLMs on open-domain QA. Specially,\nwe focus on three primary research questions and analyze them by examining QA\nperformance, priori judgement and posteriori judgement of LLMs. We show\nevidence that LLMs possess unwavering confidence in their capabilities to\nrespond to questions and the accuracy of their responses. Furthermore,\nretrieval augmentation proves to be an effective approach in enhancing LLMs'\nawareness of knowledge boundaries, thereby improving their judgemental\nabilities. Additionally, we also find that LLMs have a propensity to rely on\nthe provided retrieval results when formulating answers, while the quality of\nthese results significantly impacts their reliance. 
The code to reproduce this\nwork is available at https://github.com/RUCAIBox/LLM-Knowledge-Boundary.\n","authors":["Ruiyang Ren","Yuhao Wang","Yingqi Qu","Wayne Xin Zhao","Jing Liu","Hao Tian","Hua Wu","Ji-Rong Wen","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11005v1","updated":"2023-07-20T16:34:40Z","published":"2023-07-20T16:34:40Z","title":"Integrating Pretrained ASR and LM to Perform Sequence Generation for\n Spoken Language Understanding","summary":" There has been an increased interest in the integration of pretrained speech\nrecognition (ASR) and language models (LM) into the SLU framework. However,\nprior methods often struggle with a vocabulary mismatch between pretrained\nmodels, and LM cannot be directly utilized as they diverge from its NLU\nformulation. In this study, we propose a three-pass end-to-end (E2E) SLU system\nthat effectively integrates ASR and LM subnetworks into the SLU formulation for\nsequence generation tasks. In the first pass, our architecture predicts ASR\ntranscripts using the ASR subnetwork. This is followed by the LM subnetwork,\nwhich makes an initial SLU prediction. Finally, in the third pass, the\ndeliberation subnetwork conditions on representations from the ASR and LM\nsubnetworks to make the final prediction. Our proposed three-pass SLU system\nshows improved performance over cascaded and E2E SLU models on two benchmark\nSLU datasets, SLURP and SLUE, especially on acoustically challenging\nutterances.\n","authors":["Siddhant Arora","Hayato Futami","Yosuke Kashiwagi","Emiru Tsunoo","Brian Yan","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2307.11005v1.pdf","comment":"Accepted at INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10982v1","updated":"2023-07-20T16:09:57Z","published":"2023-07-20T16:09:57Z","title":"MASR: Metadata Aware Speech Representation","summary":" In the recent years, speech representation learning is constructed primarily\nas a self-supervised learning (SSL) task, using the raw audio signal alone,\nwhile ignoring the side-information that is often available for a given speech\nrecording. In this paper, we propose MASR, a Metadata Aware Speech\nRepresentation learning framework, which addresses the aforementioned\nlimitations. MASR enables the inclusion of multiple external knowledge sources\nto enhance the utilization of meta-data information. The external knowledge\nsources are incorporated in the form of sample-level pair-wise similarity\nmatrices that are useful in a hard-mining loss. A key advantage of the MASR\nframework is that it can be combined with any choice of SSL method. Using MASR\nrepresentations, we perform evaluations on several downstream tasks such as\nlanguage identification, speech recognition and other non-semantic tasks such\nas speaker and emotion recognition. In these experiments, we illustrate\nsignificant performance improvements for the MASR over other established\nbenchmarks. We perform a detailed analysis on the language identification task\nto provide insights on how the proposed loss function enables the\nrepresentations to separate closely related languages.\n","authors":["Anjali Raj","Shikhar Bharadwaj","Sriram Ganapathy","Min Ma","Shikhar Vashishth"],"pdf_url":"https://arxiv.org/pdf/2307.10982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12619v2","updated":"2023-07-20T16:04:19Z","published":"2023-06-22T01:14:47Z","title":"Class-Incremental Learning based on Label Generation","summary":" Despite the great success of pre-trained language models, it is still a\nchallenge to use these models for continual learning, especially for the\nclass-incremental learning (CIL) setting due to catastrophic forgetting (CF).\nThis paper reports our finding that if we formulate CIL as a continual label\ngeneration problem, CF is drastically reduced and the generalizable\nrepresentations of pre-trained models can be better retained. We thus propose a\nnew CIL method (VAG) that also leverages the sparsity of vocabulary to focus\nthe generation and creates pseudo-replay samples by using label semantics.\nExperimental results show that VAG outperforms baselines by a large margin.\n","authors":["Yijia Shao","Yiduo Guo","Dongyan Zhao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.12619v2.pdf","comment":"12 pages, ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2306.14192v2","updated":"2023-07-20T15:20:51Z","published":"2023-06-25T10:16:49Z","title":"$α$-$β$-Factorization and the Binary Case of Simon's Congruence","summary":" In 1991 H\\'ebrard introduced a factorization of words that turned out to be a\npowerful tool for the investigation of a word's scattered factors (also known\nas (scattered) subwords or subsequences). 
Based on this, first Karandikar and\nSchnoebelen introduced the notion of $k$-richness and later on Barker et al.\nthe notion of $k$-universality. In 2022 Fleischmann et al. presented a\ngeneralization of the arch factorization by intersecting the arch factorization\nof a word and its reverse. While the authors merely used this factorization for\nthe investigation of shortest absent scattered factors, in this work we\ninvestigate this new $\\alpha$-$\\beta$-factorization as such. We characterize\nthe famous Simon congruence of $k$-universal words in terms of $1$-universal\nwords. Moreover, we apply these results to binary words. In this special case,\nwe obtain a full characterization of the classes and calculate the index of the\ncongruence. Lastly, we start investigating the ternary case, present a full\nlist of possibilities for $\\alpha\\beta\\alpha$-factors, and characterize their\ncongruence.\n","authors":["Pamela Fleischmann","Jonas Höfer","Annika Huch","Dirk Nowotka"],"pdf_url":"https://arxiv.org/pdf/2306.14192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10932v1","updated":"2023-07-20T15:02:42Z","published":"2023-07-20T15:02:42Z","title":"Identical and Fraternal Twins: Fine-Grained Semantic Contrastive\n Learning of Sentence Representations","summary":" The enhancement of unsupervised learning of sentence representations has been\nsignificantly achieved by the utility of contrastive learning. This approach\nclusters the augmented positive instance with the anchor instance to create a\ndesired embedding space. However, relying solely on the contrastive objective\ncan result in sub-optimal outcomes due to its inability to differentiate subtle\nsemantic variations between positive pairs. Specifically, common data\naugmentation techniques frequently introduce semantic distortion, leading to a\nsemantic margin between the positive pair. While the InfoNCE loss function\noverlooks the semantic margin and prioritizes similarity maximization between\npositive pairs during training, leading to the insensitive semantic\ncomprehension ability of the trained model. In this paper, we introduce a novel\nIdentical and Fraternal Twins of Contrastive Learning (named IFTCL) framework,\ncapable of simultaneously adapting to various positive pairs generated by\ndifferent augmentation techniques. We propose a \\textit{Twins Loss} to preserve\nthe innate margin during training and promote the potential of data enhancement\nin order to overcome the sub-optimal issue. We also present proof-of-concept\nexperiments combined with the contrastive objective to prove the validity of\nthe proposed Twins Loss. Furthermore, we propose a hippocampus queue mechanism\nto restore and reuse the negative instances without additional calculation,\nwhich further enhances the efficiency and performance of the IFCL. We verify\nthe IFCL framework on nine semantic textual similarity tasks with both English\nand Chinese datasets, and the experimental results show that IFCL outperforms\nstate-of-the-art methods.\n","authors":["Qingfa Xiao","Shuangyin Li","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10932v1.pdf","comment":"This article has been accepted for publication in European Conference\n on Artificial Intelligence (ECAI2023). 
9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10930v1","updated":"2023-07-20T14:59:02Z","published":"2023-07-20T14:59:02Z","title":"MediaGPT : A Large Language Model Target Chinese Media","summary":" The development of large language models (LLMs) has seen rapid progress in\nrecent years. One of the most widely used LLMs is the Generative Pre-trained\nTransformer (GPT) series, which has been applied in various fields, including\nthe media domain. However, in practical applications, the differences between\nthe media's use cases and the general-purpose applications of LLMs have become\nincreasingly apparent, especially for Chinese media. As a result, there is a growing need\nto develop LLMs that are specifically tailored to the unique requirements of the\nmedia domain. In this paper, we present MediaGPT, a large language model\ntrained on a variety of media data to address the practical needs of Chinese\nmedia. We have designed a diverse set of task instruction types to cater to the\nspecific requirements of the domain. To further validate the effectiveness of\nour proposed LLM, we have constructed unique datasets that are tailored to the\nmedia domain and have also developed verification methods that are specifically\ndesigned for generative-type tasks. By doing so, we aim to bridge the gap\nbetween general-purpose LLMs and the requirements of the media domain, and\nto pave the way for more effective and efficient use of LLMs in this field. This\npaper aims to explore the challenges and opportunities of developing LLMs for\nmedia applications and to propose potential solutions for addressing these\nchallenges.\n","authors":["Zhonghao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10928v1","updated":"2023-07-20T14:56:35Z","published":"2023-07-20T14:56:35Z","title":"FLASK: Fine-grained Language Model Evaluation based on Alignment Skill\n Sets","summary":" Evaluation of Large Language Models (LLMs) is challenging because aligning to\nhuman values requires the composition of multiple skills and the required set\nof skills varies depending on the instruction. Recent studies have evaluated\nthe performance of LLMs in two ways, (1) automatic evaluation on several\nindependent benchmarks and (2) human or machine-based evaluation giving an\noverall score to the response. However, both settings are coarse-grained\nevaluations, not considering the nature of user instructions that require\ninstance-wise skill composition, which limits the interpretation of the true\ncapabilities of LLMs. In this paper, we introduce FLASK (Fine-grained Language\nModel Evaluation based on Alignment SKill Sets), a fine-grained evaluation\nprotocol that can be used for both model-based and human-based evaluation which\ndecomposes coarse-level scoring to an instance-wise skill set-level.\nSpecifically, we define 12 fine-grained skills needed for LLMs to follow\nopen-ended user instructions and construct an evaluation set by allocating a\nset of skills for each instance. Additionally, by annotating the target domains\nand difficulty level for each instance, FLASK provides a holistic view with a\ncomprehensive analysis of a model's performance depending on skill, domain, and\ndifficulty. Through using FLASK, we compare multiple open-sourced and\nproprietary LLMs and observe highly-correlated findings between model-based and\nhuman-based evaluations. 
FLASK enables developers to more accurately measure\nthe model performance and how it can be improved by analyzing factors that make\nLLMs proficient in particular skills. For practitioners, FLASK can be used to\nrecommend suitable models for particular situations through comprehensive\ncomparison among various LLMs. We release the evaluation data and code\nimplementation at https://github.com/kaistAI/FLASK.\n","authors":["Seonghyeon Ye","Doyoung Kim","Sungdong Kim","Hyeonbin Hwang","Seungone Kim","Yongrae Jo","James Thorne","Juho Kim","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2307.10928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14030v2","updated":"2023-07-20T13:54:05Z","published":"2023-06-24T18:17:38Z","title":"My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models\n and Evaluation Benchmarks","summary":" The research on code-mixed data is limited due to the unavailability of\ndedicated code-mixed datasets and pre-trained language models. In this work, we\nfocus on the low-resource Indian language Marathi, which lacks any prior work in\ncode-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English\n(Mr-En) corpus with 10 million social media sentences for pretraining. We also\nrelease L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models\npre-trained on MeCorpus. Furthermore, for benchmarking, we present three\nsupervised datasets MeHate, MeSent, and MeLID for downstream tasks like\ncode-mixed Mr-En hate speech detection, sentiment analysis, and language\nidentification, respectively. These evaluation datasets individually consist of\nmanually annotated ~12,000 Marathi-English code-mixed tweets. Ablations\nshow that the models trained on this novel corpus significantly outperform the\nexisting state-of-the-art BERT models. This is the first work that presents\nartifacts for code-mixed Marathi research. All datasets and models are publicly\nreleased at https://github.com/l3cube-pune/MarathiNLP .\n","authors":["Tanmay Chavan","Omkar Gokhale","Aditya Kane","Shantanu Patankar","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2306.14030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15], leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF, a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. Our framework comprises 1) an automatic method for evaluating the\nquality of figure-caption pairs, 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. 
In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: a novel attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. More videos and updates can be found on the\nproject page \\url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \\url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2305.01146v3","updated":"2023-07-20T13:10:07Z","published":"2023-05-02T01:33:02Z","title":"RadAdapt: Radiology Report Summarization via Lightweight Domain\n Adaptation of Large Language Models","summary":" We systematically investigate lightweight strategies to adapt large language\nmodels (LLMs) for the task of radiology report summarization (RRS).\nSpecifically, we focus on domain adaptation via pretraining (on natural\nlanguage, biomedical text, or clinical text) and via discrete prompting or\nparameter-efficient fine-tuning. Our results consistently achieve best\nperformance by maximally adapting to the task via pretraining on clinical text\nand fine-tuning on RRS examples. Importantly, this method fine-tunes a mere\n0.32% of parameters throughout the model, in contrast to end-to-end fine-tuning\n(100% of parameters). Additionally, we study the effect of in-context examples\nand out-of-distribution (OOD) training before concluding with a radiologist\nreader study and qualitative analysis. 
Our findings highlight the importance of\ndomain adaptation in RRS and provide valuable insights toward developing\neffective natural language processing solutions for clinical tasks.\n","authors":["Dave Van Veen","Cara Van Uden","Maayane Attias","Anuj Pareek","Christian Bluethgen","Malgorzata Polacin","Wah Chiu","Jean-Benoit Delbrouck","Juan Manuel Zambrano Chaves","Curtis P. Langlotz","Akshay S. Chaudhari","John Pauly"],"pdf_url":"https://arxiv.org/pdf/2305.01146v3.pdf","comment":"12 pages, 10 figures. Published in ACL BioNLP. Compared to v1, v2\n includes minor edits and one additional figure in the appendix. Compared to\n v2, v3 includes a link to the project's GitHub repository"},{"id":"http://arxiv.org/abs/2307.10826v1","updated":"2023-07-20T12:41:35Z","published":"2023-07-20T12:41:35Z","title":"Yelp Reviews and Food Types: A Comparative Analysis of Ratings,\n Sentiments, and Topics","summary":" This study examines the relationship between Yelp reviews and food types,\ninvestigating how ratings, sentiments, and topics vary across different types\nof food. Specifically, we analyze how ratings and sentiments of reviews vary\nacross food types, cluster food types based on ratings and sentiments, infer\nreview topics using machine learning models, and compare topic distributions\namong different food types. Our analyses reveal that some food types have\nsimilar ratings, sentiments, and topics distributions, while others have\ndistinct patterns. We identify four clusters of food types based on ratings and\nsentiments and find that reviewers tend to focus on different topics when\nreviewing certain food types. These findings have important implications for\nunderstanding user behavior and cultural influence on digital media platforms\nand promoting cross-cultural understanding and appreciation.\n","authors":["Wenyu Liao","Yiqing Shi","Yujia Hu","Wei Quan"],"pdf_url":"https://arxiv.org/pdf/2307.10826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10814v1","updated":"2023-07-20T12:24:23Z","published":"2023-07-20T12:24:23Z","title":"Cross-Corpus Multilingual Speech Emotion Recognition: Amharic vs. Other\n Languages","summary":" In a conventional Speech emotion recognition (SER) task, a classifier for a\ngiven language is trained on a pre-existing dataset for that same language.\nHowever, where training data for a language does not exist, data from other\nlanguages can be used instead. We experiment with cross-lingual and\nmultilingual SER, working with Amharic, English, German and URDU. For Amharic,\nwe use our own publicly-available Amharic Speech Emotion Dataset (ASED). For\nEnglish, German and Urdu we use the existing RAVDESS, EMO-DB and URDU datasets.\nWe followed previous research in mapping labels for all datasets to just two\nclasses, positive and negative. Thus we can compare performance on different\nlanguages directly, and combine languages for training and testing. In\nExperiment 1, monolingual SER trials were carried out using three classifiers,\nAlexNet, VGGE (a proposed variant of VGG), and ResNet50. Results averaged for\nthe three models were very similar for ASED and RAVDESS, suggesting that\nAmharic and English SER are equally difficult. Similarly, German SER is more\ndifficult, and Urdu SER is easier. In Experiment 2, we trained on one language\nand tested on another, in both directions for each pair: Amharic<->German,\nAmharic<->English, and Amharic<->Urdu. Results with Amharic as target suggested\nthat using English or German as source will give the best result. 
In Experiment\n3, we trained on several non-Amharic languages and then tested on Amharic. The\nbest accuracy obtained was several percent greater than the best accuracy in\nExperiment 2, suggesting that a better result can be obtained when using two or\nthree non-Amharic languages for training than when using just one non-Amharic\nlanguage. Overall, the results suggest that cross-lingual and multilingual\ntraining can be an effective strategy for training a SER classifier when\nresources for a language are scarce.\n","authors":["Ephrem Afele Retta","Richard Sutcliffe","Jabar Mahmood","Michael Abebe Berwo","Eiad Almekhlafi","Sajjad Ahmed Khan","Shehzad Ashraf Chaudhry","Mustafa Mhamed","Jun Feng"],"pdf_url":"https://arxiv.org/pdf/2307.10814v1.pdf","comment":"16 pages, 9 tables, 5 figures"},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2307.10799v1","updated":"2023-07-20T12:01:40Z","published":"2023-07-20T12:01:40Z","title":"Layer-wise Representation Fusion for Compositional Generalization","summary":" Despite successes across a broad range of applications, sequence-to-sequence\nmodels' construct of solutions are argued to be less compositional than\nhuman-like generalization. There is mounting evidence that one of the reasons\nhindering compositional generalization is representations of the encoder and\ndecoder uppermost layer are entangled. In other words, the syntactic and\nsemantic representations of sequences are twisted inappropriately. 
However,\nmost previous studies mainly concentrate on enhancing token-level semantic\ninformation to alleviate the representations entanglement problem, rather than\ncomposing and using the syntactic and semantic representations of sequences\nappropriately as humans do. In addition, we explain why the entanglement\nproblem exists from the perspective of recent studies about training deeper\nTransformer, mainly owing to the ``shallow'' residual connections and its\nsimple, one-step operations, which fails to fuse previous layers' information\neffectively. Starting from this finding and inspired by humans' strategies, we\npropose \\textsc{FuSion} (\\textbf{Fu}sing \\textbf{S}yntactic and\nSemant\\textbf{i}c Representati\\textbf{on}s), an extension to\nsequence-to-sequence models to learn to fuse previous layers' information back\ninto the encoding and decoding process appropriately through introducing a\n\\emph{fuse-attention module} at each encoder and decoder layer. \\textsc{FuSion}\nachieves competitive and even \\textbf{state-of-the-art} results on two\nrealistic benchmarks, which empirically demonstrates the effectiveness of our\nproposal.\n","authors":["Yafang Zheng","Lei Lin","Zhaohong Lai","Binling Wang","Shan Liu","Biao Fu","Wenhao Rao","Peigen Ye","Yidong Chen","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2307.10799v1.pdf","comment":"work in progress. arXiv admin note: substantial text overlap with\n arXiv:2305.12169"},{"id":"http://arxiv.org/abs/2210.11835v2","updated":"2023-07-20T11:56:40Z","published":"2022-10-21T09:28:54Z","title":"A Textless Metric for Speech-to-Speech Comparison","summary":" In this paper, we introduce a new and simple method for comparing speech\nutterances without relying on text transcripts. Our speech-to-speech comparison\nmetric utilizes state-of-the-art speech2unit encoders like HuBERT to convert\nspeech utterances into discrete acoustic units. We then propose a simple and\neasily replicable neural architecture that learns a speech-based metric that\nclosely corresponds to its text-based counterpart. This textless metric has\nnumerous potential applications, including evaluating speech-to-speech\ntranslation for oral languages, languages without dependable ASR systems, or to\navoid the need for ASR transcription altogether. This paper also shows that for\nspeech-to-speech translation evaluation, ASR-BLEU (which consists in\nautomatically transcribing both speech hypothesis and reference and compute\nsentence-level BLEU between transcripts) is a poor proxy to real text-BLEU even\nwhen ASR system is strong.\n","authors":["Laurent Besacier","Swen Ribeiro","Olivier Galibert","Ioan Calapodescu"],"pdf_url":"https://arxiv.org/pdf/2210.11835v2.pdf","comment":"link to supplementary material:\n https://github.com/besacier/textless-metric"},{"id":"http://arxiv.org/abs/2307.10778v1","updated":"2023-07-20T11:29:15Z","published":"2023-07-20T11:29:15Z","title":"Extreme Multi-Label Skill Extraction Training using Large Language\n Models","summary":" Online job ads serve as a valuable source of information for skill\nrequirements, playing a crucial role in labor market analysis and e-recruitment\nprocesses. Since such ads are typically formatted in free text, natural\nlanguage processing (NLP) technologies are required to automatically process\nthem. 
We specifically focus on the task of detecting skills (mentioned\nliterally, or implicitly described) and linking them to a large skill ontology,\nmaking it a challenging case of extreme multi-label classification (XMLC).\nGiven that no sizable labeled (training) dataset is available for\nthis specific XMLC task, we propose techniques to leverage general Large\nLanguage Models (LLMs). We describe a cost-effective approach to generate an\naccurate, fully synthetic labeled dataset for skill extraction, and present a\ncontrastive learning strategy that proves effective in the task. Our results\nacross three skill extraction benchmarks show a consistent increase of between\n15 and 25 percentage points in \\textit{R-Precision@5} compared to previously\npublished results that relied solely on distant supervision through literal\nmatches.\n","authors":["Jens-Joris Decorte","Severine Verlinden","Jeroen Van Hautte","Johannes Deleu","Chris Develder","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2307.10778v1.pdf","comment":"Accepted to the International workshop on AI for Human Resources and\n Public Employment Services (AI4HR&PES) as part of ECML-PKDD 2023"},{"id":"http://arxiv.org/abs/2305.15299v2","updated":"2023-07-20T10:43:57Z","published":"2023-05-24T16:23:46Z","title":"Science in the Era of ChatGPT, Large Language Models and Generative AI:\n Challenges for Research Ethics and How to Respond","summary":" Large language models of artificial intelligence (AI), such as ChatGPT, find\nremarkable but controversial applicability in science and research. This paper\nreviews epistemological challenges, ethical and integrity risks in the\nconduct of science with the advent of generative AI. The aim is to lay new, timely\nfoundations for a high-quality research ethics review. The role of AI language\nmodels as a research instrument and subject is scrutinized along with ethical\nimplications for scientists, participants and reviewers. New emerging practices\nfor research ethics review are discussed, concluding with ten recommendations\nthat shape a response for more responsible research conduct in the era of AI.\n","authors":["Evangelos Pournaras"],"pdf_url":"https://arxiv.org/pdf/2305.15299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10757v1","updated":"2023-07-20T10:42:16Z","published":"2023-07-20T10:42:16Z","title":"Vesper: A Compact and Effective Pretrained Model for Speech Emotion\n Recognition","summary":" This paper presents a paradigm that adapts general large-scale pretrained\nmodels (PTMs) to the speech emotion recognition task. Although PTMs shed new light\non artificial general intelligence, they are constructed with general tasks in\nmind, and thus, their efficacy for specific tasks can be further improved.\nAdditionally, employing PTMs in practical applications can be challenging due\nto their considerable size. The above limitations spawn another research direction,\nnamely, optimizing large-scale PTMs for specific tasks to generate\ntask-specific PTMs that are both compact and effective. In this paper, we focus\non the speech emotion recognition task and propose an improved emotion-specific\npretrained encoder called Vesper. Vesper is pretrained on a speech dataset\nbased on WavLM and takes into account emotional characteristics. To enhance\nsensitivity to emotional information, Vesper employs an emotion-guided masking\nstrategy to identify the regions that need masking. 
Subsequently, Vesper\nemploys hierarchical and cross-layer self-supervision to improve its ability to\ncapture acoustic and semantic representations, both of which are crucial for\nemotion recognition. Experimental results on the IEMOCAP, MELD, and CREMA-D\ndatasets demonstrate that Vesper with 4 layers outperforms WavLM Base with 12\nlayers, and the performance of Vesper with 12 layers surpasses that of WavLM\nLarge with 24 layers.\n","authors":["Weidong Chen","Xiaofen Xing","Peihao Chen","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2307.10757v1.pdf","comment":"13 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.10751v1","updated":"2023-07-20T10:26:57Z","published":"2023-07-20T10:26:57Z","title":"Exploring Perspectives on the Impact of Artificial Intelligence on the\n Creativity of Knowledge Work: Beyond Mechanised Plagiarism and Stochastic\n Parrots","summary":" Artificial Intelligence (AI), and in particular generative models, are\ntransformative tools for knowledge work. They problematise notions of\ncreativity, originality, plagiarism, the attribution of credit, and copyright\nownership. Critics of generative models emphasise the reliance on large amounts\nof training data, and view the output of these models as no more than\nrandomised plagiarism, remix, or collage of the source data. On these grounds,\nmany have argued for stronger regulations on the deployment, use, and\nattribution of the output of these models. However, these issues are not new or\nunique to artificial intelligence. In this position paper, using examples from\nliterary criticism, the history of art, and copyright law, I show how\ncreativity and originality resist definition as a notatable or\ninformation-theoretic property of an object, and instead can be seen as the\nproperty of a process, an author, or a viewer. Further alternative views hold\nthat all creative work is essentially reuse (mostly without attribution), or\nthat randomness itself can be creative. I suggest that creativity is ultimately\ndefined by communities of creators and receivers, and the deemed sources of\ncreativity in a workflow often depend on which parts of the workflow can be\nautomated. Using examples from recent studies of AI in creative knowledge work,\nI suggest that AI shifts knowledge work from material production to critical\nintegration. This position paper aims to begin a conversation around a more\nnuanced approach to the problems of creativity and credit assignment for\ngenerative models, one which more fully recognises the importance of the\ncreative and curatorial voice of the users of these models and moves away from\nsimpler notational or information-theoretic views.\n","authors":["Advait Sarkar"],"pdf_url":"https://arxiv.org/pdf/2307.10751v1.pdf","comment":"Advait Sarkar. 2023. Exploring Perspectives on the Impact of\n Artificial Intelligence on the Creativity of Knowledge Work Beyond Mechanised\n Plagiarism and Stochastic Parrots. In Annual Symposium on Human-Computer\n Interaction for Work 2023 (CHIWORK 2023), June 13-16, 2023, Oldenburg,\n Germany. ACM, New York, NY, USA, 17 pages"},{"id":"http://arxiv.org/abs/2301.11596v4","updated":"2023-07-20T08:58:12Z","published":"2023-01-27T08:45:53Z","title":"ThoughtSource: A central hub for large language model reasoning data","summary":" Large language models (LLMs) such as GPT-4 have recently demonstrated\nimpressive results across a wide range of tasks. 
LLMs are still limited,\nhowever, in that they frequently fail at complex reasoning, their reasoning\nprocesses are opaque, they are prone to 'hallucinate' facts, and there are\nconcerns about their underlying biases. Letting models verbalize reasoning\nsteps as natural language, a technique known as chain-of-thought prompting, has\nrecently been proposed as a way to address some of these issues. Here we\npresent ThoughtSource, a meta-dataset and software library for chain-of-thought\n(CoT) reasoning. The goal of ThoughtSource is to improve future artificial\nintelligence systems by facilitating qualitative understanding of CoTs,\nenabling empirical evaluations, and providing training data. This first release\nof ThoughtSource integrates six scientific/medical, three general-domain and\nfive math word question answering datasets.\n","authors":["Simon Ott","Konstantin Hebenstreit","Valentin Liévin","Christoffer Egeberg Hother","Milad Moradi","Maximilian Mayrhauser","Robert Praas","Ole Winther","Matthias Samwald"],"pdf_url":"https://arxiv.org/pdf/2301.11596v4.pdf","comment":"Revision: added datasets, formatting"},{"id":"http://arxiv.org/abs/2011.00696v2","updated":"2023-07-20T08:56:26Z","published":"2020-11-02T03:07:38Z","title":"ABNIRML: Analyzing the Behavior of Neural IR Models","summary":" Pretrained contextualized language models such as BERT and T5 have\nestablished a new state-of-the-art for ad-hoc search. However, it is not yet\nwell-understood why these methods are so effective, what makes some variants\nmore effective than others, and what pitfalls they may have. We present a new\ncomprehensive framework for Analyzing the Behavior of Neural IR ModeLs\n(ABNIRML), which includes new types of diagnostic probes that allow us to test\nseveral characteristics -- such as writing styles, factuality, sensitivity to\nparaphrasing and word order -- that are not addressed by previous techniques.\nTo demonstrate the value of the framework, we conduct an extensive empirical\nstudy that yields insights into the factors that contribute to the neural\nmodel's gains, and identify potential unintended biases the models exhibit.\nSome of our results confirm conventional wisdom, like that recent neural\nranking models rely less on exact term overlap with the query, and instead\nleverage richer linguistic information, evidenced by their higher sensitivity\nto word and sentence order. Other results are more surprising, such as that\nsome models (e.g., T5 and ColBERT) are biased towards factually correct (rather\nthan simply relevant) texts. Further, some characteristics vary even for the\nsame base language model, and other characteristics can appear due to random\nvariations during model training.\n","authors":["Sean MacAvaney","Sergey Feldman","Nazli Goharian","Doug Downey","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2011.00696v2.pdf","comment":"TACL version"},{"id":"http://arxiv.org/abs/2306.06427v2","updated":"2023-07-20T08:47:14Z","published":"2023-06-10T12:42:36Z","title":"Boosting Language Models Reasoning with Chain-of-Knowledge Prompting","summary":" Recently, Chain-of-Thought (CoT) prompting has delivered success on complex\nreasoning tasks, which aims at designing a simple prompt like ``Let's think\nstep by step'' or multiple in-context exemplars with well-designed rationales\nto elicit Large Language Models (LLMs) to generate intermediate reasoning\nsteps. However, the generated rationales often come with mistakes, making\nunfactual and unfaithful reasoning chains. 
To mitigate this brittleness, we\npropose a novel Chain-of-Knowledge (CoK) prompting, where we aim at eliciting\nLLMs to generate explicit pieces of knowledge evidence in the form of structure\ntriple. This is inspired by our human behaviors, i.e., we can draw a mind map\nor knowledge map as the reasoning evidence in the brain before answering a\ncomplex question. Benefiting from CoK, we additionally introduce a\nF^2-Verification method to estimate the reliability of the reasoning chains in\nterms of factuality and faithfulness. For the unreliable response, the wrong\nevidence can be indicated to prompt the LLM to rethink. Extensive experiments\ndemonstrate that our method can further improve the performance of commonsense,\nfactual, symbolic, and arithmetic reasoning tasks.\n","authors":["Jianing Wang","Qiushi Sun","Nuo Chen","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2306.06427v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2307.10700v1","updated":"2023-07-20T08:45:00Z","published":"2023-07-20T08:45:00Z","title":"Large language models shape and are shaped by society: A survey of arXiv\n publication patterns","summary":" There has been a steep recent increase in the number of large language model\n(LLM) papers, producing a dramatic shift in the scientific landscape which\nremains largely undocumented through bibliometric analysis. Here, we analyze\n388K papers posted on the CS and Stat arXivs, focusing on changes in\npublication patterns in 2023 vs. 2018-2022. We analyze how the proportion of\nLLM papers is increasing; the LLM-related topics receiving the most attention;\nthe authors writing LLM papers; how authors' research topics correlate with\ntheir backgrounds; the factors distinguishing highly cited LLM papers; and the\npatterns of international collaboration. We show that LLM research increasingly\nfocuses on societal impacts: there has been an 18x increase in the proportion\nof LLM-related papers on the Computers and Society sub-arXiv, and authors newly\npublishing on LLMs are more likely to focus on applications and societal\nimpacts than more experienced authors. LLM research is also shaped by social\ndynamics: we document gender and academic/industry disparities in the topics\nLLM authors focus on, and a US/China schism in the collaboration network.\nOverall, our analysis documents the profound ways in which LLM research both\nshapes and is shaped by society, attesting to the necessity of sociotechnical\nlenses.\n","authors":["Rajiv Movva","Sidhika Balachandar","Kenny Peng","Gabriel Agostini","Nikhil Garg","Emma Pierson"],"pdf_url":"https://arxiv.org/pdf/2307.10700v1.pdf","comment":"Working paper"},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. 
Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.10666v1","updated":"2023-07-20T07:47:08Z","published":"2023-07-20T07:47:08Z","title":"A Dataset and Strong Baselines for Classification of Czech News Texts","summary":" Pre-trained models for Czech Natural Language Processing are often evaluated\non purely linguistic tasks (POS tagging, parsing, NER) and relatively simple\nclassification tasks such as sentiment classification or article classification\nfrom a single news source. As an alternative, we present\nCZEch~NEws~Classification~dataset (CZE-NEC), one of the largest Czech\nclassification datasets, composed of news articles from various sources\nspanning over twenty years, which allows a more rigorous evaluation of such\nmodels. We define four classification tasks: news source, news category,\ninferred author's gender, and day of the week. To verify the task difficulty,\nwe conducted a human evaluation, which revealed that human performance lags\nbehind strong machine-learning baselines built upon pre-trained transformer\nmodels. Furthermore, we show that language-specific pre-trained encoder\nanalysis outperforms selected commercially available large-scale generative\nlanguage models.\n","authors":["Hynek Kydlíček","Jindřich Libovický"],"pdf_url":"https://arxiv.org/pdf/2307.10666v1.pdf","comment":"12 pages, Accepted to Text, Speech and Dialogue (TSD) 2023"},{"id":"http://arxiv.org/abs/2307.10652v1","updated":"2023-07-20T07:33:30Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. Given the increasing amount of\nresearch work in this area, several NLP-related approaches have been surveyed\nin the research community. However, a comprehensive study that categorizes\nestablished topics, identifies trends, and outlines areas for future research\nremains absent to this day. Contributing to closing this gap, we have\nsystematically classified and analyzed research papers included in the ACL\nAnthology. 
As a result, we present a structured overview of the research\nlandscape, provide a taxonomy of fields-of-study in NLP, analyze recent\ndevelopments in NLP, summarize our findings, and highlight directions for\nfuture work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v1.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2307.10635v1","updated":"2023-07-20T07:01:57Z","published":"2023-07-20T07:01:57Z","title":"SciBench: Evaluating College-Level Scientific Problem-Solving Abilities\n of Large Language Models","summary":" Recent advances in large language models (LLMs) have demonstrated notable\nprogress on many mathematical benchmarks. However, most of these benchmarks\nonly feature problems grounded in junior and senior high school subjects,\ncontain only multiple-choice questions, and are confined to a limited scope of\nelementary arithmetic operations. To address these issues, this paper\nintroduces an expansive benchmark suite SciBench that aims to systematically\nexamine the reasoning capabilities required for complex scientific problem\nsolving. SciBench contains two carefully curated datasets: an open set\nfeaturing a range of collegiate-level scientific problems drawn from\nmathematics, chemistry, and physics textbooks, and a closed set comprising\nproblems from undergraduate-level exams in computer science and mathematics.\nBased on the two datasets, we conduct an in-depth benchmark study of two\nrepresentative LLMs with various prompting strategies. The results reveal that\ncurrent LLMs fall short of delivering satisfactory performance, with an overall\nscore of merely 35.80%. Furthermore, through a detailed user study, we\ncategorize the errors made by LLMs into ten problem-solving abilities. Our\nanalysis indicates that no single prompting strategy significantly outperforms\nothers and some strategies that demonstrate improvements in certain\nproblem-solving skills result in declines in other skills. We envision that\nSciBench will catalyze further developments in the reasoning abilities of LLMs,\nthereby ultimately contributing to scientific research and discovery.\n","authors":["Xiaoxuan Wang","Ziniu Hu","Pan Lu","Yanqiao Zhu","Jieyu Zhang","Satyen Subramaniam","Arjun R. Loomba","Shichang Zhang","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10635v1.pdf","comment":"Work in progress, 18 pages"},{"id":"http://arxiv.org/abs/2307.10634v1","updated":"2023-07-20T06:59:02Z","published":"2023-07-20T06:59:02Z","title":"Generative Language Models on Nucleotide Sequences of Human Genes","summary":" Language models, primarily transformer-based ones, obtained colossal success\nin NLP. To be more precise, studies like BERT in NLU and works such as GPT-3\nfor NLG are very crucial. DNA sequences are very close to natural language in\nterms of structure, so if the DNA-related bioinformatics domain is concerned,\ndiscriminative models, like DNABert, exist. Yet, the generative side of the\ncoin is mainly unexplored to the best of our knowledge. Consequently, we\nfocused on developing an autoregressive generative language model like GPT-3\nfor DNA sequences. 
Because working with whole DNA sequences is challenging\nwithout substantial computational resources, we decided to carry out our study\non a smaller scale, focusing on nucleotide sequences of human genes, unique\nparts in DNA with specific functionalities, instead of the whole DNA. This\ndecision did not change the problem structure a lot due to the fact that both\nDNA and genes can be seen as 1D sequences consisting of four different\nnucleotides without losing much information and making too much simplification.\nFirst of all, we systematically examined an almost entirely unexplored problem\nand observed that RNNs performed the best while simple techniques like N-grams\nwere also promising. Another beneficial point was learning how to work with\ngenerative models on languages we do not understand, unlike natural language.\nHow essential using real-life tasks beyond the classical metrics such as\nperplexity is observed. Furthermore, checking whether the data-hungry nature of\nthese models can be changed through selecting a language with minimal\nvocabulary size, four owing to four different types of nucleotides, is\nexamined. The reason for reviewing this was that choosing such a language might\nmake the problem easier. However, what we observed in this study was it did not\nprovide that much of a change in the amount of data needed.\n","authors":["Musa Nuri Ihtiyar","Arzucan Ozgur"],"pdf_url":"https://arxiv.org/pdf/2307.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10633v1","updated":"2023-07-20T06:58:55Z","published":"2023-07-20T06:58:55Z","title":"Multi-Method Self-Training: Improving Code Generation With Text, And\n Vice Versa","summary":" Large Language Models have many methods for solving the same problem. This\nintroduces novel strengths (different methods may work well for different\nproblems) and weaknesses (it may be difficult for users to know which method to\nuse). In this paper, we introduce Multi-Method Self-Training (MMST), where one\nmethod is trained on the filtered outputs of another, allowing us to augment\nthe strengths and ameliorate the weaknesses of each method. Using a 176B\nparameter model trained on both language and code, we show that MMST can 1)\nimprove the less performant method (up to 30%) making the model easier to use,\n2) improve the more performant method (up to 32.2%) making the model more\nperformant, and 3) improve the performance of related but distinct tasks (up to\n10.3%) by improving the ability of the model to generate rationales. We then\nconduct ablation analyses to explore why MMST works. We show that MMST\ngenerates more data than traditional self-training, but the improvement in\nperformance is driven by the use of multiple methods. We also analyze\nprompt-engineering and anti-correlated performance between methods as means of\nmaking MMST more effective. We hope the evidence from our paper motivates\nmachine learning researchers to explore ways in which advances in language\nmodels allow for new forms of training.\n","authors":["Shriyash K. Upadhyay","Etan J. 
Ginsberg"],"pdf_url":"https://arxiv.org/pdf/2307.10633v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.10587v1","updated":"2023-07-20T05:03:00Z","published":"2023-07-20T05:03:00Z","title":"A Deep Dive into the Disparity of Word Error Rates Across Thousands of\n NPTEL MOOC Videos","summary":" Automatic speech recognition (ASR) systems are designed to transcribe spoken\nlanguage into written text and find utility in a variety of applications\nincluding voice assistants and transcription services. However, it has been\nobserved that state-of-the-art ASR systems which deliver impressive benchmark\nresults, struggle with speakers of certain regions or demographics due to\nvariation in their speech properties. In this work, we describe the curation of\na massive speech dataset of 8740 hours consisting of $\\sim9.8$K technical\nlectures in the English language along with their transcripts delivered by\ninstructors representing various parts of Indian demography. The dataset is\nsourced from the very popular NPTEL MOOC platform. We use the curated dataset\nto measure the existing disparity in YouTube Automatic Captions and OpenAI\nWhisper model performance across the diverse demographic traits of speakers in\nIndia. While there exists disparity due to gender, native region, age and\nspeech rate of speakers, disparity based on caste is non-existent. We also\nobserve statistically significant disparity across the disciplines of the\nlectures. These results indicate the need of more inclusive and robust ASR\nsystems and more representational datasets for disparity evaluation in them.\n","authors":["Anand Kumar Rai","Siddharth D Jaiswal","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2307.10587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10558v1","updated":"2023-07-20T03:54:24Z","published":"2023-07-20T03:54:24Z","title":"Instruction-following Evaluation through Verbalizer Manipulation","summary":" While instruction-tuned models have shown remarkable success in various\nnatural language processing tasks, accurately evaluating their ability to\nfollow instructions remains challenging. Existing benchmarks primarily focus on\ncommon instructions that align well with what the model learned during\ntraining. However, proficiency in responding to these instructions does not\nnecessarily imply strong ability in instruction following. In this paper, we\npropose a novel instruction-following evaluation protocol called verbalizer\nmanipulation. It instructs the model to verbalize the task label with words\naligning with model priors to different extents, adopting verbalizers from\nhighly aligned (e.g., outputting ``postive'' for positive sentiment), to\nminimally aligned (e.g., outputting ``negative'' for positive sentiment).\nVerbalizer manipulation can be seamlessly integrated with any classification\nbenchmark to examine the model's reliance on priors and its ability to override\nthem to accurately follow the instructions. We conduct a comprehensive\nevaluation of four major model families across nine datasets, employing twelve\nsets of verbalizers for each of them. We observe that the instruction-following\nabilities of models, across different families and scales, are significantly\ndistinguished by their performance on less natural verbalizers. 
Even the\nstrongest GPT-4 model struggles to perform better than random guessing on the\nmost challenging verbalizer, emphasizing the need for continued advancements to\nimprove their instruction-following abilities.\n","authors":["Shiyang Li","Jun Yan","Hai Wang","Zheng Tang","Xiang Ren","Vijay Srinivasan","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2307.10558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14795v2","updated":"2023-07-20T03:39:19Z","published":"2023-06-26T15:53:02Z","title":"MotionGPT: Human Motion as a Foreign Language","summary":" Though the advancement of pre-trained large language models unfolds, the\nexploration of building a unified model for language and other multi-modal\ndata, such as motion, remains challenging and untouched so far. Fortunately,\nhuman motion displays a semantic coupling akin to human language, often\nperceived as a form of body language. By fusing language data with large-scale\nmotion models, motion-language pre-training that can enhance the performance of\nmotion-related tasks becomes feasible. Driven by this insight, we propose\nMotionGPT, a unified, versatile, and user-friendly motion-language model to\nhandle multiple motion-relevant tasks. Specifically, we employ the discrete\nvector quantization for human motion and transfer 3D motion into motion tokens,\nsimilar to the generation process of word tokens. Building upon this \"motion\nvocabulary\", we perform language modeling on both motion and text in a unified\nmanner, treating human motion as a specific language. Moreover, inspired by\nprompt learning, we pre-train MotionGPT with a mixture of motion-language data\nand fine-tune it on prompt-based question-and-answer tasks. Extensive\nexperiments demonstrate that MotionGPT achieves state-of-the-art performances\non multiple motion tasks including text-driven motion generation, motion\ncaptioning, motion prediction, and motion in-between.\n","authors":["Biao Jiang","Xin Chen","Wen Liu","Jingyi Yu","Gang Yu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14795v2.pdf","comment":"Project Page: https://github.com/OpenMotionLab/MotionGPT"},{"id":"http://arxiv.org/abs/2307.10549v1","updated":"2023-07-20T03:26:57Z","published":"2023-07-20T03:26:57Z","title":"Dynamic Large Language Models on Blockchains","summary":" Training and deploying the large language models requires a large mount of\ncomputational resource because the language models contain billions of\nparameters and the text has thousands of tokens. Another problem is that the\nlarge language models are static. They are fixed after the training process. To\ntackle these issues, in this paper, we propose to train and deploy the dynamic\nlarge language model on blockchains, which have high computation performance\nand are distributed across a network of computers. A blockchain is a secure,\ndecentralized, and transparent system that allows for the creation of a\ntamper-proof ledger for transactions without the need for intermediaries. The\ndynamic large language models can continuously learn from the user input after\nthe training process. 
Our method provides a new way to develop the large\nlanguage models and also sheds a light on the next generation artificial\nintelligence systems.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2307.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00470v4","updated":"2023-07-20T03:03:25Z","published":"2023-07-02T04:32:41Z","title":"PatternGPT :A Pattern-Driven Framework for Large Language Model Text\n Generation","summary":" Large language models(LLMS)have shown excellent text generation capabilities,\ncapable of generating fluent human-like responses for many downstream tasks.\nHowever, applying large language models to real-world critical tasks remains\nchallenging due to their susceptibility to hallucinations and inability to\ndirectly use external knowledge. To cope with the above challenges, this paper\nproposes PatternGPT, a pattern-driven text generation framework for Large\nLanguage Models. Firstly, the framework utilizes the extraction capability of\nLarge Language Models to generate rich and diversified structured and\nformalized patterns, which facilitates the introduction of external knowledge\nto do the computation, and then draws on the idea of federated learning to use\nmultiple agents to achieve the sharing in order to obtain more diversified\npatterns, and finally uses judgment criteria and optimization algorithm to\nsearch for high-quality patterns to guide the generation of models. Finally,\nexternal knowledge such as judgment criteria and optimization algorithms are\nused to search for high-quality patterns, and the searched patterns are used to\nguide model generation. This framework has the advantages of generating\ndiversified patterns, protecting data privacy, combining external knowledge,\nand improving the quality of generation, which provides an effective method to\noptimize the text generation capability of large language models, and make it\nbetter applied to the field of intelligent dialogue and content generation.\n","authors":["Le Xiao","Xin Shan"],"pdf_url":"https://arxiv.org/pdf/2307.00470v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10168v2","updated":"2023-07-20T02:29:25Z","published":"2023-07-19T17:54:43Z","title":"LLMs as Workers in Human-Computational Algorithms? Replicating\n Crowdsourcing Pipelines with LLMs","summary":" LLMs have shown promise in replicating human-like behavior in crowdsourcing\ntasks that were previously thought to be exclusive to human abilities. However,\ncurrent efforts focus mainly on simple atomic tasks. We explore whether LLMs\ncan replicate more complex crowdsourcing pipelines. We find that modern LLMs\ncan simulate some of crowdworkers' abilities in these \"human computation\nalgorithms,\" but the level of success is variable and influenced by requesters'\nunderstanding of LLM capabilities, the specific skills required for sub-tasks,\nand the optimal interaction modality for performing these sub-tasks. We reflect\non human and LLMs' different sensitivities to instructions, stress the\nimportance of enabling human-facing safeguards for LLMs, and discuss the\npotential of training humans and LLMs with complementary skill sets. 
Crucially,\nwe show that replicating crowdsourcing pipelines offers a valuable platform to\ninvestigate (1) the relative strengths of LLMs on different tasks (by\ncross-comparing their performances on sub-tasks) and (2) LLMs' potential in\ncomplex tasks, where they can complete part of the tasks while leaving others\nto humans.\n","authors":["Tongshuang Wu","Haiyi Zhu","Maya Albayrak","Alexis Axon","Amanda Bertsch","Wenxing Deng","Ziqi Ding","Bill Guo","Sireesh Gururaja","Tzu-Sheng Kuo","Jenny T. Liang","Ryan Liu","Ihita Mandal","Jeremiah Milbauer","Xiaolin Ni","Namrata Padmanabhan","Subhashini Ramkumar","Alexis Sudjianto","Jordan Taylor","Ying-Jui Tseng","Patricia Vaidos","Zhijin Wu","Wei Wu","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11296v2","updated":"2023-07-20T02:20:35Z","published":"2023-06-20T05:20:29Z","title":"ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF\n Synthesis","summary":" We use prompt engineering to guide ChatGPT in the automation of text mining\nof metal-organic frameworks (MOFs) synthesis conditions from diverse formats\nand styles of the scientific literature. This effectively mitigates ChatGPT's\ntendency to hallucinate information -- an issue that previously made the use of\nLarge Language Models (LLMs) in scientific fields challenging. Our approach\ninvolves the development of a workflow implementing three different processes\nfor text mining, programmed by ChatGPT itself. All of them enable parsing,\nsearching, filtering, classification, summarization, and data unification with\ndifferent tradeoffs between labor, speed, and accuracy. We deploy this system\nto extract 26,257 distinct synthesis parameters pertaining to approximately 800\nMOFs sourced from peer-reviewed research articles. This process incorporates\nour ChemPrompt Engineering strategy to instruct ChatGPT in text mining,\nresulting in impressive precision, recall, and F1 scores of 90-99%.\nFurthermore, with the dataset built by text mining, we constructed a\nmachine-learning model with over 86% accuracy in predicting MOF experimental\ncrystallization outcomes and preliminarily identifying important factors in MOF\ncrystallization. We also developed a reliable data-grounded MOF chatbot to\nanswer questions on chemical reactions and synthesis procedures. Given that the\nprocess of using ChatGPT reliably mines and tabulates diverse MOF synthesis\ninformation in a unified format, while using only narrative language requiring\nno coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be\nvery useful across various other chemistry sub-disciplines.\n","authors":["Zhiling Zheng","Oufan Zhang","Christian Borgs","Jennifer T. Chayes","Omar M. Yaghi"],"pdf_url":"https://arxiv.org/pdf/2306.11296v2.pdf","comment":"Published on Journal of the American Chemical Society (2023); 102\n pages (18-page manuscript, 84 pages of supporting information)"},{"id":"http://arxiv.org/abs/2307.07946v2","updated":"2023-07-20T02:01:34Z","published":"2023-07-16T04:50:52Z","title":"Unifying Token and Span Level Supervisions for Few-Shot Sequence\n Labeling","summary":" Few-shot sequence labeling aims to identify novel classes based on only a few\nlabeled samples. 
Existing methods solve the data scarcity problem mainly by\ndesigning token-level or span-level labeling models based on metric learning.\nHowever, these methods are only trained at a single granularity (i.e., either\ntoken level or span level) and have some weaknesses of the corresponding\ngranularity. In this paper, we first unify token and span level supervisions\nand propose a Consistent Dual Adaptive Prototypical (CDAP) network for few-shot\nsequence labeling. CDAP contains the token-level and span-level networks,\njointly trained at different granularities. To align the outputs of two\nnetworks, we further propose a consistent loss to enable them to learn from\neach other. During the inference phase, we propose a consistent greedy\ninference algorithm that first adjusts the predicted probability and then\ngreedily selects non-overlapping spans with maximum probability. Extensive\nexperiments show that our model achieves new state-of-the-art results on three\nbenchmark datasets.\n","authors":["Zifeng Cheng","Qingyu Zhou","Zhiwei Jiang","Xuemin Zhao","Yunbo Cao","Qing Gu"],"pdf_url":"https://arxiv.org/pdf/2307.07946v2.pdf","comment":"Accepted by ACM Transactions on Information Systems"},{"id":"http://arxiv.org/abs/2307.10522v1","updated":"2023-07-20T01:48:51Z","published":"2023-07-20T01:48:51Z","title":"Gender-tuning: Empowering Fine-tuning for Debiasing Pre-trained Language\n Models","summary":" Recent studies have revealed that the widely-used Pre-trained Language Models\n(PLMs) propagate societal biases from the large unmoderated pre-training\ncorpora. Existing solutions require debiasing training processes and datasets\nfor debiasing, which are resource-intensive and costly. Furthermore, these\nmethods hurt the PLMs' performance on downstream tasks. In this study, we\npropose Gender-tuning, which debiases the PLMs through fine-tuning on\ndownstream tasks' datasets. For this aim, Gender-tuning integrates Masked\nLanguage Modeling (MLM) training objectives into fine-tuning's training\nprocess. Comprehensive experiments show that Gender-tuning outperforms the\nstate-of-the-art baselines in terms of average gender bias scores in PLMs while\nimproving PLMs' performance on downstream tasks solely using the downstream\ntasks' dataset. Also, Gender-tuning is a deployable debiasing tool for any PLM\nthat works with original fine-tuning.\n","authors":["Somayeh Ghanbarzadeh","Yan Huang","Hamid Palangi","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10514v1","updated":"2023-07-20T01:26:34Z","published":"2023-07-20T01:26:34Z","title":"Building Socio-culturally Inclusive Stereotype Resources with Community\n Engagement","summary":" With rapid development and deployment of generative language models in global\nsettings, there is an urgent need to also scale our measurements of harm, not\njust in the number and types of harms covered, but also how well they account\nfor local cultural contexts, including marginalized identities and the social\nbiases experienced by them. Current evaluation paradigms are limited in their\nabilities to address this, as they are not representative of diverse, locally\nsituated but global, socio-cultural perspectives. It is imperative that our\nevaluation resources are enhanced and calibrated by including people and\nexperiences from different cultures and societies worldwide, in order to\nprevent gross underestimations or skews in measurements of harm. 
In this work,\nwe demonstrate a socio-culturally aware expansion of evaluation resources in\nthe Indian societal context, specifically for the harm of stereotyping. We\ndevise a community engaged effort to build a resource which contains\nstereotypes for axes of disparity that are uniquely present in India. The\nresultant resource increases the number of stereotypes known for and in the\nIndian context by over 1000 stereotypes across many unique identities. We also\ndemonstrate the utility and effectiveness of such expanded resources for\nevaluations of language models. CONTENT WARNING: This paper contains examples\nof stereotypes that may be offensive.\n","authors":["Sunipa Dev","Jaya Goyal","Dinesh Tewari","Shachi Dave","Vinodkumar Prabhakaran"],"pdf_url":"https://arxiv.org/pdf/2307.10514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02288v3","updated":"2023-07-20T01:13:27Z","published":"2023-07-05T13:40:57Z","title":"Performance Comparison of Large Language Models on VNHSGE English\n Dataset: OpenAI ChatGPT, Microsoft Bing Chat, and Google Bard","summary":" This paper presents a performance comparison of three large language models\n(LLMs), namely OpenAI ChatGPT, Microsoft Bing Chat (BingChat), and Google Bard,\non the VNHSGE English dataset. The performance of BingChat, Bard, and ChatGPT\n(GPT-3.5) is 92.4\\%, 86\\%, and 79.2\\%, respectively. The results show that\nBingChat is better than ChatGPT and Bard. Therefore, BingChat and Bard can\nreplace ChatGPT while ChatGPT is not yet officially available in Vietnam. The\nresults also indicate that BingChat, Bard and ChatGPT outperform Vietnamese\nstudents in English language proficiency. The findings of this study contribute\nto the understanding of the potential of LLMs in English language education.\nThe remarkable performance of ChatGPT, BingChat, and Bard demonstrates their\npotential as effective tools for teaching and learning English at the high\nschool level.\n","authors":["Xuan-Quy Dao"],"pdf_url":"https://arxiv.org/pdf/2307.02288v3.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.10512v1","updated":"2023-07-20T01:11:14Z","published":"2023-07-20T01:11:14Z","title":"IvyGPT: InteractiVe Chinese pathwaY language model in medical domain","summary":" General large language models (LLMs) such as ChatGPT have shown remarkable\nsuccess. However, such LLMs have not been widely adopted for medical purposes,\ndue to poor accuracy and inability to provide medical advice. We propose\nIvyGPT, an LLM based on LLaMA that is trained and fine-tuned with high-quality\nmedical question-answer (QA) instances and Reinforcement Learning from Human\nFeedback (RLHF). After supervised fine-tuning, IvyGPT has good multi-turn\nconversation capabilities, but it cannot perform like a doctor in other\naspects, such as comprehensive diagnosis. Through RLHF, IvyGPT can output\nricher diagnosis and treatment answers that are closer to human. In the\ntraining, we used QLoRA to train 33 billion parameters on a small number of\nNVIDIA A100 (80GB) GPUs. 
Experimental results show that IvyGPT has outperformed\nother medical GPT models.\n","authors":["Rongsheng Wang","Yaofei Duan","ChanTong Lam","Jiexi Chen","Jiangsheng Xu","Haoming Chen","Xiaohong Liu","Patrick Cheong-Iao Pang","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10512v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.11408v2","updated":"2023-07-20T00:58:30Z","published":"2023-05-19T03:31:42Z","title":"AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide\n for Simultaneous Speech Translation","summary":" Attention is the core mechanism of today's most used architectures for\nnatural language processing and has been analyzed from many perspectives,\nincluding its effectiveness for machine translation-related tasks. Among these\nstudies, attention resulted to be a useful source of information to get\ninsights about word alignment also when the input text is substituted with\naudio segments, as in the case of the speech translation (ST) task. In this\npaper, we propose AlignAtt, a novel policy for simultaneous ST (SimulST) that\nexploits the attention information to generate source-target alignments that\nguide the model during inference. Through experiments on the 8 language pairs\nof MuST-C v1.0, we show that AlignAtt outperforms previous state-of-the-art\nSimulST policies applied to offline-trained models with gains in terms of BLEU\nof 2 points and latency reductions ranging from 0.5s to 0.8s across the 8\nlanguages.\n","authors":["Sara Papi","Marco Turchi","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2305.11408v2.pdf","comment":"Accepted at Interspeech 2023"},{"id":"http://arxiv.org/abs/2307.09702v2","updated":"2023-07-20T00:40:41Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10511v1","updated":"2023-07-20T00:36:41Z","published":"2023-07-20T00:36:41Z","title":"General Debiasing for Multimodal Sentiment Analysis","summary":" Existing work on Multimodal Sentiment Analysis (MSA) utilizes multimodal\ninformation for prediction yet unavoidably suffers from fitting the spurious\ncorrelations between multimodal features and sentiment labels. For example, if\nmost videos with a blue background have positive labels in a dataset, the model\nwill rely on such correlations for prediction, while ``blue background'' is not\na sentiment-related feature. To address this problem, we define a general\ndebiasing MSA task, which aims to enhance the Out-Of-Distribution (OOD)\ngeneralization ability of MSA models by reducing their reliance on spurious\ncorrelations. To this end, we propose a general debiasing framework based on\nInverse Probability Weighting (IPW), which adaptively assigns small weights to\nthe samples with larger bias i.e., the severer spurious correlations). 
The key\nto this debiasing framework is to estimate the bias of each sample, which is\nachieved by two steps: 1) disentangling the robust features and biased features\nin each modality, and 2) utilizing the biased features to estimate the bias.\nFinally, we employ IPW to reduce the effects of large-biased samples,\nfacilitating robust feature learning for sentiment prediction. To examine the\nmodel's generalization ability, we keep the original testing sets on two\nbenchmarks and additionally construct multiple unimodal and multimodal OOD\ntesting sets. The empirical results demonstrate the superior generalization\nability of our proposed framework. We have released the code and data to\nfacilitate the reproduction.\n","authors":["Teng Sun","Juntong Ni","Wenjie Wang","Liqiang Jing","Yinwei Wei","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2307.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11254v1","updated":"2023-07-20T22:10:04Z","published":"2023-07-20T22:10:04Z","title":"A Systematic Evaluation of Federated Learning on Biomedical Natural\n Language Processing","summary":" Language models (LMs) like BERT and GPT have revolutionized natural language\nprocessing (NLP). However, privacy-sensitive domains, particularly the medical\nfield, face challenges to train LMs due to limited data access and privacy\nconstraints imposed by regulations like the Health Insurance Portability and\nAccountability Act (HIPPA) and the General Data Protection Regulation (GDPR).\nFederated learning (FL) offers a decentralized solution that enables\ncollaborative learning while ensuring the preservation of data privacy. In this\nstudy, we systematically evaluate FL in medicine across $2$ biomedical NLP\ntasks using $6$ LMs encompassing $8$ corpora. Our results showed that: 1) FL\nmodels consistently outperform LMs trained on individual client's data and\nsometimes match the model trained with polled data; 2) With the fixed number of\ntotal data, LMs trained using FL with more clients exhibit inferior\nperformance, but pre-trained transformer-based models exhibited greater\nresilience. 3) LMs trained using FL perform nearly on par with the model\ntrained with pooled data when clients' data are IID distributed while\nexhibiting visible gaps with non-IID data. Our code is available at:\nhttps://github.com/PL97/FedNLP\n","authors":["Le Peng","sicheng zhou","jiandong chen","Rui Zhang","Ziyue Xu","Ju Sun"],"pdf_url":"https://arxiv.org/pdf/2307.11254v1.pdf","comment":"Accepted by KDD 2023 Workshop FL4Data-Mining"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.09782v2","updated":"2023-07-20T18:47:20Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11170v1","updated":"2023-07-20T18:08:34Z","published":"2023-07-20T18:08:34Z","title":"UMLS-KGI-BERT: Data-Centric Knowledge Integration in Transformers for\n Biomedical Entity Recognition","summary":" Pre-trained transformer language models (LMs) have in recent years become the\ndominant paradigm in applied NLP. These models have achieved state-of-the-art\nperformance on tasks such as information extraction, question answering,\nsentiment analysis, document classification and many others. In the biomedical\ndomain, significant progress has been made in adapting this paradigm to NLP\ntasks that require the integration of domain-specific knowledge as well as\nstatistical modelling of language. In particular, research in this area has\nfocused on the question of how best to construct LMs that take into account not\nonly the patterns of token distribution in medical text, but also the wealth of\nstructured information contained in terminology resources such as the UMLS.\nThis work contributes a data-centric paradigm for enriching the language\nrepresentations of biomedical transformer-encoder LMs by extracting text\nsequences from the UMLS. 
This allows for graph-based learning objectives to be\ncombined with masked-language pre-training. Preliminary results from\nexperiments in the extension of pre-trained LMs as well as training from\nscratch show that this framework improves downstream performance on multiple\nbiomedical and clinical Named Entity Recognition (NER) tasks.\n","authors":["Aidan Mannion","Thierry Chevalier","Didier Schwab","Lorraine Geouriot"],"pdf_url":"https://arxiv.org/pdf/2307.11170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11585v2","updated":"2023-07-20T14:31:10Z","published":"2023-06-20T15:02:25Z","title":"FAIR: A Causal Framework for Accurately Inferring Judgments Reversals","summary":" Artificial intelligence researchers have made significant advances in legal\nintelligence in recent years. However, the existing studies have not focused on\nthe important value embedded in judgments reversals, which limits the\nimprovement of the efficiency of legal intelligence. In this paper, we propose\na causal Framework for Accurately Inferring case Reversals (FAIR), which models\nthe problem of judgments reversals based on real Chinese judgments. We mine the\ncauses of judgments reversals by causal inference methods and inject the\nobtained causal relationships into the neural network as a priori knowledge.\nAnd then, our framework is validated on a challenging dataset as a legal\njudgment prediction task. The experimental results show that our framework can\ntap the most critical factors in judgments reversal, and the obtained causal\nrelationships can effectively improve the neural network's performance. In\naddition, we discuss the generalization ability of large language models for\nlegal intelligence tasks using ChatGPT as an example. Our experiment has found\nthat the generalization ability of large language models still has defects, and\nmining causal relationships can effectively improve the accuracy and explain\nability of model predictions.\n","authors":["Minghua He","Nanfei Gu","Yuntao Shi","Qionghui Zhang","Yaying Chen"],"pdf_url":"https://arxiv.org/pdf/2306.11585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11788v1","updated":"2023-07-20T18:30:35Z","published":"2023-07-20T18:30:35Z","title":"Applying QNLP to sentiment analysis in finance","summary":" As an application domain where the slightest qualitative improvements can\nyield immense value, finance is a promising candidate for early quantum\nadvantage. Focusing on the rapidly advancing field of Quantum Natural Language\nProcessing (QNLP), we explore the practical applicability of the two central\napproaches DisCoCat and Quantum-Enhanced Long Short-Term Memory (QLSTM) to the\nproblem of sentiment analysis in finance. Utilizing a novel ChatGPT-based data\ngeneration approach, we conduct a case study with more than 1000 realistic\nsentences and find that QLSTMs can be trained substantially faster than\nDisCoCat while also achieving close to classical results for their available\nsoftware implementations.\n","authors":["Jonas Stein","Ivo Christ","Nicolas Kraus","Maximilian Balthasar Mansky","Robert Müller","Claudia Linnhof-Popien"],"pdf_url":"https://arxiv.org/pdf/2307.11788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11787v1","updated":"2023-07-20T16:22:36Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been on the spotlight of\nresearchers, businesses, and consumers alike. 
While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work I examine GPT-3\nand ChatGPT capabilities on an limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2307.11785v1","updated":"2023-07-20T12:44:47Z","published":"2023-07-20T12:44:47Z","title":"Adversarial Conversational Shaping for Intelligent Agents","summary":" The recent emergence of deep learning methods has enabled the research\ncommunity to achieve state-of-the art results in several domains including\nnatural language processing. However, the current robocall system remains\nunstable and inaccurate: text generator and chat-bots can be tedious and\nmisunderstand human-like dialogue. In this work, we study the performance of\ntwo models able to enhance an intelligent conversational agent through\nadversarial conversational shaping: a generative adversarial network with\npolicy gradient (GANPG) and a generative adversarial network with reward for\nevery generation step (REGS) based on the REGS model presented in Li et al.\n[18] . This model is able to assign rewards to both partially and fully\ngenerated text sequences. We discuss performance with different training\ndetails : seq2seq [ 36] and transformers [37 ] in a reinforcement learning\nframework.\n","authors":["Piotr Tarasiewicz","Sultan Kenjeyev","Ilana Sebag","Shehab Alshehabi"],"pdf_url":"https://arxiv.org/pdf/2307.11785v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.11086v1","updated":"2023-07-20T17:59:33Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\nforeground score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: geometry editing, object manipulation,\ntexture transfer, and exposure control. 
More results and code are available on\nour project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07269v2","updated":"2023-07-20T17:59:25Z","published":"2023-07-14T10:50:43Z","title":"Frequency Domain Adversarial Training for Robust Volumetric Medical\n Segmentation","summary":" It is imperative to ensure the robustness of deep learning models in critical\napplications such as, healthcare. While recent advances in deep learning have\nimproved the performance of volumetric medical image segmentation models, these\nmodels cannot be deployed for real-world applications immediately due to their\nvulnerability to adversarial attacks. We present a 3D frequency domain\nadversarial attack for volumetric medical image segmentation models and\ndemonstrate its advantages over conventional input or voxel domain attacks.\nUsing our proposed attack, we introduce a novel frequency domain adversarial\ntraining approach for optimizing a robust model against voxel and frequency\ndomain attacks. Moreover, we propose frequency consistency loss to regulate our\nfrequency domain adversarial training that achieves a better tradeoff between\nmodel's performance on clean and adversarial samples. Code is publicly\navailable at https://github.com/asif-hanif/vafa.\n","authors":["Asif Hanif","Muzammal Naseer","Salman Khan","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.07269v2.pdf","comment":"This paper has been accepted in MICCAI 2023 conference"},{"id":"http://arxiv.org/abs/2307.11085v1","updated":"2023-07-20T17:59:11Z","published":"2023-07-20T17:59:11Z","title":"Representation Learning in Anomaly Detection: Successes, Limits and a\n Grand Challenge","summary":" In this perspective paper, we argue that the dominant paradigm in anomaly\ndetection cannot scale indefinitely and will eventually hit fundamental limits.\nThis is due to the a no free lunch principle for anomaly detection. These\nlimitations can be overcome when there are strong tasks priors, as is the case\nfor many industrial tasks. When such priors do not exists, the task is much\nharder for anomaly detection. We pose two such tasks as grand challenges for\nanomaly detection: i) scientific discovery by anomaly detection ii) a\n\"mini-grand\" challenge of detecting the most anomalous image in the ImageNet\ndataset. We believe new anomaly detection tools and ideas would need to be\ndeveloped to overcome these challenges.\n","authors":["Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2307.11085v1.pdf","comment":"Keynote talk at the Visual Anomaly and Novelty Detection Workshop,\n CVPR'23"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer : Gated - Long, Short Sequence Transformer for Step\n Recognition in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. 
In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11077v1","updated":"2023-07-20T17:55:14Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v1.pdf","comment":"Accepted by ICCV 2023. Code and Models are publicly available.\n Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2307.11074v1","updated":"2023-07-20T17:53:57Z","published":"2023-07-20T17:53:57Z","title":"Learning Dense UV Completion for Human Mesh Recovery","summary":" Human mesh reconstruction from a single image is challenging in the presence\nof occlusion, which can be caused by self, objects, or other humans. Existing\nmethods either fail to separate human features accurately or lack proper\nsupervision for feature completion. In this paper, we propose Dense Inpainting\nHuman Mesh Recovery (DIMR), a two-stage method that leverages dense\ncorrespondence maps to handle occlusion. Our method utilizes a dense\ncorrespondence map to separate visible human features and completes human\nfeatures on a structured UV map dense human with an attention-based feature\ncompletion module. 
We also design a feature inpainting training procedure that\nguides the network to learn from unoccluded features. We evaluate our method on\nseveral datasets and demonstrate its superior performance under heavily\noccluded scenarios compared to other methods. Extensive experiments show that\nour method obviously outperforms prior SOTA methods on heavily occluded images\nand achieves comparable results on the standard benchmarks (3DPW).\n","authors":["Yanjun Wang","Qingping Sun","Wenjia Wang","Jun Ling","Zhongang Cai","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2307.11074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11073v1","updated":"2023-07-20T17:53:46Z","published":"2023-07-20T17:53:46Z","title":"OBJECT 3DIT: Language-guided 3D-aware Image Editing","summary":" Existing image editing tools, while powerful, typically disregard the\nunderlying 3D geometry from which the image is projected. As a result, edits\nmade using these tools may become detached from the geometry and lighting\nconditions that are at the foundation of the image formation process. In this\nwork, we formulate the newt ask of language-guided 3D-aware editing, where\nobjects in an image should be edited according to a language instruction in\ncontext of the underlying 3D scene. To promote progress towards this goal, we\nrelease OBJECT: a dataset consisting of 400K editing examples created from\nprocedurally generated 3D scenes. Each example consists of an input image,\nediting instruction in language, and the edited image. We also introduce 3DIT :\nsingle and multi-task models for four editing tasks. Our models show impressive\nabilities to understand the 3D composition of entire scenes, factoring in\nsurrounding objects, surfaces, lighting conditions, shadows, and\nphysically-plausible object configurations. Surprisingly, training on only\nsynthetic scenes from OBJECT, editing capabilities of 3DIT generalize to\nreal-world images.\n","authors":["Oscar Michel","Anand Bhattad","Eli VanderBilt","Ranjay Krishna","Aniruddha Kembhavi","Tanmay Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.11073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01738v2","updated":"2023-07-20T17:53:41Z","published":"2023-07-04T14:14:12Z","title":"Mitigating Calibration Bias Without Fixed Attribute Grouping for\n Improved Fairness in Medical Imaging Analysis","summary":" Trustworthy deployment of deep learning medical imaging models into\nreal-world clinical practice requires that they be calibrated. However, models\nthat are well calibrated overall can still be poorly calibrated for a\nsub-population, potentially resulting in a clinician unwittingly making poor\ndecisions for this group based on the recommendations of the model. Although\nmethods have been shown to successfully mitigate biases across subgroups in\nterms of model accuracy, this work focuses on the open problem of mitigating\ncalibration biases in the context of medical image analysis. Our method does\nnot require subgroup attributes during training, permitting the flexibility to\nmitigate biases for different choices of sensitive attributes without\nre-training. To this end, we propose a novel two-stage method: Cluster-Focal to\nfirst identify poorly calibrated samples, cluster them into groups, and then\nintroduce group-wise focal loss to improve calibration bias. We evaluate our\nmethod on skin lesion classification with the public HAM10000 dataset, and on\npredicting future lesional activity for multiple sclerosis (MS) patients. 
In\naddition to considering traditional sensitive attributes (e.g. age, sex) with\ndemographic subgroups, we also consider biases among groups with different\nimage-derived attributes, such as lesion load, which are required in medical\nimage analysis. Our results demonstrate that our method effectively controls\ncalibration error in the worst-performing subgroups while preserving prediction\nperformance, and outperforming recent baselines.\n","authors":["Changjian Shui","Justin Szeto","Raghav Mehta","Douglas L. Arnold","Tal Arbel"],"pdf_url":"https://arxiv.org/pdf/2307.01738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11067v1","updated":"2023-07-20T17:46:21Z","published":"2023-07-20T17:46:21Z","title":"CNOS: A Strong Baseline for CAD-based Novel Object Segmentation","summary":" We propose a simple three-stage approach to segment unseen objects in RGB\nimages using their CAD models. Leveraging recent powerful foundation models,\nDINOv2 and Segment Anything, we create descriptors and generate proposals,\nincluding binary masks for a given input RGB image. By matching proposals with\nreference descriptors created from CAD models, we achieve precise object ID\nassignment along with modal masks. We experimentally demonstrate that our\nmethod achieves state-of-the-art results in CAD-based novel object\nsegmentation, surpassing existing approaches on the seven core datasets of the\nBOP challenge by 19.8\\% AP using the same BOP evaluation protocol. Our source\ncode is available at https://github.com/nv-nguyen/cnos.\n","authors":["Van Nguyen Nguyen","Tomas Hodan","Georgy Ponimatkin","Thibault Groueix","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2307.11067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11058v1","updated":"2023-07-20T17:38:55Z","published":"2023-07-20T17:38:55Z","title":"Driving Policy Prediction based on Deep Learning Models","summary":" In this project, we implemented an end-to-end system that takes in combined\nvisual features of video frames from a normal camera and depth information from\na cloud points scanner, and predicts driving policies (vehicle speed and\nsteering angle). We verified the safety of our system by comparing the\npredicted results with standard behaviors by real-world experienced drivers.\nOur test results show that the predictions can be considered as accurate in at\nlease half of the testing cases (50% 80%, depending on the model), and using\ncombined features improved the performance in most cases than using video\nframes only.\n","authors":["Fuxiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11058v1.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.11052v1","updated":"2023-07-20T17:33:57Z","published":"2023-07-20T17:33:57Z","title":"HRFNet: High-Resolution Forgery Network for Localizing Satellite Image\n Manipulation","summary":" Existing high-resolution satellite image forgery localization methods rely on\npatch-based or downsampling-based training. Both of these training methods have\nmajor drawbacks, such as inaccurate boundaries between pristine and forged\nregions, the generation of unwanted artifacts, etc. To tackle the\naforementioned challenges, inspired by the high-resolution image segmentation\nliterature, we propose a novel model called HRFNet to enable satellite image\nforgery localization effectively. Specifically, equipped with shallow and deep\nbranches, our model can successfully integrate RGB and resampling features in\nboth global and local manners to localize forgery more accurately. 
We perform\nvarious experiments to demonstrate that our method achieves the best\nperformance, while the memory requirement and processing speed are not\ncompromised compared to existing methods.\n","authors":["Fahim Faisal Niloy","Kishor Kumar Bhaumik","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2307.11052v1.pdf","comment":"ICIP 2023"},{"id":"http://arxiv.org/abs/2307.09023v3","updated":"2023-07-20T17:23:55Z","published":"2023-07-18T07:25:38Z","title":"LA-Net: Landmark-Aware Learning for Reliable Facial Expression\n Recognition under Label Noise","summary":" Facial expression recognition (FER) remains a challenging task due to the\nambiguity of expressions. The derived noisy labels significantly harm the\nperformance in real-world scenarios. To address this issue, we present a new\nFER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks\nto mitigate the impact of label noise from two perspectives. Firstly, LA-Net\nuses landmark information to suppress the uncertainty in expression space and\nconstructs the label distribution of each sample by neighborhood aggregation,\nwhich in turn improves the quality of training supervision. Secondly, the model\nincorporates landmark information into expression representations using the\ndevised expression-landmark contrastive loss. The enhanced expression feature\nextractor can be less susceptible to label noise. Our method can be integrated\nwith any deep neural network for better training supervision without\nintroducing extra inference costs. We conduct extensive experiments on both\nin-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net\nachieves state-of-the-art performance.\n","authors":["Zhiyu Wu","Jinshi Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09023v3.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11035v1","updated":"2023-07-20T17:11:20Z","published":"2023-07-20T17:11:20Z","title":"Cascade-DETR: Delving into High-Quality Universal Object Detection","summary":" Object localization in general environments is a fundamental part of vision\nsystems. While dominating on the COCO benchmark, recent Transformer-based\ndetection methods are not competitive in diverse domains. Moreover, these\nmethods still struggle to very accurately estimate the object bounding boxes in\ncomplex environments.\n We introduce Cascade-DETR for high-quality universal object detection. We\njointly tackle the generalization to diverse domains and localization accuracy\nby proposing the Cascade Attention layer, which explicitly integrates\nobject-centric information into the detection decoder by limiting the attention\nto the previous box prediction. To further enhance accuracy, we also revisit\nthe scoring of queries. Instead of relying on classification scores, we predict\nthe expected IoU of the query, leading to substantially more well-calibrated\nconfidences. Lastly, we introduce a universal object detection benchmark,\nUDB10, that contains 10 datasets from diverse domains. While also advancing the\nstate-of-the-art on COCO, Cascade-DETR substantially improves DETR-based\ndetectors on all datasets in UDB10, even by over 10 mAP in some cases. The\nimprovements under stringent quality requirements are even more pronounced. Our\ncode and models will be released at https://github.com/SysCV/cascade-detr.\n","authors":["Mingqiao Ye","Lei Ke","Siyuan Li","Yu-Wing Tai","Chi-Keung Tang","Martin Danelljan","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2307.11035v1.pdf","comment":"Accepted in ICCV 2023. 
Our code and models will be released at\n https://github.com/SysCV/cascade-detr"},{"id":"http://arxiv.org/abs/2305.05610v2","updated":"2023-07-20T16:46:36Z","published":"2023-05-09T17:01:17Z","title":"Can point cloud networks learn statistical shape models of anatomies?","summary":" Statistical Shape Modeling (SSM) is a valuable tool for investigating and\nquantifying anatomical variations within populations of anatomies. However,\ntraditional correspondence-based SSM generation methods have a prohibitive\ninference process and require complete geometric proxies (e.g., high-resolution\nbinary volumes or surface meshes) as input shapes to construct the SSM.\nUnordered 3D point cloud representations of shapes are more easily acquired\nfrom various medical imaging practices (e.g., thresholded images and surface\nscanning). Point cloud deep networks have recently achieved remarkable success\nin learning permutation-invariant features for different point cloud tasks\n(e.g., completion, semantic segmentation, classification). However, their\napplication to learning SSM from point clouds is to-date unexplored. In this\nwork, we demonstrate that existing point cloud encoder-decoder-based completion\nnetworks can provide an untapped potential for SSM, capturing population-level\nstatistical representations of shapes while reducing the inference burden and\nrelaxing the input requirement. We discuss the limitations of these techniques\nto the SSM application and suggest future improvements. Our work paves the way\nfor further exploration of point cloud deep learning for SSM, a promising\navenue for advancing shape analysis literature and broadening SSM to diverse\nuse cases.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05610v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 5 figures, appendix"},{"id":"http://arxiv.org/abs/2307.11017v1","updated":"2023-07-20T16:45:16Z","published":"2023-07-20T16:45:16Z","title":"Multi-objective point cloud autoencoders for explainable myocardial\n infarction prediction","summary":" Myocardial infarction (MI) is one of the most common causes of death in the\nworld. Image-based biomarkers commonly used in the clinic, such as ejection\nfraction, fail to capture more complex patterns in the heart's 3D anatomy and\nthus limit diagnostic accuracy. In this work, we present the multi-objective\npoint cloud autoencoder as a novel geometric deep learning approach for\nexplainable infarction prediction, based on multi-class 3D point cloud\nrepresentations of cardiac anatomy and function. Its architecture consists of\nmultiple task-specific branches connected by a low-dimensional latent space to\nallow for effective multi-objective learning of both reconstruction and MI\nprediction, while capturing pathology-specific 3D shape information in an\ninterpretable latent space. Furthermore, its hierarchical branch design with\npoint cloud-based deep learning operations enables efficient multi-scale\nfeature learning directly on high-resolution anatomy point clouds. In our\nexperiments on a large UK Biobank dataset, the multi-objective point cloud\nautoencoder is able to accurately reconstruct multi-temporal 3D shapes with\nChamfer distances between predicted and input anatomies below the underlying\nimages' pixel resolution. Our method outperforms multiple machine learning and\ndeep learning benchmarks for the task of incident MI prediction by 19% in terms\nof Area Under the Receiver Operating Characteristic curve. 
In addition, its\ntask-specific compact latent space exhibits easily separable control and MI\nclusters with clinically plausible associations between subject encodings and\ncorresponding 3D shapes, thus demonstrating the explainability of the\nprediction.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.11017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05797v2","updated":"2023-07-20T16:36:32Z","published":"2023-05-09T23:01:05Z","title":"Fully Bayesian VIB-DeepSSM","summary":" Statistical shape modeling (SSM) enables population-based quantitative\nanalysis of anatomical shapes, informing clinical diagnosis. Deep learning\napproaches predict correspondence-based SSM directly from unsegmented 3D images\nbut require calibrated uncertainty quantification, motivating Bayesian\nformulations. Variational information bottleneck DeepSSM (VIB-DeepSSM) is an\neffective, principled framework for predicting probabilistic shapes of anatomy\nfrom images with aleatoric uncertainty quantification. However, VIB is only\nhalf-Bayesian and lacks epistemic uncertainty inference. We derive a fully\nBayesian VIB formulation and demonstrate the efficacy of two scalable\nimplementation approaches: concrete dropout and batch ensemble. Additionally,\nwe introduce a novel combination of the two that further enhances uncertainty\ncalibration via multimodal marginalization. Experiments on synthetic shapes and\nleft atrium data demonstrate that the fully Bayesian VIB network predicts SSM\nfrom images with improved uncertainty reasoning without sacrificing accuracy.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05797v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 4 figures, appendix"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10984v1","updated":"2023-07-20T16:14:23Z","published":"2023-07-20T16:14:23Z","title":"Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image","summary":" Reconstructing accurate 3D scenes from images is a long-standing vision task.\nDue to the ill-posedness of the single-image reconstruction problem, most\nwell-established methods are built upon multi-view geometry. State-of-the-art\n(SOTA) monocular metric depth estimation methods can only handle a single\ncamera model and are unable to perform mixed-data training due to the metric\nambiguity. Meanwhile, SOTA monocular methods trained on large mixed datasets\nachieve zero-shot generalization by learning affine-invariant depths, which\ncannot recover real-world metrics. In this work, we show that the key to a\nzero-shot single-view metric depth model lies in the combination of large-scale\ndata training and resolving the metric ambiguity from various camera models. We\npropose a canonical camera space transformation module, which explicitly\naddresses the ambiguity problems and can be effortlessly plugged into existing\nmonocular models. Equipped with our module, monocular models can be stably\ntrained with over 8 million images with thousands of camera models, resulting\nin zero-shot generalization to in-the-wild images with unseen camera settings.\nExperiments demonstrate SOTA performance of our method on 7 zero-shot\nbenchmarks. Notably, our method won the championship in the 2nd Monocular Depth\nEstimation Challenge. Our method enables the accurate recovery of metric 3D\nstructures on randomly collected internet images, paving the way for plausible\nsingle-image metrology. The potential benefits extend to downstream tasks,\nwhich can be significantly improved by simply plugging in our model. For\nexample, our model relieves the scale drift issues of monocular-SLAM (Fig. 1),\nleading to high-quality metric scale dense mapping. The code is available at\nhttps://github.com/YvanYin/Metric3D.\n","authors":["Wei Yin","Chi Zhang","Hao Chen","Zhipeng Cai","Gang Yu","Kaixuan Wang","Xiaozhi Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2307.10984v1.pdf","comment":"Accepted to ICCV 2023. Won the championship in the 2nd Monocular\n Depth Estimation Challenge. The code is available at\n https://github.com/YvanYin/Metric3D"},{"id":"http://arxiv.org/abs/2307.09676v2","updated":"2023-07-20T16:04:11Z","published":"2023-07-18T23:06:47Z","title":"Domain Adaptation based Enhanced Detection for Autonomous Driving in\n Foggy and Rainy Weather","summary":" Typically, object detection methods for autonomous driving that rely on\nsupervised learning make the assumption of a consistent feature distribution\nbetween the training and testing data, however such assumption may fail in\ndifferent weather conditions. Due to the domain gap, a detection model trained\nunder clear weather may not perform well in foggy and rainy conditions.\nOvercoming detection bottlenecks in foggy and rainy weather is a real challenge\nfor autonomous vehicles deployed in the wild. 
To bridge the domain gap and\nimprove the performance of object detection in foggy and rainy weather, this\npaper presents a novel framework for domain-adaptive object detection. The\nadaptations at both the image-level and object-level are intended to minimize\nthe differences in image style and object appearance between domains.\nFurthermore, in order to improve the model's performance on challenging\nexamples, we introduce a novel adversarial gradient reversal layer that\nconducts adversarial mining on difficult instances in addition to domain\nadaptation. Additionally, we suggest generating an auxiliary domain through\ndata augmentation to enforce a new domain-level metric regularization.\nExperimental findings on the public V2V benchmark exhibit a substantial enhancement\nin object detection specifically for foggy and rainy driving scenarios.\n","authors":["Jinlong Li","Runsheng Xu","Jin Ma","Qin Zou","Jiaqi Ma","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09676v2.pdf","comment":"only change the title of this paper"},{"id":"http://arxiv.org/abs/2307.10974v1","updated":"2023-07-20T16:00:19Z","published":"2023-07-20T16:00:19Z","title":"Deep Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverages pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. 
The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v1.pdf","comment":"22 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.10955v1","updated":"2023-07-20T15:26:57Z","published":"2023-07-20T15:26:57Z","title":"Spinal nerve segmentation method and dataset construction in endoscopic\n surgical scenarios","summary":" Endoscopic surgery is currently an important treatment method in the field of\nspinal surgery, and avoiding damage to the spinal nerves through video guidance\nis a key challenge. This paper presents the first real-time segmentation method\nfor spinal nerves in endoscopic surgery, which provides crucial navigational\ninformation for surgeons. A finely annotated segmentation dataset of\napproximately 10,000 consecutive frames recorded during surgery is constructed\nfor the first time for this field, addressing the problem of semantic\nsegmentation. Based on this dataset, we propose FUnet (Frame-Unet), which\nachieves state-of-the-art performance by utilizing inter-frame information and\nself-attention mechanisms. We also conduct extended experiments on a similar\npolyp endoscopy video dataset and show that the model has good generalization\nability with advantageous performance. The dataset and code of this work are\npresented at: https://github.com/zzzzzzpc/FUnet.\n","authors":["Shaowu Peng","Pengcheng Zhao","Yongyu Ye","Junying Chen","Yunbing Chang","Xiaoqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.10955v1.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10954v1","updated":"2023-07-20T15:26:01Z","published":"2023-07-20T15:26:01Z","title":"Soft-tissue Driven Craniomaxillofacial Surgical Planning","summary":" In CMF surgery, the planning of bony movement to achieve a desired facial\noutcome is a challenging task. Current bone driven approaches focus on\nnormalizing the bone with the expectation that the facial appearance will be\ncorrected accordingly. However, due to the complex non-linear relationship\nbetween bony structure and facial soft-tissue, such bone-driven methods are\ninsufficient to correct facial deformities. Despite efforts to simulate facial\nchanges resulting from bony movement, surgical planning still relies on\niterative revisions and educated guesses. To address these issues, we propose a\nsoft-tissue driven framework that can automatically create and verify surgical\nplans. Our framework consists of a bony planner network that estimates the bony\nmovements required to achieve the desired facial outcome and a facial simulator\nnetwork that can simulate the possible facial changes resulting from the\nestimated bony movement plans. By combining these two models, we can verify and\ndetermine the final bony movement required for planning. The proposed framework\nwas evaluated using a clinical dataset, and our experimental results\ndemonstrate that the soft-tissue driven approach greatly improves the accuracy\nand efficacy of surgical planning when compared to the conventional bone-driven\napproach.\n","authors":["Xi Fang","Daeseung Kim","Xuanang Xu","Tianshu Kuang","Nathan Lampen","Jungwook Lee","Hannah H. Deng","Jaime Gateno","Michael A. K. Liebschner","James J. 
Xia","Pingkun Yan"],"pdf_url":"https://arxiv.org/pdf/2307.10954v1.pdf","comment":"Early accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10953v1","updated":"2023-07-20T15:25:55Z","published":"2023-07-20T15:25:55Z","title":"PE-YOLO: Pyramid Enhancement Network for Dark Object Detection","summary":" Current object detection models have achieved good results on many benchmark\ndatasets, detecting objects in dark conditions remains a large challenge. To\naddress this issue, we propose a pyramid enhanced network (PENet) and joint it\nwith YOLOv3 to build a dark object detection framework named PE-YOLO. Firstly,\nPENet decomposes the image into four components of different resolutions using\nthe Laplacian pyramid. Specifically we propose a detail processing module (DPM)\nto enhance the detail of images, which consists of context branch and edge\nbranch. In addition, we propose a low-frequency enhancement filter (LEF) to\ncapture low-frequency semantics and prevent high-frequency noise. PE-YOLO\nadopts an end-to-end joint training approach and only uses normal detection\nloss to simplify the training process. We conduct experiments on the low-light\nobject detection dataset ExDark to demonstrate the effectiveness of ours. The\nresults indicate that compared with other dark detectors and low-light\nenhancement models, PE-YOLO achieves the advanced results, achieving 78.0% in\nmAP and 53.6 in FPS, respectively, which can adapt to object detection under\ndifferent low-light conditions. The code is available at\nhttps://github.com/XiangchenYin/PE-YOLO.\n","authors":["Xiangchen Yin","Zhenda Yu","Zetao Fei","Wenjun Lv","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2307.10953v1.pdf","comment":"Accepted at ICANN 2023"},{"id":"http://arxiv.org/abs/2307.10947v1","updated":"2023-07-20T15:21:28Z","published":"2023-07-20T15:21:28Z","title":"Improving Online Lane Graph Extraction by Object-Lane Clustering","summary":" Autonomous driving requires accurate local scene understanding information.\nTo this end, autonomous agents deploy object detection and online BEV lane\ngraph extraction methods as a part of their perception stack. In this work, we\npropose an architecture and loss formulation to improve the accuracy of local\nlane graph estimates by using 3D object detection outputs. The proposed method\nlearns to assign the objects to centerlines by considering the centerlines as\ncluster centers and the objects as data points to be assigned a probability\ndistribution over the cluster centers. This training scheme ensures direct\nsupervision on the relationship between lanes and objects, thus leading to\nbetter performance. The proposed method improves lane graph estimation\nsubstantially over state-of-the-art methods. The extensive ablations show that\nour method can achieve significant performance improvements by using the\noutputs of existing 3D object detection methods. 
Since our method uses the\ndetection outputs rather than detection method intermediate representations, a\nsingle model of our method can use any detection method at test time.\n","authors":["Yigit Baran Can","Alexander Liniger","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.10947v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10943v1","updated":"2023-07-20T15:13:29Z","published":"2023-07-20T15:13:29Z","title":"Proxy Anchor-based Unsupervised Learning for Continuous Generalized\n Category Discovery","summary":" Recent advances in deep learning have significantly improved the performance\nof various computer vision applications. However, discovering novel categories\nin an incremental learning scenario remains a challenging problem due to the\nlack of prior knowledge about the number and nature of new categories. Existing\nmethods for novel category discovery are limited by their reliance on labeled\ndatasets and prior knowledge about the number of novel categories and the\nproportion of novel samples in the batch. To address the limitations and more\naccurately reflect real-world scenarios, in this paper, we propose a novel\nunsupervised class incremental learning approach for discovering novel\ncategories on unlabeled sets without prior knowledge. The proposed method\nfine-tunes the feature extractor and proxy anchors on labeled sets, then splits\nsamples into old and novel categories and clusters on the unlabeled dataset.\nFurthermore, the proxy anchors-based exemplar generates representative category\nvectors to mitigate catastrophic forgetting. Experimental results demonstrate\nthat our proposed approach outperforms the state-of-the-art methods on\nfine-grained datasets under real-world scenarios.\n","authors":["Hyungmin Kim","Sungho Suh","Daehwan Kim","Daun Jeong","Hansang Cho","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.10943v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2301.06262v2","updated":"2023-07-20T15:09:50Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematical\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focus on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlooked challenges between current academic research and real-world\napplications.\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v2.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. 
URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2307.10934v1","updated":"2023-07-20T15:06:44Z","published":"2023-07-20T15:06:44Z","title":"OCTraN: 3D Occupancy Convolutional Transformer Network in Unstructured\n Traffic Scenarios","summary":" Modern approaches for vision-centric environment perception for autonomous\nnavigation make extensive use of self-supervised monocular depth estimation\nalgorithms that output disparity maps. However, when this disparity map is\nprojected onto 3D space, the errors in disparity are magnified, resulting in a\ndepth estimation error that increases quadratically as the distance from the\ncamera increases. Though Light Detection and Ranging (LiDAR) can solve this\nissue, it is expensive and not feasible for many applications. To address the\nchallenge of accurate ranging with low-cost sensors, we propose, OCTraN, a\ntransformer architecture that uses iterative-attention to convert 2D image\nfeatures into 3D occupancy features and makes use of convolution and transpose\nconvolution to efficiently operate on spatial information. We also develop a\nself-supervised training pipeline to generalize the model to any scene by\neliminating the need for LiDAR ground truth by substituting it with\npseudo-ground truth labels obtained from boosted monocular depth estimation.\n","authors":["Aditya Nalgunda Ganesh","Dhruval Pobbathi Badrinath","Harshith Mohan Kumar","Priya SS","Surabhi Narayan"],"pdf_url":"https://arxiv.org/pdf/2307.10934v1.pdf","comment":"This work was accepted as a spotlight presentation at the\n Transformers for Vision Workshop @CVPR 2023"},{"id":"http://arxiv.org/abs/2307.10927v1","updated":"2023-07-20T14:56:29Z","published":"2023-07-20T14:56:29Z","title":"Modeling 3D cardiac contraction and relaxation with point cloud\n deformation networks","summary":" Global single-valued biomarkers of cardiac function typically used in\nclinical practice, such as ejection fraction, provide limited insight on the\ntrue 3D cardiac deformation process and hence, limit the understanding of both\nhealthy and pathological cardiac mechanics. In this work, we propose the Point\nCloud Deformation Network (PCD-Net) as a novel geometric deep learning approach\nto model 3D cardiac contraction and relaxation between the extreme ends of the\ncardiac cycle. It employs the recent advances in point cloud-based deep\nlearning into an encoder-decoder structure, in order to enable efficient\nmulti-scale feature learning directly on multi-class 3D point cloud\nrepresentations of the cardiac anatomy. We evaluate our approach on a large\ndataset of over 10,000 cases from the UK Biobank study and find average Chamfer\ndistances between the predicted and ground truth anatomies below the pixel\nresolution of the underlying image acquisition. Furthermore, we observe similar\nclinical metrics between predicted and ground truth populations and show that\nthe PCD-Net can successfully capture subpopulation-specific differences between\nnormal subjects and myocardial infarction (MI) patients. 
We then demonstrate\nthat the learned 3D deformation patterns outperform multiple clinical\nbenchmarks by 13% and 7% in terms of area under the receiver operating\ncharacteristic curve for the tasks of prevalent MI detection and incident MI\nprediction and by 7% in terms of Harrell's concordance index for MI survival\nanalysis.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.10927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10926v1","updated":"2023-07-20T14:52:45Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquax","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.10924v1","updated":"2023-07-20T14:51:28Z","published":"2023-07-20T14:51:28Z","title":"Intrinsic Appearance Decomposition Using Point Cloud Representation","summary":" Intrinsic decomposition is to infer the albedo and shading from the image.\nSince it is a heavily ill-posed problem, previous methods rely on prior\nassumptions from 2D images, however, the exploration of the data representation\nitself is limited. The point cloud is known as a rich format of scene\nrepresentation, which naturally aligns the geometric information and the color\ninformation of an image. Our proposed method, Point Intrinsic Net, in short,\nPoInt-Net, jointly predicts the albedo, light source direction, and shading,\nusing point cloud representation. 
Experiments reveal the benefits of PoInt-Net,\nin terms of accuracy, it outperforms 2D representation approaches on multiple\nmetrics across datasets; in terms of efficiency, it trains on small-scale point\nclouds and performs stably on any-scale point clouds; in terms of robustness,\nit only trains on single object level dataset, and demonstrates reasonable\ngeneralization ability for unseen objects and scenes.\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.10924v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2307.10922v1","updated":"2023-07-20T14:47:50Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10915v1","updated":"2023-07-20T14:39:46Z","published":"2023-07-20T14:39:46Z","title":"Revisiting Fine-Tuning Strategies for Self-supervised Medical Imaging\n Analysis","summary":" Despite the rapid progress in self-supervised learning (SSL), end-to-end\nfine-tuning still remains the dominant fine-tuning strategy for medical imaging\nanalysis. However, it remains unclear whether this approach is truly optimal\nfor effectively utilizing the pre-trained knowledge, especially considering the\ndiverse categories of SSL that capture different types of features. In this\npaper, we first establish strong contrastive and restorative SSL baselines that\noutperform SOTA methods across four diverse downstream tasks. Building upon\nthese strong baselines, we conduct an extensive fine-tuning analysis across\nmultiple pre-training and fine-tuning datasets, as well as various fine-tuning\ndataset sizes. Contrary to the conventional wisdom of fine-tuning only the last\nfew layers of a pre-trained network, we show that fine-tuning intermediate\nlayers is more effective, with fine-tuning the second quarter (25-50%) of the\nnetwork being optimal for contrastive SSL whereas fine-tuning the third quarter\n(50-75%) of the network being optimal for restorative SSL. 
Compared to the\nde-facto standard of end-to-end fine-tuning, our best fine-tuning strategy,\nwhich fine-tunes a shallower network consisting of the first three quarters\n(0-75%) of the pre-trained network, yields improvements of as much as 5.48%.\nAdditionally, using these insights, we propose a simple yet effective method to\nleverage the complementary strengths of multiple SSL models, resulting in\nenhancements of up to 3.57% compared to using the best model alone. Hence, our\nfine-tuning strategies not only enhance the performance of individual SSL\nmodels, but also enable effective utilization of the complementary strengths\noffered by multiple SSL models, leading to significant improvements in\nself-supervised medical imaging analysis.\n","authors":["Muhammad Osama Khan","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.10915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10912v1","updated":"2023-07-20T14:34:08Z","published":"2023-07-20T14:34:08Z","title":"WeakPolyp: You Only Look Bounding Box for Polyp Segmentation","summary":" Limited by expensive pixel-level labels, polyp segmentation models are\nplagued by data shortage and suffer from impaired generalization. In contrast,\npolyp bounding box annotations are much cheaper and more accessible. Thus, to\nreduce labeling cost, we propose to learn a weakly supervised polyp\nsegmentation model (i.e., WeakPolyp) completely based on bounding box\nannotations. However, coarse bounding boxes contain too much noise. To avoid\ninterference, we introduce the mask-to-box (M2B) transformation. By supervising\nthe outer box mask of the prediction instead of the prediction itself, M2B\ngreatly mitigates the mismatch between the coarse label and the precise\nprediction. But, M2B only provides sparse supervision, leading to non-unique\npredictions. Therefore, we further propose a scale consistency (SC) loss for\ndense supervision. By explicitly aligning predictions across the same image at\ndifferent scales, the SC loss largely reduces the variation of predictions.\nNote that our WeakPolyp is a plug-and-play model, which can be easily ported to\nother appealing backbones. Besides, the proposed modules are only used during\ntraining, bringing no computation cost to inference. Extensive experiments\ndemonstrate the effectiveness of our proposed WeakPolyp, which surprisingly\nachieves a comparable performance with a fully supervised model, requiring no\nmask annotations at all.\n","authors":["Jun Wei","Yiwen Hu","Shuguang Cui","S. Kevin Zhou","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2307.10912v1.pdf","comment":"accepted by MICCAI 2023, codes are available at\n https://github.com/weijun88/WeakPolyp"},{"id":"http://arxiv.org/abs/2306.14687v2","updated":"2023-07-20T14:29:39Z","published":"2023-06-26T13:32:09Z","title":"GSMorph: Gradient Surgery for cine-MRI Cardiac Deformable Registration","summary":" Deep learning-based deformable registration methods have been widely\ninvestigated in diverse medical applications. Learning-based deformable\nregistration relies on weighted objective functions trading off registration\naccuracy and smoothness of the deformation field. Therefore, they inevitably\nrequire tuning the hyperparameter for optimal registration performance. Tuning\nthe hyperparameters is highly computationally expensive and introduces\nundesired dependencies on domain knowledge. 
In this study, we construct a\nregistration model based on the gradient surgery mechanism, named GSMorph, to\nachieve a hyperparameter-free balance on multiple losses. In GSMorph, we\nreformulate the optimization procedure by projecting the gradient of similarity\nloss orthogonally to the plane associated with the smoothness constraint,\nrather than additionally introducing a hyperparameter to balance these two\ncompeting terms. Furthermore, our method is model-agnostic and can be merged\ninto any deep registration network without introducing extra parameters or\nslowing down inference. In this study, We compared our method with\nstate-of-the-art (SOTA) deformable registration approaches over two publicly\navailable cardiac MRI datasets. GSMorph proves superior to five SOTA\nlearning-based registration models and two conventional registration\ntechniques, SyN and Demons, on both registration accuracy and smoothness.\n","authors":["Haoran Dou","Ning Bi","Luyi Han","Yuhao Huang","Ritse Mann","Xin Yang","Dong Ni","Nishant Ravikumar","Alejandro F. Frangi","Yunzhi Huang"],"pdf_url":"https://arxiv.org/pdf/2306.14687v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2110.05216v2","updated":"2023-07-20T14:29:07Z","published":"2021-10-11T12:32:56Z","title":"High-order Tensor Pooling with Attention for Action Recognition","summary":" We aim at capturing high-order statistics of feature vectors formed by a\nneural network, and propose end-to-end second- and higher-order pooling to form\na tensor descriptor. Tensor descriptors require a robust similarity measure due\nto low numbers of aggregated vectors and the burstiness phenomenon, when a\ngiven feature appears more/less frequently than statistically expected. The\nHeat Diffusion Process (HDP) on a graph Laplacian is closely related to the\nEigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix,\nwhose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN\nplay the same role, i.e., to boost or dampen the magnitude of the eigenspectrum\nthus preventing the burstiness. We equip higher-order tensors with EPN which\nacts as a spectral detector of higher-order occurrences to prevent burstiness.\nWe also prove that for a tensor of order r built from d dimensional feature\ndescriptors, such a detector gives the likelihood if at least one higher-order\noccurrence is 'projected' into one of binom(d,r) subspaces represented by the\ntensor; thus forming a tensor power normalization metric endowed with\nbinom(d,r) such 'detectors'. For experimental contributions, we apply several\nsecond- and higher-order pooling variants to action recognition, provide\npreviously not presented comparisons of such pooling variants, and show\nstate-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities.\n","authors":["Piotr Koniusz","Lei Wang","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2110.05216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v1","updated":"2023-07-20T14:18:44Z","published":"2023-07-20T14:18:44Z","title":"Variational Point Encoding Deformation for Dental Modeling","summary":" Digital dentistry has made significant advancements in recent years, yet\nnumerous challenges remain to be addressed. In this study, we release a new\nextensive dataset of tooth meshes to encourage further research. Additionally,\nwe propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable\nprobabilistic learning of point cloud representations. 
A key challenge in\nexisting latent variable models for point clouds is the lack of a 1-to-1\nmapping between input points and output points. Instead, they must rely on\noptimizing Chamfer distances, a metric that does not have a normalized\ndistributional counterpart, preventing its usage in probabilistic models. We\ndemonstrate that explicit minimization of Chamfer distances can be replaced by\na suitable encoder, which allows us to increase computational efficiency while\nsimplifying the probabilistic extension. Our experimental findings present\nempirical evidence demonstrating the superior performance of VF-Net over\nexisting models in terms of dental scan reconstruction and extrapolation.\nAdditionally, our investigation highlights the robustness of VF-Net's latent\nrepresentations. These results underscore the promising prospects of VF-Net as\nan effective and reliable method for point cloud reconstruction and analysis.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10894v1","updated":"2023-07-20T14:15:20Z","published":"2023-07-20T14:15:20Z","title":"Human Motion Generation: A Survey","summary":" Human motion generation aims to generate natural human pose sequences and\nshows immense potential for real-world applications. Substantial progress has\nbeen made recently in motion data collection technologies and generation\nmethods, laying the foundation for increasing interest in human motion\ngeneration. Most research within this field focuses on generating human motions\nbased on conditional signals, such as text, audio, and scene contexts. While\nsignificant advancements have been made in recent years, the task continues to\npose challenges due to the intricate nature of human motion and its implicit\nrelationship with conditional signals. In this survey, we present a\ncomprehensive literature review of human motion generation, which, to the best\nof our knowledge, is the first of its kind in this field. We begin by\nintroducing the background of human motion and generative models, followed by\nan examination of representative methods for three mainstream sub-tasks:\ntext-conditioned, audio-conditioned, and scene-conditioned human motion\ngeneration. Additionally, we provide an overview of common datasets and\nevaluation metrics. Lastly, we discuss open problems and outline potential\nfuture research directions. We hope that this survey could provide the\ncommunity with a comprehensive glimpse of this rapidly evolving field and\ninspire novel ideas that address the outstanding challenges.\n","authors":["Wentao Zhu","Xiaoxuan Ma","Dongwoo Ro","Hai Ci","Jinlu Zhang","Jiaxin Shi","Feng Gao","Qi Tian","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10894v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.10875v1","updated":"2023-07-20T13:47:30Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" The popularity of point cloud deep models for safety-critical purposes has\nincreased, but the reliability and security of these models can be compromised\nby intentional or naturally occurring point cloud noise. To combat this issue,\nwe present a novel point cloud outlier removal method called PointCVaR, which\nempowers standard-trained models to eliminate additional outliers and restore\nthe data. 
Our approach begins by conducting attribution analysis to determine\nthe influence of each point on the model output, which we refer to as point\nrisk. We then optimize the process of filtering high-risk points using\nConditional Value at Risk (CVaR) as the objective. The rationale for this\napproach is based on the observation that noise points in point clouds tend to\ncluster in the tail of the risk distribution, with a low frequency but a high\nlevel of risk, resulting in significant interference with classification\nresults. Despite requiring no additional training effort, our method produces\nexceptional results in various removal-and-classification experiments for noisy\npoint clouds, which are corrupted by random noise, adversarial noise, and\nbackdoor trigger noise. Impressively, it achieves 87% accuracy in defense\nagainst the backdoor attack by removing triggers. Overall, the proposed\nPointCVaR effectively eliminates noise points and enhances point cloud\nclassification, making it a promising plug-in module for various models in\ndifferent scenarios.\n","authors":["Xinke Li","Junchi Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10873v1","updated":"2023-07-20T13:43:48Z","published":"2023-07-20T13:43:48Z","title":"Conservative Estimation of Perception Relevance of Dynamic Objects for\n Safe Trajectories in Automotive Scenarios","summary":" Having efficient testing strategies is a core challenge that needs to be\novercome for the release of automated driving. This necessitates clear\nrequirements as well as suitable methods for testing. In this work, the\nrequirements for perception modules are considered with respect to relevance.\nThe concept of relevance currently remains insufficiently defined and\nspecified. In this paper, we propose a novel methodology to overcome this\nchallenge by exemplary application to collision safety in the highway domain.\nUsing this general system and use case specification, a corresponding concept\nfor relevance is derived. Irrelevant objects are thus defined as objects which\ndo not limit the set of safe actions available to the ego vehicle under\nconsideration of all uncertainties. As an initial step, the use case is\ndecomposed into functional scenarios with respect to collision relevance. For\neach functional scenario, possible actions of both the ego vehicle and any\nother dynamic object are formalized as equations. This set of possible actions\nis constrained by traffic rules, yielding relevance criteria. As a result, we\npresent a conservative estimation which dynamic objects are relevant for\nperception and need to be considered for a complete evaluation. The estimation\nprovides requirements which are applicable for offline testing and validation\nof perception components. A visualization is presented for examples from the\nhighD dataset, showing the plausibility of the results. Finally, a possibility\nfor a future validation of the presented relevance concept is outlined.\n","authors":["Ken Mori","Kai Storms","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2307.10873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. 
Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15] leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. Our framework comprises of 1) an automatic method for evaluating\nquality of figure-caption pairs, 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: a novel attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. 
More videos and updates can be found on the\nproject page \url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2307.10854v1","updated":"2023-07-20T13:17:30Z","published":"2023-07-20T13:17:30Z","title":"BlendFace: Re-designing Identity Encoders for Face-Swapping","summary":" The great advancements in generative adversarial networks and face\nrecognition models in computer vision have made it possible to swap identities\non images from single sources. Although a lot of studies seem to have proposed\nalmost satisfactory solutions, we notice that previous methods still suffer from an\nidentity-attribute entanglement that causes undesired attribute swapping\nbecause widely used identity encoders, e.g., ArcFace, have some crucial attribute\nbiases owing to their pretraining on face recognition tasks. To address this\nissue, we design BlendFace, a novel identity encoder for face-swapping. The key\nidea behind BlendFace is that training face recognition models on blended images,\nwhose attributes are replaced with those of another person, mitigates inter-personal\nbiases such as hairstyles. BlendFace feeds disentangled identity features into\ngenerators and guides generators properly as an identity loss function.\nExtensive experiments demonstrate that BlendFace improves the\nidentity-attribute disentanglement in face-swapping models, maintaining a\ncomparable quantitative performance to previous methods.\n","authors":["Kaede Shiohara","Xingchao Yang","Takafumi Taketomi"],"pdf_url":"https://arxiv.org/pdf/2307.10854v1.pdf","comment":"ICCV2023. Code: https://github.com/mapooon/BlendFace, Webpage:\n https://mapooon.github.io/BlendFacePage/"},{"id":"http://arxiv.org/abs/2307.10853v1","updated":"2023-07-20T13:16:10Z","published":"2023-07-20T13:16:10Z","title":"Exploring Effective Priors and Efficient Models for Weakly-Supervised\n Change Detection","summary":" Weakly-supervised change detection (WSCD) aims to detect pixel-level changes\nwith only image-level annotations. Owing to its label efficiency, WSCD is\ndrawing increasing attention recently. However, current WSCD methods often\nencounter the challenge of change missing and fabricating, i.e., the\ninconsistency between image-level annotations and pixel-level predictions.\nSpecifically, change missing refers to the situation in which the WSCD model fails\nto predict any changed pixels, even though the image-level label indicates\nchanged, and vice versa for change fabricating. To address this challenge, in\nthis work, we leverage global-scale and local-scale priors in WSCD and propose\ntwo components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.\nThe DP decoder decodes samples with the changed image-level label, skips\nsamples with the unchanged label, and replaces them with an all-unchanged\npixel-level label. The LG constraint is derived from the correspondence between\nchanged representations and image-level labels, penalizing the model when it\nmispredicts the change status. Additionally, we develop TransWCD, a simple yet\npowerful transformer-based model, showcasing the potential of weakly-supervised\nlearning in change detection. By integrating the DP decoder and LG constraint\ninto TransWCD, we form TransWCD-DL. 
Our proposed TransWCD and TransWCD-DL\nachieve significant +6.33% and +9.55% F1 score improvements over the\nstate-of-the-art methods on the WHU-CD dataset, respectively. Some performance\nmetrics even exceed several fully-supervised change detection (FSCD)\ncompetitors. Code will be available at\nhttps://github.com/zhenghuizhao/TransWCD.\n","authors":["Zhenghui Zhao","Lixiang Ru","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10845v1","updated":"2023-07-20T13:07:41Z","published":"2023-07-20T13:07:41Z","title":"Self-paced Weight Consolidation for Continual Learning","summary":" Continual learning algorithms which keep the parameters of new tasks close to\nthat of previous tasks, are popular in preventing catastrophic forgetting in\nsequential task learning settings. However, 1) the performance for the new\ncontinual learner will be degraded without distinguishing the contributions of\npreviously learned tasks; 2) the computational cost will be greatly increased\nwith the number of tasks, since most existing algorithms need to regularize all\nprevious tasks when learning new tasks. To address the above challenges, we\npropose a self-paced Weight Consolidation (spWC) framework to attain robust\ncontinual learning via evaluating the discriminative contributions of previous\ntasks. To be specific, we develop a self-paced regularization to reflect the\npriorities of past tasks via measuring difficulty based on key performance\nindicator (i.e., accuracy). When encountering a new task, all previous tasks\nare sorted from \"difficult\" to \"easy\" based on the priorities. Then the\nparameters of the new continual learner will be learned via selectively\nmaintaining the knowledge amongst more difficult past tasks, which could well\novercome catastrophic forgetting with less computational cost. We adopt an\nalternative convex search to iteratively update the model parameters and\npriority weights in the bi-convex formulation. The proposed spWC framework is\nplug-and-play, which is applicable to most continual learning algorithms (e.g.,\nEWC, MAS and RCIL) in different directions (e.g., classification and\nsegmentation). Experimental results on several public benchmark datasets\ndemonstrate that our proposed framework can effectively improve performance\nwhen compared with other popular continual learning algorithms.\n","authors":["Wei Cong","Yang Cong","Gan Sun","Yuyang Liu","Jiahua Dong"],"pdf_url":"https://arxiv.org/pdf/2307.10845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10843v1","updated":"2023-07-20T13:04:26Z","published":"2023-07-20T13:04:26Z","title":"Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals\n for GPM: A U-Net Convolutional LSTM Architecture","summary":" This paper presents a deep learning architecture for nowcasting of\nprecipitation almost globally every 30 min with a 4-hour lead time. The\narchitecture fuses a U-Net and a convolutional long short-term memory (LSTM)\nneural network and is trained using data from the Integrated MultisatellitE\nRetrievals for GPM (IMERG) and a few key precipitation drivers from the Global\nForecast System (GFS). The impacts of different training loss functions,\nincluding the mean-squared error (regression) and the focal-loss\n(classification), on the quality of precipitation nowcasts are studied. 
The\nresults indicate that the regression network performs well in capturing light\nprecipitation (below 1.6 mm/hr), but the classification network can outperform\nthe regression network for nowcasting of precipitation extremes (>8 mm/hr), in\nterms of the critical success index (CSI). Using the Wasserstein distance, it\nis shown that the predicted precipitation by the classification network has a\ncloser class probability distribution to the IMERG than the regression network.\nIt is uncovered that the inclusion of the physical variables can improve\nprecipitation nowcasting, especially at longer lead times in both networks.\nTaking IMERG as a relative reference, a multi-scale analysis in terms of\nfractions skill score (FSS) shows that the nowcasting machine remains skillful\n(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For\nprecipitation rates greater than 4~mm/hr, only the classification network\nremains FSS-skillful on scales greater than 50 km within a 2-hour lead time.\n","authors":["Reyhaneh Rahimi","Ardeshir Ebtehaj","Ali Behrangi","Jackson Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10842v1","updated":"2023-07-20T13:02:45Z","published":"2023-07-20T13:02:45Z","title":"Label Calibration for Semantic Segmentation Under Domain Shift","summary":" Performance of a pre-trained semantic segmentation model is likely to\nsubstantially decrease on data from a new domain. We show a pre-trained model\ncan be adapted to unlabelled target domain data by calculating soft-label\nprototypes under the domain shift and making predictions according to the\nprototype closest to the vector with predicted class probabilities. The\nproposed adaptation procedure is fast, comes almost for free in terms of\ncomputational resources and leads to considerable performance improvements. We\ndemonstrate the benefits of such label calibration on the highly-practical\nsynthetic-to-real semantic segmentation problem.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10842v1.pdf","comment":"ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for\n Trustworthy ML"},{"id":"http://arxiv.org/abs/2307.08930v2","updated":"2023-07-20T12:41:19Z","published":"2023-07-18T02:35:01Z","title":"Unsupervised Deep Graph Matching Based on Cycle Consistency","summary":" We contribute to the sparsely populated area of unsupervised deep graph\nmatching with application to keypoint matching in images. Contrary to the\nstandard \\emph{supervised} approach, our method does not require ground truth\ncorrespondences between keypoint pairs. Instead, it is self-supervised by\nenforcing consistency of matchings between images of the same object category.\nAs the matching and the consistency loss are discrete, their derivatives cannot\nbe straightforwardly used for learning. We address this issue in a principled\nway by building our method upon the recent results on black-box differentiation\nof combinatorial solvers. 
This makes our method exceptionally flexible, as it\nis compatible with arbitrary network architectures and combinatorial solvers.\nOur experimental evaluation suggests that our technique sets a new\nstate-of-the-art for unsupervised graph matching.\n","authors":["Siddharth Tourani","Carsten Rother","Muhammad Haris Khan","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2307.08930v2.pdf","comment":"12 pages, 5 figures, 3 papers"},{"id":"http://arxiv.org/abs/2307.10824v1","updated":"2023-07-20T12:38:17Z","published":"2023-07-20T12:38:17Z","title":"Parse and Recall: Towards Accurate Lung Nodule Malignancy Prediction\n like Radiologists","summary":" Lung cancer is a leading cause of death worldwide and early screening is\ncritical for improving survival outcomes. In clinical practice, the contextual\nstructure of nodules and the accumulated experience of radiologists are the two\ncore elements related to the accuracy of identification of benign and malignant\nnodules. Contextual information provides comprehensive information about\nnodules such as location, shape, and peripheral vessels, and experienced\nradiologists can search for clues from previous cases as a reference to enrich\nthe basis of decision-making. In this paper, we propose a radiologist-inspired\nmethod to simulate the diagnostic process of radiologists, which is composed of\ncontext parsing and prototype recalling modules. The context parsing module\nfirst segments the context structure of nodules and then aggregates contextual\ninformation for a more comprehensive understanding of the nodule. The prototype\nrecalling module utilizes prototype-based learning to condense previously\nlearned cases as prototypes for comparative analysis, which is updated online\nin a momentum way during training. Building on the two modules, our method\nleverages both the intrinsic characteristics of the nodules and the external\nknowledge accumulated from other nodules to achieve a sound diagnosis. To meet\nthe needs of both low-dose and noncontrast screening, we collect a large-scale\ndataset of 12,852 and 4,029 nodules from low-dose and noncontrast CTs\nrespectively, each with pathology- or follow-up-confirmed labels. Experiments\non several datasets demonstrate that our method achieves advanced screening\nperformance on both low-dose and noncontrast scenarios.\n","authors":["Jianpeng Zhang","Xianghua Ye","Jianfeng Zhang","Yuxing Tang","Minfeng Xu","Jianfei Guo","Xin Chen","Zaiyi Liu","Jingren Zhou","Le Lu","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10824v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2212.13792v2","updated":"2023-07-20T12:37:06Z","published":"2022-12-28T12:08:27Z","title":"Periocular Biometrics: A Modality for Unconstrained Scenarios","summary":" Periocular refers to the externally visible region of the face that surrounds\nthe eye socket. This feature-rich area can provide accurate identification in\nunconstrained or uncooperative scenarios, where the iris or face modalities may\nnot offer sufficient biometric cues due to factors such as partial occlusion or\nhigh subject-to-camera distance. The COVID-19 pandemic has further highlighted\nits importance, as the ocular region remained the only visible facial area even\nin controlled settings due to the widespread use of masks. 
This paper discusses\nthe state of the art in periocular biometrics, presenting an overall framework\nencompassing its most significant research aspects, which include: (a) ocular\ndefinition, acquisition, and detection; (b) identity recognition, including\ncombination with other modalities and use of various spectra; and (c) ocular\nsoft-biometric analysis. Finally, we conclude by addressing current challenges\nand proposing future directions.\n","authors":["Fernando Alonso-Fernandez","Josef Bigun","Julian Fierrez","Naser Damer","Hugo Proença","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2212.13792v2.pdf","comment":"Published at IEEE Computer journal"},{"id":"http://arxiv.org/abs/2307.10822v1","updated":"2023-07-20T12:32:25Z","published":"2023-07-20T12:32:25Z","title":"Gradient-Semantic Compensation for Incremental Semantic Segmentation","summary":" Incremental semantic segmentation aims to continually learn the segmentation\nof new coming classes without accessing the training data of previously learned\nclasses. However, most current methods fail to address catastrophic forgetting\nand background shift since they 1) treat all previous classes equally without\nconsidering different forgetting paces caused by imbalanced gradient\nback-propagation; 2) lack strong semantic guidance between classes. To tackle\nthe above challenges, in this paper, we propose a Gradient-Semantic\nCompensation (GSC) model, which surmounts incremental semantic segmentation\nfrom both gradient and semantic perspectives. Specifically, to address\ncatastrophic forgetting from the gradient aspect, we develop a step-aware\ngradient compensation that can balance forgetting paces of previously seen\nclasses via re-weighting gradient backpropagation. Meanwhile, we propose a\nsoft-sharp semantic relation distillation to distill consistent inter-class\nsemantic relations via soft labels for alleviating catastrophic forgetting from\nthe semantic aspect. In addition, we develop a prototypical pseudo re-labeling\nthat provides strong semantic guidance to mitigate background shift. It\nproduces high-quality pseudo labels for old classes in the background by\nmeasuring distances between pixels and class-wise prototypes. Extensive\nexperiments on three public datasets, i.e., Pascal VOC 2012, ADE20K, and\nCityscapes, demonstrate the effectiveness of our proposed GSC model.\n","authors":["Wei Cong","Yang Cong","Jiahua Dong","Gan Sun","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2307.10822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10816v1","updated":"2023-07-20T12:25:06Z","published":"2023-07-20T12:25:06Z","title":"BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained\n Diffusion","summary":" Recent text-to-image diffusion models have demonstrated an astonishing\ncapacity to generate high-quality images. However, researchers mainly studied\nthe way of synthesizing images with only text prompts. While some works have\nexplored using other modalities as conditions, considerable paired data, e.g.,\nbox/mask-image pairs, and fine-tuning time are required for nurturing models.\nAs such paired data is time-consuming and labor-intensive to acquire and\nrestricted to a closed set, this potentially becomes the bottleneck for\napplications in an open world. This paper focuses on the simplest form of\nuser-provided conditions, e.g., box or scribble. 
To mitigate the aforementioned\nproblem, we propose a training-free method to control objects and contexts in\nthe synthesized images adhering to the given spatial conditions. Specifically,\nthree spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints,\nare designed and seamlessly integrated into the denoising step of diffusion\nmodels, requiring no additional training and massive annotated layout data.\nExtensive results show that the proposed constraints can control what and where\nto present in the images while retaining the ability of the Stable Diffusion\nmodel to synthesize with high fidelity and diverse concept coverage. The code\nis publicly available at https://github.com/Sierkinhane/BoxDiff.\n","authors":["Jinheng Xie","Yuexiang Li","Yawen Huang","Haozhe Liu","Wentian Zhang","Yefeng Zheng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2307.10816v1.pdf","comment":"Accepted by ICCV 2023. The paper is still being revised for better\n organization and comparison"},{"id":"http://arxiv.org/abs/2306.09683v2","updated":"2023-07-20T12:23:12Z","published":"2023-06-16T08:27:46Z","title":"Scaling Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection has benefited greatly from pretrained\nvision-language models, but is still limited by the amount of available\ndetection training data. While detection training data can be expanded by using\nWeb image-text pairs as weak supervision, this has not been done at scales\ncomparable to image-level pretraining. Here, we scale up detection data with\nself-training, which uses an existing detector to generate pseudo-box\nannotations on image-text pairs. Major challenges in scaling self-training are\nthe choice of label space, pseudo-annotation filtering, and training\nefficiency. We present the OWLv2 model and OWL-ST self-training recipe, which\naddress these challenges. OWLv2 surpasses the performance of previous\nstate-of-the-art open-vocabulary detectors already at comparable training\nscales (~10M examples). However, with OWL-ST, we can scale to over 1B examples,\nyielding further large improvement: With an L/14 architecture, OWL-ST improves\nAP on LVIS rare classes, for which the model has seen no human box annotations,\nfrom 31.2% to 44.6% (43% relative improvement). OWL-ST unlocks Web-scale\ntraining for open-world localization, similar to what has been seen for image\nclassification and language modelling.\n","authors":["Matthias Minderer","Alexey Gritsenko","Neil Houlsby"],"pdf_url":"https://arxiv.org/pdf/2306.09683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10813v1","updated":"2023-07-20T12:21:26Z","published":"2023-07-20T12:21:26Z","title":"Perceptual Quality Assessment of Omnidirectional Audio-visual Signals","summary":" Omnidirectional videos (ODVs) play an increasingly important role in the\napplication fields of medical, education, advertising, tourism, etc. Assessing\nthe quality of ODVs is significant for service-providers to improve the user's\nQuality of Experience (QoE). However, most existing quality assessment studies\nfor ODVs only focus on the visual distortions of videos, while ignoring that\nthe overall QoE also depends on the accompanying audio signals. 
In this paper,\nwe first establish a large-scale audio-visual quality assessment dataset for\nomnidirectional videos, which includes 375 distorted omnidirectional\naudio-visual (A/V) sequences generated from 15 high-quality pristine\nomnidirectional A/V contents, and the corresponding perceptual audio-visual\nquality scores. Then, we design three baseline methods for full-reference\nomnidirectional audio-visual quality assessment (OAVQA), which combine existing\nstate-of-the-art single-mode audio and video QA models via multimodal fusion\nstrategies. We validate the effectiveness of the A/V multimodal fusion method\nfor OAVQA on our dataset, which provides a new benchmark for omnidirectional\nQoE evaluation. Our dataset is available at https://github.com/iamazxl/OAVQA.\n","authors":["Xilei Zhu","Huiyu Duan","Yuqin Cao","Yuxin Zhu","Yucheng Zhu","Jing Liu","Li Chen","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2307.10813v1.pdf","comment":"12 pages, 5 figures, to be published in CICAI2023"},{"id":"http://arxiv.org/abs/2009.03259v2","updated":"2023-07-20T12:11:56Z","published":"2020-09-07T17:27:27Z","title":"Implicit Multidimensional Projection of Local Subspaces","summary":" We propose a visualization method to understand the effect of\nmultidimensional projection on local subspaces, using implicit function\ndifferentiation. Here, we understand the local subspace as the multidimensional\nlocal neighborhood of data points. Existing methods focus on the projection of\nmultidimensional data points, and the neighborhood information is ignored. Our\nmethod is able to analyze the shape and directional information of the local\nsubspace to gain more insights into the global structure of the data through\nthe perception of local structures. Local subspaces are fitted by\nmultidimensional ellipses that are spanned by basis vectors. An accurate and\nefficient vector transformation method is proposed based on analytical\ndifferentiation of multidimensional projections formulated as implicit\nfunctions. The results are visualized as glyphs and analyzed using a full set\nof specifically-designed interactions supported in our efficient web-based\nvisualization tool. The usefulness of our method is demonstrated using various\nmulti- and high-dimensional benchmark datasets. Our implicit differentiation\nvector transformation is evaluated through numerical comparisons; the overall\nmethod is evaluated through exploration examples and use cases.\n","authors":["Rongzheng Bian","Yumeng Xue","Liang Zhou","Jian Zhang","Baoquan Chen","Daniel Weiskopf","Yunhai Wang"],"pdf_url":"https://arxiv.org/pdf/2009.03259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. 
In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2307.09906v2","updated":"2023-07-20T12:00:23Z","published":"2023-07-19T11:10:26Z","title":"Implicit Identity Representation Conditioned Memory Compensation Network\n for Talking Head video Generation","summary":" Talking head video generation aims to animate a human face in a still image\nwith dynamic poses and expressions using motion information derived from a\ntarget-driving video, while maintaining the person's identity in the source\nimage. However, dramatic and complex motions in the driving video cause\nambiguous generation, because the still source image cannot provide sufficient\nappearance information for occluded regions or delicate expression variations,\nwhich produces severe artifacts and significantly degrades the generation\nquality. To tackle this problem, we propose to learn a global facial\nrepresentation space, and design a novel implicit identity representation\nconditioned memory compensation network, coined as MCNet, for high-fidelity\ntalking head generation.~Specifically, we devise a network module to learn a\nunified spatial facial meta-memory bank from all training samples, which can\nprovide rich facial structure and appearance priors to compensate warped source\nfacial features for the generation. Furthermore, we propose an effective query\nmechanism based on implicit identity representations learned from the discrete\nkeypoints of the source image. It can greatly facilitate the retrieval of more\ncorrelated information from the memory bank for the compensation. Extensive\nexperiments demonstrate that MCNet can learn representative and complementary\nfacial memory, and can clearly outperform previous state-of-the-art talking\nhead generation methods on VoxCeleb1 and CelebV datasets. 
Please check our\n\\href{https://github.com/harlanhong/ICCV2023-MCNET}{Project}.\n","authors":["Fa-Ting Hong","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09906v2.pdf","comment":"Accepted by ICCV2023, update the reference and figures"},{"id":"http://arxiv.org/abs/2307.10797v1","updated":"2023-07-20T11:59:42Z","published":"2023-07-20T11:59:42Z","title":"HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and\n Retarget Faces","summary":" In this paper, we present our method for neural face reenactment, called\nHyperReenact, that aims to generate realistic talking head images of a source\nidentity, driven by a target facial pose. Existing state-of-the-art face\nreenactment methods train controllable generative models that learn to\nsynthesize realistic facial images, yet producing reenacted faces that are\nprone to significant visual artifacts, especially under the challenging\ncondition of extreme head pose changes, or requiring expensive few-shot\nfine-tuning to better preserve the source identity characteristics. We propose\nto address these limitations by leveraging the photorealistic generation\nability and the disentangled properties of a pretrained StyleGAN2 generator, by\nfirst inverting the real images into its latent space and then using a\nhypernetwork to perform: (i) refinement of the source identity characteristics\nand (ii) facial pose re-targeting, eliminating this way the dependence on\nexternal editing methods that typically produce artifacts. Our method operates\nunder the one-shot setting (i.e., using a single source frame) and allows for\ncross-subject reenactment, without requiring any subject-specific fine-tuning.\nWe compare our method both quantitatively and qualitatively against several\nstate-of-the-art techniques on the standard benchmarks of VoxCeleb1 and\nVoxCeleb2, demonstrating the superiority of our approach in producing\nartifact-free images, exhibiting remarkable robustness even under extreme head\npose changes. We make the code and the pretrained models publicly available at:\nhttps://github.com/StelaBou/HyperReenact .\n","authors":["Stella Bounareli","Christos Tzelepis","Vasileios Argyriou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2307.10797v1.pdf","comment":"Accepted for publication in ICCV 2023. Project page:\n https://stelabou.github.io/hyperreenact.github.io/ Code:\n https://github.com/StelaBou/HyperReenact"},{"id":"http://arxiv.org/abs/2307.10792v1","updated":"2023-07-20T11:45:38Z","published":"2023-07-20T11:45:38Z","title":"Optimizing PatchCore for Few/many-shot Anomaly Detection","summary":" Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and\ntries to distinguish between normal and anomalous data using only few selected\nsamples. While newly proposed few-shot AD methods do compare against\npre-existing algorithms developed for the full-shot domain as baselines, they\ndo not dedicatedly optimize them for the few-shot setting. It thus remains\nunclear if the performance of such pre-existing algorithms can be further\nimproved. We address said question in this work. Specifically, we present a\nstudy on the AD/anomaly segmentation (AS) performance of PatchCore, the current\nstate-of-the-art full-shot AD/AS algorithm, in both the few-shot and the\nmany-shot settings. 
We hypothesize that further performance improvements can be\nrealized by (I) optimizing its various hyperparameters, and by (II)\ntransferring techniques known to improve few-shot supervised learning to the AD\ndomain. Exhaustive experiments on the public VisA and MVTec AD datasets reveal\nthat (I) significant performance improvements can be realized by optimizing\nhyperparameters such as the underlying feature extractor, and that (II)\nimage-level augmentations can, but are not guaranteed to, improve performance.\nBased on these findings, we achieve a new state of the art in few-shot AD on\nVisA, further demonstrating the merit of adapting pre-existing AD/AS methods to\nthe few-shot setting. Last, we identify the investigation of feature extractors\nwith a strong inductive bias as a potential future research direction for\n(few-shot) AD/AS.\n","authors":["João Santos","Triet Tran","Oliver Rippel"],"pdf_url":"https://arxiv.org/pdf/2307.10792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10790v1","updated":"2023-07-20T11:42:24Z","published":"2023-07-20T11:42:24Z","title":"Behavioral Analysis of Vision-and-Language Navigation Agents","summary":" To be successful, Vision-and-Language Navigation (VLN) agents must be able to\nground instructions to actions based on their surroundings. In this work, we\ndevelop a methodology to study agent behavior on a skill-specific basis --\nexamining how well existing agents ground instructions about stopping, turning,\nand moving towards specified objects or rooms. Our approach is based on\ngenerating skill-specific interventions and measuring changes in agent\npredictions. We present a detailed case study analyzing the behavior of a\nrecent agent and then compare multiple agents in terms of skill-specific\ncompetency scores. This analysis suggests that biases from training have\nlasting effects on agent behavior and that existing models are able to ground\nsimple referring expressions. Our comparisons between models show that\nskill-specific scores correlate with improvements in overall VLN task\nperformance.\n","authors":["Zijiao Yang","Arjun Majumdar","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10790v1.pdf","comment":"accepted to CVPR2023"},{"id":"http://arxiv.org/abs/2307.10787v1","updated":"2023-07-20T11:36:45Z","published":"2023-07-20T11:36:45Z","title":"Feed-Forward Source-Free Domain Adaptation via Class Prototypes","summary":" Source-free domain adaptation has become popular because of its practical\nusefulness and no need to access source data. However, the adaptation process\nstill takes a considerable amount of time and is predominantly based on\noptimization that relies on back-propagation. In this work we present a simple\nfeed-forward approach that challenges the need for back-propagation based\nadaptation. Our approach is based on computing prototypes of classes under the\ndomain shift using a pre-trained model. 
It achieves strong improvements in\naccuracy compared to the pre-trained model and requires only a small fraction\nof time of existing domain adaptation methods.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10787v1.pdf","comment":"ECCV 2022 Workshop on Out of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2307.10784v1","updated":"2023-07-20T11:33:46Z","published":"2023-07-20T11:33:46Z","title":"SMURF: Spatial Multi-Representation Fusion for 3D Object Detection with\n 4D Imaging Radar","summary":" The 4D Millimeter wave (mmWave) radar is a promising technology for vehicle\nsensing due to its cost-effectiveness and operability in adverse weather\nconditions. However, the adoption of this technology has been hindered by\nsparsity and noise issues in radar point cloud data. This paper introduces\nspatial multi-representation fusion (SMURF), a novel approach to 3D object\ndetection using a single 4D imaging radar. SMURF leverages multiple\nrepresentations of radar detection points, including pillarization and density\nfeatures of a multi-dimensional Gaussian mixture distribution through kernel\ndensity estimation (KDE). KDE effectively mitigates measurement inaccuracy\ncaused by limited angular resolution and multi-path propagation of radar\nsignals. Additionally, KDE helps alleviate point cloud sparsity by capturing\ndensity features. Experimental evaluations on View-of-Delft (VoD) and\nTJ4DRadSet datasets demonstrate the effectiveness and generalization ability of\nSMURF, outperforming recently proposed 4D imaging radar-based\nsingle-representation models. Moreover, while using 4D imaging radar only,\nSMURF still achieves comparable performance to the state-of-the-art 4D imaging\nradar and camera fusion-based method, with an increase of 1.22% in the mean\naverage precision on bird's-eye view of TJ4DRadSet dataset and 1.32% in the 3D\nmean average precision on the entire annotated area of VoD dataset. Our\nproposed method demonstrates impressive inference time and addresses the\nchallenges of real-time detection, with the inference time no more than 0.05\nseconds for most scans on both datasets. This research highlights the benefits\nof 4D mmWave radar and is a strong benchmark for subsequent works regarding 3D\nobject detection with 4D imaging radar.\n","authors":["Jianan Liu","Qiuchi Zhao","Weiyi Xiong","Tao Huang","Qing-Long Han","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.10784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10782v1","updated":"2023-07-20T11:32:51Z","published":"2023-07-20T11:32:51Z","title":"See More and Know More: Zero-shot Point Cloud Segmentation via\n Multi-modal Visual Data","summary":" Zero-shot point cloud segmentation aims to make deep models capable of\nrecognizing novel objects in point cloud that are unseen in the training phase.\nRecent trends favor the pipeline which transfers knowledge from seen classes\nwith labels to unseen classes without labels. They typically align visual\nfeatures with semantic features obtained from word embedding by the supervision\nof seen classes' annotations. However, point cloud contains limited information\nto fully match with semantic features. In fact, the rich appearance information\nof images is a natural complement to the textureless point cloud, which is not\nwell explored in previous literature. 
Motivated by this, we propose a novel\nmulti-modal zero-shot learning method to better utilize the complementary\ninformation of point clouds and images for more accurate visual-semantic\nalignment. Extensive experiments are performed in two popular benchmarks, i.e.,\nSemanticKITTI and nuScenes, and our method outperforms current SOTA methods\nwith 52% and 49% improvement on average for unseen class mIoU, respectively.\n","authors":["Yuhang Lu","Qi Jiang","Runnan Chen","Yuenan Hou","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2307.10782v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10780v1","updated":"2023-07-20T11:30:12Z","published":"2023-07-20T11:30:12Z","title":"Learned Thresholds Token Merging and Pruning for Vision Transformers","summary":" Vision transformers have demonstrated remarkable success in a wide range of\ncomputer vision tasks over the last years. However, their high computational\ncosts remain a significant barrier to their practical deployment. In\nparticular, the complexity of transformer models is quadratic with respect to\nthe number of input tokens. Therefore techniques that reduce the number of\ninput tokens that need to be processed have been proposed. This paper\nintroduces Learned Thresholds token Merging and Pruning (LTMP), a novel\napproach that leverages the strengths of both token merging and token pruning.\nLTMP uses learned threshold masking modules that dynamically determine which\ntokens to merge and which to prune. We demonstrate our approach with extensive\nexperiments on vision transformers on the ImageNet classification task. Our\nresults demonstrate that LTMP achieves state-of-the-art accuracy across\nreduction rates while requiring only a single fine-tuning epoch, which is an\norder of magnitude faster than previous methods. Code is available at\nhttps://github.com/Mxbonn/ltmp .\n","authors":["Maxim Bonnaerens","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2307.10780v1.pdf","comment":"Paper to be presented at Efficient Systems for Foundation Models\n Workshop at the International Conference on Machine Learning (ICML) 2023"},{"id":"http://arxiv.org/abs/2307.10776v1","updated":"2023-07-20T11:24:55Z","published":"2023-07-20T11:24:55Z","title":"Urban Radiance Field Representation with Deformable Neural Mesh\n Primitives","summary":" Neural Radiance Fields (NeRFs) have achieved great success in the past few\nyears. However, most current methods still require intensive resources due to\nray marching-based rendering. To construct urban-level radiance fields\nefficiently, we design Deformable Neural Mesh Primitive~(DNMP), and propose to\nparameterize the entire scene with such primitives. The DNMP is a flexible and\ncompact neural variant of classic mesh representation, which enjoys both the\nefficiency of rasterization-based rendering and the powerful neural\nrepresentation capability for photo-realistic image synthesis. Specifically, a\nDNMP consists of a set of connected deformable mesh vertices with paired vertex\nfeatures to parameterize the geometry and radiance information of a local area.\nTo constrain the degree of freedom for optimization and lower the storage\nbudgets, we enforce the shape of each primitive to be decoded from a relatively\nlow-dimensional latent space. The rendering colors are decoded from the vertex\nfeatures (interpolated with rasterization) by a view-dependent MLP. 
The DNMP\nprovides a new paradigm for urban-level scene representation with appealing\nproperties: $(1)$ High-quality rendering. Our method achieves leading\nperformance for novel view synthesis in urban scenarios. $(2)$ Low\ncomputational costs. Our representation enables fast rendering (2.07ms/1k\npixels) and low peak memory usage (110MB/1k pixels). We also present a\nlightweight version that can run 33$\\times$ faster than vanilla NeRFs, and\ncomparable to the highly-optimized Instant-NGP (0.61 vs 0.71ms/1k pixels).\nProject page: \\href{https://dnmp.github.io/}{https://dnmp.github.io/}.\n","authors":["Fan Lu","Yan Xu","Guang Chen","Hongsheng Li","Kwan-Yee Lin","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.10776v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.10768v1","updated":"2023-07-20T10:57:02Z","published":"2023-07-20T10:57:02Z","title":"Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of\n Working Memory","summary":" Working memory (WM), a fundamental cognitive process facilitating the\ntemporary storage, integration, manipulation, and retrieval of information,\nplays a vital role in reasoning and decision-making tasks. Robust benchmark\ndatasets that capture the multifaceted nature of WM are crucial for the\neffective development and evaluation of AI WM models. Here, we introduce a\ncomprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM\ncomprises 10 tasks and a total of 1 million trials, assessing 4\nfunctionalities, 3 domains, and 11 behavioral and neural characteristics of WM.\nWe jointly trained and tested state-of-the-art recurrent neural networks and\ntransformers on all these tasks. We also include human behavioral benchmarks as\nan upper bound for comparison. Our results suggest that AI models replicate\nsome characteristics of WM in the brain, most notably primacy and recency\neffects, and neural clusters and correlates specialized for different domains\nand functionalities of WM. In the experiments, we also reveal some limitations\nin existing models to approximate human behavior. This dataset serves as a\nvaluable resource for communities in cognitive psychology, neuroscience, and\nAI, offering a standardized framework to compare and enhance WM models,\ninvestigate WM's neural underpinnings, and develop WM models with human-like\ncapabilities. Our source code and data are available at\nhttps://github.com/ZhangLab-DeepNeuroCogLab/WorM.\n","authors":["Ankur Sikarwar","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10763v1","updated":"2023-07-20T10:53:12Z","published":"2023-07-20T10:53:12Z","title":"MSQNet: Actor-agnostic Action Recognition with Multi-modal Query","summary":" Existing action recognition methods are typically actor-specific due to the\nintrinsic topological and apparent differences among the actors. This requires\nactor-specific pose estimation (e.g., humans vs. animals), leading to\ncumbersome model design complexity and high maintenance costs. Moreover, they\noften focus on learning the visual modality alone and single-label\nclassification whilst neglecting other available information sources (e.g.,\nclass name text) and the concurrent occurrence of multiple actions. To overcome\nthese limitations, we propose a new approach called 'actor-agnostic multi-modal\nmulti-label action recognition,' which offers a unified solution for various\ntypes of actors, including humans and animals. 
We further formulate a novel\nMulti-modal Semantic Query Network (MSQNet) model in a transformer-based object\ndetection framework (e.g., DETR), characterized by leveraging visual and\ntextual modalities to represent the action classes better. The elimination of\nactor-specific model designs is a key advantage, as it removes the need for\nactor pose estimation altogether. Extensive experiments on five publicly\navailable benchmarks show that our MSQNet consistently outperforms the prior\nart of actor-specific alternatives on human and animal single- and multi-label\naction recognition tasks by up to 50%. Code will be released at\nhttps://github.com/mondalanindya/MSQNet.\n","authors":["Anindya Mondal","Sauradip Nag","Joaquin M Prada","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10753v1","updated":"2023-07-20T10:29:48Z","published":"2023-07-20T10:29:48Z","title":"LBL: Logarithmic Barrier Loss Function for One-class Classification","summary":" One-class classification (OCC) aims to train a classifier only with the\ntarget class data and attracts great attention for its strong applicability in\nreal-world applications. Although a lot of advances have been made in OCC,\neffective OCC loss functions for deep learning are still lacking. In this\npaper, a novel logarithmic barrier function based OCC loss (LBL), which assigns\nlarge gradients to the margin samples and thus derives a more compact\nhypersphere, is first proposed by smoothly approximating the OCC objective.\nHowever, the optimization of LBL may be unstable, especially when samples lie\non the boundary, leading to an infinite loss. To address this issue, a\nunilateral relaxation Sigmoid function is introduced into LBL and a novel OCC\nloss named LBLSig is proposed. The LBLSig can be seen as the fusion of the mean\nsquare error (MSE) and the cross entropy (CE), and the optimization of LBLSig\nis smoother owing to the unilateral relaxation Sigmoid function. The\neffectiveness of the proposed LBL and LBLSig is experimentally demonstrated in\ncomparisons with several state-of-the-art OCC algorithms on different network\nstructures.\nThe source code can be found at https://github.com/ML-HDU/LBL_LBLSig.\n","authors":["Tianlei Wang","Dekang Liu","Wandong Zhang","Jiuwen Cao"],"pdf_url":"https://arxiv.org/pdf/2307.10753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13960v2","updated":"2023-07-20T10:26:56Z","published":"2023-06-24T13:29:54Z","title":"Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis","summary":" Regular group convolutional neural networks (G-CNNs) have been shown to\nincrease model performance and improve equivariance to different geometrical\nsymmetries. This work addresses the problem of SE(3), i.e., roto-translation\nequivariance, on volumetric data. Volumetric image data is prevalent in many\nmedical settings. Motivated by the recent work on separable group convolutions,\nwe devise an SE(3) group convolution kernel separated into a continuous SO(3)\n(rotation) kernel and a spatial kernel. We approximate equivariance to the\ncontinuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel\nis parameterized via RBF interpolation on similarly uniform grids. We\ndemonstrate the advantages of our approach in volumetric medical image\nanalysis. 
Our SE(3) equivariant models consistently outperform CNNs and regular\ndiscrete G-CNNs on challenging medical classification tasks and show\nsignificantly improved generalization capabilities. Our approach achieves up to\na 16.5% gain in accuracy over regular CNNs.\n","authors":["Thijs P. Kuipers","Erik J. Bekkers"],"pdf_url":"https://arxiv.org/pdf/2306.13960v2.pdf","comment":"10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated\n version to camera ready version 1"},{"id":"http://arxiv.org/abs/2307.10745v1","updated":"2023-07-20T10:16:03Z","published":"2023-07-20T10:16:03Z","title":"EdgeAL: An Edge Estimation Based Active Learning Approach for OCT\n Segmentation","summary":" Active learning algorithms have become increasingly popular for training\nmodels with limited data. However, selecting data for annotation remains a\nchallenging problem due to the limited information available on unseen data. To\naddress this issue, we propose EdgeAL, which utilizes the edge information of\nunseen images as {\\it a priori} information for measuring uncertainty. The\nuncertainty is quantified by analyzing the divergence and entropy in model\npredictions across edges. This measure is then used to select superpixels for\nannotation. We demonstrate the effectiveness of EdgeAL on multi-class Optical\nCoherence Tomography (OCT) segmentation tasks, where we achieved a 99% dice\nscore while reducing the annotation label cost to 12%, 2.3%, and 3%,\nrespectively, on three publicly available datasets (Duke, AROI, and UMN). The\nsource code is available at \\url{https://github.com/Mak-Ta-Reque/EdgeAL}\n","authors":["Md Abdul Kadir","Hasan Md Tusfiqur Alam","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2307.10745v1.pdf","comment":"This version of the contribution has been accepted for publication,\n after peer review (when applicable) but is not the Version of Record and does\n not reflect post-acceptance improvements, or any corrections. Use of this\n Accepted Version is subject to the publisher's Accepted Manuscript terms of\n use\n https://www.springernature.com/gp/open-research/policies/accepted-manuscript-terms"},{"id":"http://arxiv.org/abs/2307.02347v3","updated":"2023-07-20T09:54:41Z","published":"2023-07-05T15:03:10Z","title":"Detecting Images Generated by Deep Diffusion Models using their Local\n Intrinsic Dimensionality","summary":" Diffusion models recently have been successfully applied for the visual\nsynthesis of strikingly realistic appearing images. This raises strong concerns\nabout their potential for malicious purposes. In this paper, we propose using\nthe lightweight multi Local Intrinsic Dimensionality (multiLID), which has been\noriginally developed in context of the detection of adversarial examples, for\nthe automatic detection of synthetic images and the identification of the\naccording generator networks. In contrast to many existing detection\napproaches, which often only work for GAN-generated images, the proposed method\nprovides close to perfect detection results in many realistic use cases.\nExtensive experiments on known and newly created datasets demonstrate that the\nproposed multiLID approach exhibits superiority in diffusion detection and\nmodel identification. 
Since the empirical evaluations of recent publications on\nthe detection of generated images are often mainly focused on the\n\"LSUN-Bedroom\" dataset, we further establish a comprehensive benchmark for the\ndetection of diffusion-generated images, including samples from several\ndiffusion models with different image sizes.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.02347v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01091v2","updated":"2023-07-20T09:40:13Z","published":"2023-07-03T15:09:32Z","title":"UW-ProCCaps: UnderWater Progressive Colourisation with Capsules","summary":" Underwater images are fundamental for studying and understanding the status\nof marine life. We focus on reducing the memory space required for image\nstorage, since the memory consumed during the collection phase limits how long\nthat phase can last, leading to the need for more image collection campaigns.\nWe present a novel machine-learning model that reconstructs the\ncolours of underwater images from their luminescence channel, thus saving 2/3\nof the available storage space. Our model specialises in underwater colour\nreconstruction and consists of an encoder-decoder architecture. The encoder is\ncomposed of a convolutional encoder and a parallel specialised classifier\ntrained with webly-supervised data. The encoder and the decoder use layers of\ncapsules to capture the features of the entities in the image. The colour\nreconstruction process recalls the progressive and the generative adversarial\ntraining procedures. The progressive training lays the ground for a generative\nadversarial routine focused on refining the colours, giving the image bright\nand saturated colours which bring the image back to life. We validate the model\nboth qualitatively and quantitatively on four benchmark datasets. This is the\nfirst attempt at colour reconstruction in greyscale underwater images.\nExtensive results on four benchmark datasets demonstrate that our solution\noutperforms state-of-the-art (SOTA) solutions. We also demonstrate that the\ngenerated colourisation enhances the quality of images compared to enhancement\nmodels at the SOTA.\n","authors":["Rita Pucci","Niki Martinel"],"pdf_url":"https://arxiv.org/pdf/2307.01091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10713v1","updated":"2023-07-20T09:13:32Z","published":"2023-07-20T09:13:32Z","title":"Kick Back & Relax: Learning to Reconstruct the World by Watching SlowTV","summary":" Self-supervised monocular depth estimation (SS-MDE) has the potential to\nscale to vast quantities of data. Unfortunately, existing approaches limit\nthemselves to the automotive domain, resulting in models incapable of\ngeneralizing to complex environments such as natural or indoor settings.\n To address this, we propose a large-scale SlowTV dataset curated from\nYouTube, containing an order of magnitude more data than existing automotive\ndatasets. SlowTV contains 1.7M images from a rich diversity of environments,\nsuch as worldwide seasonal hiking, scenic driving and scuba diving. Using this\ndataset, we train an SS-MDE model that provides zero-shot generalization to a\nlarge collection of indoor/outdoor datasets. The resulting model outperforms\nall existing SSL approaches and closes the gap on supervised SoTA, despite\nusing a more efficient architecture.\n We additionally introduce a collection of best-practices to further maximize\nperformance and zero-shot generalization. 
This includes 1) aspect ratio\naugmentation, 2) camera intrinsic estimation, 3) support frame randomization\nand 4) flexible motion estimation. Code is available at\nhttps://github.com/jspenmar/slowtv_monodepth.\n","authors":["Jaime Spencer","Chris Russell","Simon Hadfield","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2307.10713v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.10711v1","updated":"2023-07-20T09:06:21Z","published":"2023-07-20T09:06:21Z","title":"AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of\n Diffusion Probabilistic Models","summary":" Existing customization methods require access to multiple reference examples\nto align pre-trained diffusion probabilistic models (DPMs) with user-provided\nconcepts. This paper aims to address the challenge of DPM customization when\nthe only available supervision is a differentiable metric defined on the\ngenerated contents. Since the sampling procedure of DPMs involves recursive\ncalls to the denoising UNet, na\\\"ive gradient backpropagation requires storing\nthe intermediate states of all iterations, resulting in extremely high memory\nconsumption. To overcome this issue, we propose a novel method AdjointDPM,\nwhich first generates new samples from diffusion models by solving the\ncorresponding probability-flow ODEs. It then uses the adjoint sensitivity\nmethod to backpropagate the gradients of the loss to the models' parameters\n(including conditioning signals, network weights, and initial noises) by\nsolving another augmented ODE. To reduce numerical errors in both the forward\ngeneration and gradient backpropagation processes, we further reparameterize\nthe probability-flow ODE and augmented ODE as simple non-stiff ODEs using\nexponential integration. Finally, we demonstrate the effectiveness of\nAdjointDPM on three interesting tasks: converting visual effects into\nidentification text embeddings, finetuning DPMs for specific types of\nstylization, and optimizing initial noise to generate adversarial samples for\nsecurity auditing.\n","authors":["Jiachun Pan","Hanshu Yan","Jun Hao Liew","Vincent Y. F. Tan","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2307.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10552v2","updated":"2023-07-20T08:57:20Z","published":"2022-06-21T17:33:53Z","title":"Vicinity Vision Transformer","summary":" Vision transformers have shown great success on numerous computer vision\ntasks. However, its central component, softmax attention, prohibits vision\ntransformers from scaling up to high-resolution images, due to both the\ncomputational complexity and memory footprint being quadratic. Although linear\nattention was introduced in natural language processing (NLP) tasks to mitigate\na similar issue, directly applying existing linear attention to vision\ntransformers may not lead to satisfactory results. We investigate this problem\nand find that computer vision tasks focus more on local information compared\nwith NLP tasks. Based on this observation, we present a Vicinity Attention that\nintroduces a locality bias to vision transformers with linear complexity.\nSpecifically, for each image patch, we adjust its attention weight based on its\n2D Manhattan distance measured by its neighbouring patches. 
In this case, the\nneighbouring patches will receive stronger attention than far-away patches.\nMoreover, since our Vicinity Attention requires the token length to be much\nlarger than the feature dimension to show its efficiency advantages, we further\npropose a new Vicinity Vision Transformer (VVT) structure to reduce the feature\ndimension without degenerating the accuracy. We perform extensive experiments\non the CIFAR100, ImageNet1K, and ADE20K datasets to validate the effectiveness\nof our method. Our method has a slower growth rate of GFlops than previous\ntransformer-based and convolution-based networks when the input resolution\nincreases. In particular, our approach achieves state-of-the-art image\nclassification accuracy with 50% fewer parameters than previous methods.\n","authors":["Weixuan Sun","Zhen Qin","Hui Deng","Jianyuan Wang","Yi Zhang","Kaihao Zhang","Nick Barnes","Stan Birchfield","Lingpeng Kong","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2206.10552v2.pdf","comment":"code: https://github.com/OpenNLPLab/Vicinity-Vision-Transformer"},{"id":"http://arxiv.org/abs/2307.10705v1","updated":"2023-07-20T08:53:47Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. Code is available:\nurl{https://github.com/chequanghuy/TwinLiteNet}.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10165v2","updated":"2023-07-20T08:53:13Z","published":"2023-07-19T17:46:55Z","title":"Drone navigation and license place detection for vehicle location in\n indoor spaces","summary":" Millions of vehicles are transported every year, tightly parked in vessels or\nboats. To reduce the risks of associated safety issues like fires, knowing the\nlocation of vehicles is essential, since different vehicles may need different\nmitigation measures, e.g. electric cars. This work is aimed at creating a\nsolution based on a nano-drone that navigates across rows of parked vehicles\nand detects their license plates. We do so via a wall-following algorithm, and\na CNN trained to detect license plates. 
All computations are done in real-time\non the drone, which just sends position and detected images that allow the\ncreation of a 2D map with the position of the plates. Our solution is capable\nof reading all plates across eight test cases (with several rows of plates,\ndifferent drone speeds, or low light) by aggregation of measurements across\nseveral drone journeys.\n","authors":["Moa Arvidsson","Sithichot Sawirot","Cristofer Englund","Fernando Alonso-Fernandez","Martin Torstensson","Boris Duran"],"pdf_url":"https://arxiv.org/pdf/2307.10165v2.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR 2023"},{"id":"http://arxiv.org/abs/2205.09753v2","updated":"2023-07-20T08:41:46Z","published":"2022-04-30T07:08:30Z","title":"HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory\n Prediction via Scene Encoding","summary":" Encoding a driving scene into vector representations has been an essential\ntask for autonomous driving that can benefit downstream tasks, e.g., trajectory\nprediction. The driving scene often involves heterogeneous elements such as the\ndifferent types of objects (agents, lanes, traffic signs) and the semantic\nrelations between objects are rich and diverse. Meanwhile, there also exists\nrelativity across elements, which means that the spatial relation is a relative\nconcept and needs to be encoded in an ego-centric manner instead of in a global\ncoordinate system. Based on these observations, we propose Heterogeneous\nDriving Graph Transformer (HDGT), a backbone modelling the driving scene as a\nheterogeneous graph with different types of nodes and edges. For heterogeneous\ngraph construction, we connect different types of nodes according to diverse\nsemantic relations. For spatial relation encoding, the coordinates of the node\nas well as its in-edges are in the local node-centric coordinate system. For\nthe aggregation module in the graph neural network (GNN), we adopt the\ntransformer structure in a hierarchical way to fit the heterogeneous nature of\ninputs. Experimental results show that HDGT achieves state-of-the-art\nperformance for the task of trajectory prediction, on INTERACTION Prediction\nChallenge and Waymo Open Motion Challenge.\n","authors":["Xiaosong Jia","Penghao Wu","Li Chen","Yu Liu","Hongyang Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2205.09753v2.pdf","comment":"Accepted at IEEE TPAMI in 2023. Code url:\n https://github.com/OpenDriveLab/HDGT"},{"id":"http://arxiv.org/abs/2307.10698v1","updated":"2023-07-20T08:39:20Z","published":"2023-07-20T08:39:20Z","title":"Reverse Knowledge Distillation: Training a Large Model using a Small One\n for Retinal Image Matching on Limited Data","summary":" Retinal image matching plays a crucial role in monitoring disease progression\nand treatment response. However, datasets with matched keypoints between\ntemporally separated pairs of images are not available in abundance to train\ntransformer-based models. We propose a novel approach based on reverse knowledge\ndistillation to train large models with limited data while preventing\noverfitting. Firstly, we propose architectural modifications to a CNN-based\nsemi-supervised method called SuperRetina that help us improve its results on a\npublicly available dataset. 
Then, we train a computationally heavier model\nbased on a vision transformer encoder using the lighter CNN-based model, which\nis counter-intuitive in the field of knowledge-distillation research, where\ntraining lighter models based on heavier ones is the norm. Surprisingly, such\nreverse knowledge distillation improves generalization even further. Our\nexperiments suggest that high-dimensional fitting in representation space may\nprevent overfitting, unlike training directly to match the final output. We also\nprovide a public dataset with annotations for retinal image keypoint detection\nand matching to help the research community develop algorithms for retinal\nimage applications.\n","authors":["Sahar Almahfouz Nasser","Nihar Gupte","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2307.10698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10697v1","updated":"2023-07-20T08:38:50Z","published":"2023-07-20T08:38:50Z","title":"SqueezerFaceNet: Reducing a Small Face Recognition CNN Even More Via\n Filter Pruning","summary":" The widespread use of mobile devices for various digital services has created\na need for reliable and real-time person authentication. In this context,\nfacial recognition technologies have emerged as a dependable method for\nverifying users due to the prevalence of cameras in mobile devices and their\nintegration into everyday applications. The rapid advancement of deep\nConvolutional Neural Networks (CNNs) has led to numerous face verification\narchitectures. However, these models are often large and impractical for mobile\napplications, reaching sizes of hundreds of megabytes with millions of\nparameters. We address this issue by developing SqueezerFaceNet, a light face\nrecognition network with less than 1M parameters. This is achieved by applying\na network pruning method based on Taylor scores, where filters with small\nimportance scores are removed iteratively. Starting from an already small\nnetwork (of 1.24M parameters) based on SqueezeNet, we show that it can be\nfurther reduced (up to 40%) without an appreciable loss in performance. To the\nbest of our knowledge, we are the first to evaluate network pruning methods for\nthe task of face recognition.\n","authors":["Fernando Alonso-Fernandez","Kevin Hernandez-Diaz","Jose Maria Buades Rubio","Josef Bigun"],"pdf_url":"https://arxiv.org/pdf/2307.10697v1.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR 2023"},{"id":"http://arxiv.org/abs/2307.10696v1","updated":"2023-07-20T08:38:15Z","published":"2023-07-20T08:38:15Z","title":"SLPD: Slide-level Prototypical Distillation for WSIs","summary":" Improving the feature representation ability is the foundation of many whole\nslide pathological image (WSI) tasks. Recent works have achieved great success\nin pathological-specific self-supervised learning (SSL). However, most of them\nonly focus on learning patch-level representations, thus there is still a gap\nbetween pretext and slide-level downstream tasks, e.g., subtyping, grading and\nstaging. Aiming towards slide-level representations, we propose Slide-Level\nPrototypical Distillation (SLPD) to explore intra- and inter-slide semantic\nstructures for context modeling on WSIs. Specifically, we iteratively perform\nintra-slide clustering for the regions (4096x4096 patches) within each WSI to\nyield the prototypes and encourage the region representations to be closer to\nthe assigned prototypes. 
By representing each slide with its prototypes, we\nfurther select similar slides by the set distance of prototypes and assign the\nregions by cross-slide prototypes for distillation. SLPD achieves\nstate-of-the-art results on multiple slide-level benchmarks and demonstrates\nthat representation learning of semantic structures of slides can make a\nsuitable proxy task for WSI analysis. Code will be available at\nhttps://github.com/Carboxy/SLPD.\n","authors":["Zhimiao Yu","Tiancheng Lin","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2307.10696v1.pdf","comment":"International Conference on Medical Image Computing and Computer\n Assisted Intervention (MICCAI)"},{"id":"http://arxiv.org/abs/2307.10695v1","updated":"2023-07-20T08:38:01Z","published":"2023-07-20T08:38:01Z","title":"Self2Self+: Single-Image Denoising with Self-Supervised Learning and\n Image Quality Assessment Loss","summary":" Recently, denoising methods based on supervised learning have exhibited\npromising performance. However, their reliance on external datasets containing\nnoisy-clean image pairs restricts their applicability. To address this\nlimitation, researchers have focused on training denoising networks using\nsolely a set of noisy inputs. To improve the feasibility of denoising\nprocedures, in this study, we proposed a single-image self-supervised learning\nmethod in which only the noisy input image is used for network training. Gated\nconvolution was used for feature extraction and no-reference image quality\nassessment was used for guiding the training process. Moreover, the proposed\nmethod sampled instances from the input image dataset using Bernoulli sampling\nwith a certain dropout rate for training. The corresponding result was produced\nby averaging the generated predictions from various instances of the trained\nnetwork with dropouts. The experimental results indicated that the proposed\nmethod achieved state-of-the-art denoising performance on both synthetic and\nreal-world datasets. This highlights the effectiveness and practicality of our\nmethod as a potential solution for various noise removal tasks.\n","authors":["Jaekyun Ko","Sanghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10695v1.pdf","comment":"Technical report and supplemantry materials are combined into one\n paper. - Technical report: Page 1~7 - Supplemantry materials : Page 8~18"},{"id":"http://arxiv.org/abs/2302.08292v3","updated":"2023-07-20T08:35:26Z","published":"2023-02-16T13:41:19Z","title":"Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation\n for autonomous vehicles","summary":" Autonomous driving (AD) perception today relies heavily on deep learning\nbased architectures requiring large scale annotated datasets with their\nassociated costs for curation and annotation. The 3D semantic data are useful\nfor core perception tasks such as obstacle detection and ego-vehicle\nlocalization. We propose a new dataset, Navya 3D Segmentation (Navya3DSeg),\nwith a diverse label space corresponding to a large scale production grade\noperational domain, including rural, urban, industrial sites and universities\nfrom 13 countries. It contains 23 labeled sequences and 25 supplementary\nsequences without labels, designed to explore self-supervised and\nsemi-supervised semantic segmentation benchmarks on point clouds. 
We also\npropose a novel method for sequential dataset split generation based on\niterative multi-label stratification, and demonstrated to achieve a +1.2% mIoU\nimprovement over the original split proposed by SemanticKITTI dataset. A\ncomplete benchmark for semantic segmentation task was performed, with state of\nthe art methods. Finally, we demonstrate an Active Learning (AL) based dataset\ndistillation framework. We introduce a novel heuristic-free sampling method\ncalled ego-pose distance based sampling in the context of AL. A detailed\npresentation on the dataset is available here\nhttps://www.youtube.com/watch?v=5m6ALIs-s20.\n","authors":["Alexandre Almin","Léo Lemarié","Anh Duong","B Ravi Kiran"],"pdf_url":"https://arxiv.org/pdf/2302.08292v3.pdf","comment":"Accepted version to IEEE RA-L. Version with supplementary materials"},{"id":"http://arxiv.org/abs/2307.10685v1","updated":"2023-07-20T08:25:38Z","published":"2023-07-20T08:25:38Z","title":"Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged\n Object Detection","summary":" Camouflaged object detection (COD), aiming to segment camouflaged objects\nwhich exhibit similar patterns with the background, is a challenging task. Most\nexisting works are dedicated to establishing specialized modules to identify\ncamouflaged objects with complete and fine details, while the boundary can not\nbe well located for the lack of object-related semantics. In this paper, we\npropose a novel ``pre-train, adapt and detect\" paradigm to detect camouflaged\nobjects. By introducing a large pre-trained model, abundant knowledge learned\nfrom massive multi-modal data can be directly transferred to COD. A lightweight\nparallel adapter is inserted to adjust the features suitable for the downstream\nCOD task. Extensive experiments on four challenging benchmark datasets\ndemonstrate that our method outperforms existing state-of-the-art COD models by\nlarge margins. Moreover, we design a multi-task learning scheme for tuning the\nadapter to exploit the shareable knowledge across different semantic classes.\nComprehensive experimental results showed that the generalization ability of\nour model can be substantially improved with multi-task adapter initialization\non source tasks and multi-task adaptation on target tasks.\n","authors":["Yinghui Xing","Dexuan Kong","Shizhou Zhang","Geng Chen","Lingyan Ran","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. 
Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.05921v3","updated":"2023-07-20T08:14:17Z","published":"2023-07-12T05:36:47Z","title":"Reading Radiology Imaging Like The Radiologist","summary":" Automated radiology report generation aims to generate radiology reports that\ncontain rich, fine-grained descriptions of radiology imaging. Compared with\nimage captioning in the natural image domain, medical images are very similar\nto each other, with only minor differences in the occurrence of diseases. Given\nthe importance of these minor differences in the radiology report, it is\ncrucial to encourage the model to focus more on the subtle regions of disease\noccurrence. Secondly, the problem of visual and textual data biases is serious.\nNot only do normal cases make up the majority of the dataset, but sentences\ndescribing areas with pathological changes also constitute only a small part of\nthe paragraph. Lastly, generating medical image reports involves the challenge\nof long text generation, which requires more expertise and empirical training\nin medical knowledge. As a result, the difficulty of generating such reports is\nincreased. To address these challenges, we propose a disease-oriented retrieval\nframework that utilizes similar reports as prior knowledge references. We\ndesign a factual consistency captioning generator to generate more accurate and\nfactually consistent disease descriptions. Our framework can find most similar\nreports for a given disease from the CXR database by retrieving a\ndisease-oriented mask consisting of the position and morphological\ncharacteristics. By referencing the disease-oriented similar report and the\nvisual features, the factual consistency model can generate a more accurate\nradiology report.\n","authors":["Yuhao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.05921v3.pdf","comment":"There are data writing errors in the paper"},{"id":"http://arxiv.org/abs/2307.10677v1","updated":"2023-07-20T07:57:14Z","published":"2023-07-20T07:57:14Z","title":"Deep learning for classification of noisy QR codes","summary":" We wish to define the limits of a classical classification model based on\ndeep learning when applied to abstract images, which do not represent visually\nidentifiable objects.QR codes (Quick Response codes) fall into this category of\nabstract images: one bit corresponding to one encoded character, QR codes were\nnot designed to be decoded manually. To understand the limitations of a deep\nlearning-based model for abstract image classification, we train an image\nclassification model on QR codes generated from information obtained when\nreading a health pass. We compare a classification model with a classical\n(deterministic) decoding method in the presence of noise. This study allows us\nto conclude that a model based on deep learning can be relevant for the\nunderstanding of abstract images.\n","authors":["Rebecca Leygonie","Sylvain Lobry"," )","Laurent Wendling (LIPADE)"],"pdf_url":"https://arxiv.org/pdf/2307.10677v1.pdf","comment":"in French language. 
RFIAP 2022 - Reconnaissance des Formes, Image,\n Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France"},{"id":"http://arxiv.org/abs/2307.10667v1","updated":"2023-07-20T07:47:48Z","published":"2023-07-20T07:47:48Z","title":"Efficient Unified Demosaicing for Bayer and Non-Bayer Patterned Image\n Sensors","summary":" As the physical size of recent CMOS image sensors (CIS) gets smaller, the\nlatest mobile cameras are adopting unique non-Bayer color filter array (CFA)\npatterns (e.g., Quad, Nona, QxQ), which consist of homogeneous color units with\nadjacent pixels. These non-Bayer sensors are superior to conventional Bayer CFA\nthanks to their changeable pixel-bin sizes for different light conditions but\nmay introduce visual artifacts during demosaicing due to their inherent pixel\npattern structures and sensor hardware characteristics. Previous demosaicing\nmethods have primarily focused on Bayer CFA, necessitating distinct\nreconstruction methods for non-Bayer patterned CIS with various CFA modes under\ndifferent lighting conditions. In this work, we propose an efficient unified\ndemosaicing method that can be applied to both conventional Bayer RAW and\nvarious non-Bayer CFAs' RAW data in different operation modes. Our Knowledge\nLearning-based demosaicing model for Adaptive Patterns, namely KLAP, utilizes\nCFA-adaptive filters for only 1% key filters in the network for each CFA, but\nstill manages to effectively demosaic all the CFAs, yielding comparable\nperformance to the large-scale models. Furthermore, by employing meta-learning\nduring inference (KLAP-M), our model is able to eliminate unknown\nsensor-generic artifacts in real RAW data, effectively bridging the gap between\nsynthetic images and real sensor RAW. Our KLAP and KLAP-M methods achieved\nstate-of-the-art demosaicing performance in both synthetic and real RAW data of\nBayer and non-Bayer CFAs.\n","authors":["Haechang Lee","Dongwon Park","Wongi Jeong","Kijeong Kim","Hyunwoo Je","Dongil Ryu","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2307.10667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10664v1","updated":"2023-07-20T07:46:34Z","published":"2023-07-20T07:46:34Z","title":"Lighting up NeRF via Unsupervised Decomposition and Enhancement","summary":" Neural Radiance Field (NeRF) is a promising approach for synthesizing novel\nviews, given a set of images and the corresponding camera poses of a scene.\nHowever, images photographed from a low-light scene can hardly be used to train\na NeRF model to produce high-quality results, due to their low pixel\nintensities, heavy noise, and color distortion. Combining existing low-light\nimage enhancement methods with NeRF methods also does not work well due to the\nview inconsistency caused by the individual 2D enhancement process. In this\npaper, we propose a novel approach, called Low-Light NeRF (or LLNeRF), to\nenhance the scene representation and synthesize normal-light novel views\ndirectly from sRGB low-light images in an unsupervised manner. The core of our\napproach is a decomposition of radiance field learning, which allows us to\nenhance the illumination, reduce noise and correct the distorted colors jointly\nwith the NeRF optimization process. Our method is able to produce novel view\nimages with proper lighting and vivid colors and details, given a collection of\ncamera-finished low dynamic range (8-bits/channel) images from a low-light\nscene. 
Experiments demonstrate that our method outperforms existing low-light\nenhancement methods and NeRF methods.\n","authors":["Haoyuan Wang","Xiaogang Xu","Ke Xu","Rynson WH. Lau"],"pdf_url":"https://arxiv.org/pdf/2307.10664v1.pdf","comment":"ICCV 2023. Project website: https://whyy.site/paper/llnerf"},{"id":"http://arxiv.org/abs/2306.16997v2","updated":"2023-07-20T07:29:03Z","published":"2023-06-29T14:54:10Z","title":"Unsupervised 3D registration through optimization-guided cyclical\n self-training","summary":" State-of-the-art deep learning-based registration methods employ three\ndifferent learning strategies: supervised learning, which requires costly\nmanual annotations, unsupervised learning, which heavily relies on hand-crafted\nsimilarity metrics designed by domain experts, or learning from synthetic data,\nwhich introduces a domain shift. To overcome the limitations of these\nstrategies, we propose a novel self-supervised learning paradigm for\nunsupervised registration, relying on self-training. Our idea is based on two\nkey insights. Feature-based differentiable optimizers 1) perform reasonable\nregistration even from random features and 2) stabilize the training of the\npreceding feature extraction network on noisy labels. Consequently, we propose\ncyclical self-training, where pseudo labels are initialized as the displacement\nfields inferred from random features and cyclically updated based on more and\nmore expressive features from the learning feature extractor, yielding a\nself-reinforcement effect. We evaluate the method for abdomen and lung\nregistration, consistently surpassing metric-based supervision and\noutperforming diverse state-of-the-art competitors. Source code is available at\nhttps://github.com/multimodallearning/reg-cyclical-self-train.\n","authors":["Alexander Bigalke","Lasse Hansen","Tony C. W. Mok","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2306.16997v2.pdf","comment":"accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10642v1","updated":"2023-07-20T07:12:56Z","published":"2023-07-20T07:12:56Z","title":"RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching\n Detection","summary":" The widespread use of face retouching filters on short-video platforms has\nraised concerns about the authenticity of digital appearances and the impact of\ndeceptive advertising. To address these issues, there is a pressing need to\ndevelop advanced face retouching techniques. However, the lack of large-scale\nand fine-grained face retouching datasets has been a major obstacle to progress\nin this field. In this paper, we introduce RetouchingFFHQ, a large-scale and\nfine-grained face retouching dataset that contains over half a million\nconditionally-retouched images. RetouchingFFHQ stands out from previous\ndatasets due to its large scale, high quality, fine-grainedness, and\ncustomization. By including four typical types of face retouching operations\nand different retouching levels, we extend the binary face retouching detection\ninto a fine-grained, multi-retouching type, and multi-retouching level\nestimation problem. Additionally, we propose a Multi-granularity Attention\nModule (MAM) as a plugin for CNN backbones for enhanced cross-scale\nrepresentation learning. Extensive experiments using different baselines as\nwell as our proposed method on RetouchingFFHQ show decent performance on face\nretouching detection. 
With the proposed new dataset, we believe there is great\npotential for future work to tackle the challenging problem of real-world\nfine-grained face retouching detection.\n","authors":["Qichao Ying","Jiaxin Liu","Sheng Li","Haisheng Xu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10642v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.10638v1","updated":"2023-07-20T07:08:24Z","published":"2023-07-20T07:08:24Z","title":"Quantized Feature Distillation for Network Quantization","summary":" Neural network quantization aims to accelerate and trim full-precision neural\nnetwork models by using low bit approximations. Methods adopting the\nquantization aware training (QAT) paradigm have recently seen a rapid growth,\nbut are often conceptually complicated. This paper proposes a novel and highly\neffective QAT method, quantized feature distillation (QFD). QFD first trains a\nquantized (or binarized) representation as the teacher, then quantize the\nnetwork using knowledge distillation (KD). Quantitative results show that QFD\nis more flexible and effective (i.e., quantization friendly) than previous\nquantization methods. QFD surpasses existing methods by a noticeable margin on\nnot only image classification but also object detection, albeit being much\nsimpler. Furthermore, QFD quantizes ViT and Swin-Transformer on MS-COCO\ndetection and segmentation, which verifies its potential in real world\ndeployment. To the best of our knowledge, this is the first time that vision\ntransformers have been quantized in object detection and image segmentation\ntasks.\n","authors":["Ke Zhu","Yin-Yin He","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10638v1.pdf","comment":"AAAI2023"},{"id":"http://arxiv.org/abs/2305.08396v3","updated":"2023-07-20T07:06:03Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Convolutional Neural Networks (CNNs) have made significant strides in medical\nimage analysis in recent years. However, the local nature of the convolution\noperator may pose a limitation for capturing global and long-range interactions\nin CNNs. Recently, Transformers have gained popularity in the computer vision\ncommunity and also medical image segmentation due to their ability to process\nglobal features effectively. The scalability issues of self-attention mechanism\nand lack of the CNN-like inductive bias may have limited their adoption.\nTherefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages\nof both Convolution and Self-attention Mechanisms, have gained importance. In\nthis work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with\nnominal computational burden. The inclusion of multi-axis self-attention,\nwithin each decoder stage, significantly enhances the discriminating capacity\nbetween the object and background regions, and thereby helps in improving the\nsegmentation efficiency. In the Hybrid Decoder block, the fusion process\ncommences by integrating the upsampled lower level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. 
The proposed decoder block\nis repeated multiple times to progressively segment the nuclei regions.\nExperimental results on the MoNuSeg18 and MoNuSAC20 datasets demonstrate the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10636v1","updated":"2023-07-20T07:04:16Z","published":"2023-07-20T07:04:16Z","title":"Learning and Evaluating Human Preferences for Conversational Head\n Generation","summary":" A reliable and comprehensive evaluation metric that aligns with manual\npreference assessments is crucial for conversational head video synthesis\nmethod development. Existing quantitative evaluations often fail to capture the\nfull complexity of human preference, as they only consider limited evaluation\ndimensions. Qualitative evaluations and user studies offer a solution but are\ntime-consuming and labor-intensive. This limitation hinders the advancement of\nconversational head generation algorithms and systems. In this paper, we\npropose a novel learning-based evaluation metric named Preference Score (PS)\nfor fitting human preference according to the quantitative evaluations across\ndifferent dimensions. PS can serve as a quantitative evaluation without the\nneed for human annotation. Experimental results validate the superiority of\nPreference Score in aligning with human perception, and also demonstrate\nrobustness and generalizability to unseen data, making it a valuable tool for\nadvancing conversational head generation. We expect this metric to facilitate\nnew advances in conversational head generation.\n","authors":["Mohan Zhou","Yalong Bai","Wei Zhang","Ting Yao","Tiejun Zhao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2307.10636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12384v2","updated":"2023-07-20T07:04:04Z","published":"2023-03-22T08:47:37Z","title":"RegFormer: An Efficient Projection-Aware Transformer Network for\n Large-Scale Point Cloud Registration","summary":" Although point cloud registration has achieved remarkable advances in\nobject-level and indoor scenes, large-scale registration methods are rarely\nexplored. Challenges mainly arise from the huge point number, complex\ndistribution, and outliers of outdoor LiDAR scans. In addition, most existing\nregistration works generally adopt a two-stage paradigm: they first find\ncorrespondences by extracting discriminative local features, and then leverage\nestimators (e.g., RANSAC) to filter outliers, which are highly dependent on\nwell-designed descriptors and post-processing choices. To address these\nproblems, we propose an end-to-end transformer network (RegFormer) for\nlarge-scale point cloud alignment without any further post-processing.\nSpecifically, a projection-aware hierarchical transformer is proposed to\ncapture long-range dependencies and filter outliers by extracting point\nfeatures globally. Our transformer has linear complexity, which guarantees high\nefficiency even for large-scale scenes. Furthermore, to effectively reduce\nmismatches, a bijective association transformer is designed for regressing the\ninitial transformation. 
Extensive experiments on the KITTI and NuScenes datasets\ndemonstrate that our RegFormer achieves competitive performance in terms of\nboth accuracy and efficiency.\n","authors":["Jiuming Liu","Guangming Wang","Zhe Liu","Chaokang Jiang","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12384v2.pdf","comment":"Accepted by ICCV2023. Codes will be released at\n https://github.com/IRMVLab/RegFormer"},{"id":"http://arxiv.org/abs/2307.10632v1","updated":"2023-07-20T06:58:11Z","published":"2023-07-20T06:58:11Z","title":"Parallelization of a new embedded application for automatic meteor\n detection","summary":" This article presents the methods used to parallelize a new computer vision\napplication. The system is able to automatically detect meteors from\nnon-stabilized cameras and noisy video sequences. The application is designed\nto be embedded in weather balloons or for airborne observation campaigns. Thus,\nthe final target is a low-power system-on-chip (< 10 Watts) while the software\nneeds to compute a stream of frames in real-time (> 25 frames per second). For\nthis, the application is first split into a task graph, and then different\nparallelization techniques are applied. Experimental results demonstrate the\nefficiency of the parallelization methods. For instance, on the Raspberry Pi 4\nand on an HD video sequence, the processing chain reaches 42 frames per second\nwhile it only consumes 6 Watts.\n","authors":["Mathuran Kandeepan","Clara Ciocan","Adrien Cassagne","Lionel Lacassagne"],"pdf_url":"https://arxiv.org/pdf/2307.10632v1.pdf","comment":"in French language, COMPAS 2023 - Conf{\\'e}rence francophone\n d'informatique en Parall{\\'e}lisme, Architecture et Syst{\\`e}me, Jul 2023,\n Annecy (France), France"},{"id":"http://arxiv.org/abs/2307.10625v1","updated":"2023-07-20T06:47:46Z","published":"2023-07-20T06:47:46Z","title":"Learning Discriminative Visual-Text Representation for Polyp\n Re-Identification","summary":" Colonoscopic Polyp Re-Identification aims to match a specific polyp in a\nlarge gallery with different cameras and views, which plays a key role in the\nprevention and treatment of colorectal cancer in computer-aided diagnosis.\nHowever, traditional methods mainly focus on visual representation\nlearning, while neglecting to explore the potential of semantic features during\ntraining, which may easily lead to poor generalization capability when adapting\nthe pretrained model to new scenarios. To relieve this dilemma, we\npropose a simple but effective training method named VT-ReID, which can\nremarkably enrich the representation of polyp videos with the interchange of\nhigh-level semantic information. Moreover, we elaborately design a novel\nclustering mechanism to introduce prior knowledge from textual data, which\nleverages contrastive learning to promote better separation from abundant\nunlabeled text data. To the best of our knowledge, this is the first attempt to\nemploy the visual-text feature with a clustering mechanism for colonoscopic\npolyp re-identification. 
Empirical results show that our method significantly\noutperforms current state-of-the-art methods by a clear margin.\n","authors":["Suncheng Xiang","Cang Liu","Sijia Du","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2307.10625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10624v1","updated":"2023-07-20T06:44:42Z","published":"2023-07-20T06:44:42Z","title":"Joint Skeletal and Semantic Embedding Loss for Micro-gesture\n Classification","summary":" In this paper, we briefly introduce the solution of our team HFUT-VUT for the\nMicro-gesture Classification in the MiGA challenge at IJCAI 2023. The\nmicro-gesture classification task aims at recognizing the action category of a\ngiven video based on the skeleton data. For this task, we propose a\n3D-CNNs-based micro-gesture recognition network, which incorporates a skeletal\nand semantic embedding loss to improve action classification performance.\nFinally, we rank 1st in the Micro-gesture Classification Challenge, surpassing\nthe second-place team in terms of Top-1 accuracy by 1.10%.\n","authors":["Kun Li","Dan Guo","Guoliang Chen","Xinge Peng","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10624v1.pdf","comment":"1st Place in Micro-gesture Classification sub-challenge in MiGA at\n IJCAI-2023"},{"id":"http://arxiv.org/abs/2211.14085v3","updated":"2023-07-20T06:42:56Z","published":"2022-11-25T13:14:33Z","title":"Positive unlabeled learning with tensor networks","summary":" Positive unlabeled learning is a binary classification problem with positive\nand unlabeled data. It is common in domains where negative labels are costly or\nimpossible to obtain, e.g., medicine and personalized advertising. Most\napproaches to positive unlabeled learning apply to specific data types (e.g.,\nimages, categorical data) and cannot generate new positive and negative\nsamples. This work introduces a feature-space distance-based tensor network\napproach to the positive unlabeled learning problem. The presented method is\nnot domain-specific and significantly improves the state-of-the-art results on\nthe MNIST image and 15 categorical/mixed datasets. The trained tensor network\nmodel is also a generative model and enables the generation of new positive and\nnegative instances.\n","authors":["Bojan Žunkovič"],"pdf_url":"https://arxiv.org/pdf/2211.14085v3.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10620v1","updated":"2023-07-20T06:37:47Z","published":"2023-07-20T06:37:47Z","title":"Quaternion tensor ring decomposition and application for color image\n inpainting","summary":" In recent years, tensor networks have emerged as powerful tools for solving\nlarge-scale optimization problems. One of the most promising tensor networks is\nthe tensor ring (TR) decomposition, which achieves circular dimensional\npermutation invariance in the model through the utilization of the trace\noperation and equitable treatment of the latent cores. On the other hand, more\nrecently, quaternions have gained significant attention and have been widely\nutilized in color image processing tasks due to their effectiveness in encoding\ncolor pixels. Therefore, in this paper, we propose the quaternion tensor ring\n(QTR) decomposition, which inherits the powerful and generalized representation\nabilities of the TR decomposition while leveraging the advantages of\nquaternions for color pixel representation. 
In addition to providing the\ndefinition of QTR decomposition and an algorithm for learning the QTR format,\nthis paper also proposes a low-rank quaternion tensor completion (LRQTC) model\nand its algorithm for color image inpainting based on the QTR decomposition.\nFinally, extensive experiments on color image inpainting demonstrate that the\nproposed QTLRC method is highly competitive.\n","authors":["Jifei Miao","Kit Ian Kou"],"pdf_url":"https://arxiv.org/pdf/2307.10620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10616v1","updated":"2023-07-20T06:32:14Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v1.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2307.10609v1","updated":"2023-07-20T06:07:09Z","published":"2023-07-20T06:07:09Z","title":"Hybrid Feature Embedding For Automatic Building Outline Extraction","summary":" Building outline extracted from high-resolution aerial images can be used in\nvarious application fields such as change detection and disaster assessment.\nHowever, traditional CNN model cannot recognize contours very precisely from\noriginal images. In this paper, we proposed a CNN and Transformer based model\ntogether with active contour model to deal with this problem. We also designed\na triple-branch decoder structure to handle different features generated by\nencoder. 
Experiment results show that our model outperforms other baseline\nmodel on two datasets, achieving 91.1% mIoU on Vaihingen and 83.8% on Bing\nhuts.\n","authors":["Weihang Ran","Wei Yuan","Xiaodan Shi","Zipei Fan","Ryosuke Shibasaki"],"pdf_url":"https://arxiv.org/pdf/2307.10609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10603v1","updated":"2023-07-20T05:49:21Z","published":"2023-07-20T05:49:21Z","title":"Physics-Driven Turbulence Image Restoration with Stochastic Refinement","summary":" Image distortion by atmospheric turbulence is a stochastic degradation, which\nis a critical problem in long-range optical imaging systems. A number of\nresearch has been conducted during the past decades, including model-based and\nemerging deep-learning solutions with the help of synthetic data. Although fast\nand physics-grounded simulation tools have been introduced to help the\ndeep-learning models adapt to real-world turbulence conditions recently, the\ntraining of such models only relies on the synthetic data and ground truth\npairs. This paper proposes the Physics-integrated Restoration Network (PiRN) to\nbring the physics-based simulator directly into the training process to help\nthe network to disentangle the stochasticity from the degradation and the\nunderlying image. Furthermore, to overcome the ``average effect\" introduced by\ndeterministic models and the domain gap between the synthetic and real-world\ndegradation, we further introduce PiRN with Stochastic Refinement (PiRN-SR) to\nboost its perceptual quality. Overall, our PiRN and PiRN-SR improve the\ngeneralization to real-world unknown turbulence conditions and provide a\nstate-of-the-art restoration in both pixel-wise accuracy and perceptual\nquality. Our codes are available at \\url{https://github.com/VITA-Group/PiRN}.\n","authors":["Ajay Jaiswal","Xingguang Zhang","Stanley H. Chan","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10603v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10601v1","updated":"2023-07-20T05:46:32Z","published":"2023-07-20T05:46:32Z","title":"SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and\n Multi-View for 3D Object Retrieval","summary":" To address 3D object retrieval, substantial efforts have been made to\ngenerate highly discriminative descriptors of 3D objects represented by a\nsingle modality, e.g., voxels, point clouds or multi-view images. It is\npromising to leverage the complementary information from multi-modality\nrepresentations of 3D objects to further improve retrieval performance.\nHowever, multi-modality 3D object retrieval is rarely developed and analyzed on\nlarge-scale datasets. In this paper, we propose self-and-cross attention based\naggregation of point cloud and multi-view images (SCA-PVNet) for 3D object\nretrieval. With deep features extracted from point clouds and multi-view\nimages, we design two types of feature aggregation modules, namely the\nIn-Modality Aggregation Module (IMAM) and the Cross-Modality Aggregation Module\n(CMAM), for effective feature fusion. IMAM leverages a self-attention mechanism\nto aggregate multi-view features while CMAM exploits a cross-attention\nmechanism to interact point cloud features with multi-view features. The final\ndescriptor of a 3D object for object retrieval can be obtained via\nconcatenating the aggregated features from both modules. 
Extensive experiments\nand analysis are conducted on three datasets, ranging from small to large\nscale, to show the superiority of the proposed SCA-PVNet over the\nstate-of-the-art methods.\n","authors":["Dongyun Lin","Yi Cheng","Aiyuan Guo","Shangbo Mao","Yiqun Li"],"pdf_url":"https://arxiv.org/pdf/2307.10601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01928v3","updated":"2023-07-20T05:21:04Z","published":"2023-01-05T06:32:50Z","title":"Event Camera Data Pre-training","summary":" This paper proposes a pre-trained neural network for handling event camera\ndata. Our model is a self-supervised learning framework, and uses paired event\ncamera data and natural RGB images for training.\n Our method contains three modules connected in a sequence: i) a family of\nevent data augmentations, generating meaningful event images for\nself-supervised training; ii) a conditional masking strategy to sample\ninformative event patches from event images, encouraging our model to capture\nthe spatial layout of a scene and accelerating training; iii) a contrastive\nlearning approach, enforcing the similarity of embeddings between matching\nevent images, and between paired event and RGB images. An embedding projection\nloss is proposed to avoid the model collapse when enforcing the event image\nembedding similarities. A probability distribution alignment loss is proposed\nto encourage the event image to be consistent with its paired RGB image in the\nfeature space.\n Transfer learning performance on downstream tasks shows the superiority of\nour method over state-of-the-art methods. For example, we achieve top-1\naccuracy at 64.83% on the N-ImageNet dataset.\n","authors":["Yan Yang","Liyuan Pan","Liu Liu"],"pdf_url":"https://arxiv.org/pdf/2301.01928v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10593v1","updated":"2023-07-20T05:15:03Z","published":"2023-07-20T05:15:03Z","title":"Event Blob Tracking: An Asynchronous Real-Time Algorithm","summary":" Event-based cameras have become increasingly popular for tracking fast-moving\nobjects due to their high temporal resolution, low latency, and high dynamic\nrange. In this paper, we propose a novel algorithm for tracking event blobs\nusing raw events asynchronously in real time. We introduce the concept of an\nevent blob as a spatio-temporal likelihood of event occurrence where the\nconditional spatial likelihood is blob-like. Many real-world objects generate\nevent blob data, for example, flickering LEDs such as car headlights or any\nsmall foreground object moving against a static or slowly varying background.\nThe proposed algorithm uses a nearest neighbour classifier with a dynamic\nthreshold criteria for data association coupled with a Kalman filter to track\nthe event blob state. Our algorithm achieves highly accurate tracking and event\nblob shape estimation even under challenging lighting conditions and high-speed\nmotions. 
The microsecond time resolution achieved means that the filter output\ncan be used to derive secondary information such as time-to-contact or range\nestimation, that will enable applications to real-world problems such as\ncollision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v1.pdf","comment":"17 pages, 8 figures, preprint version"},{"id":"http://arxiv.org/abs/2210.06551v4","updated":"2023-07-20T04:59:45Z","published":"2022-10-12T19:46:25Z","title":"MotionBERT: A Unified Perspective on Learning Human Motion\n Representations","summary":" We present a unified perspective on tackling various human-centric video\ntasks by learning human motion representations from large-scale and\nheterogeneous data resources. Specifically, we propose a pretraining stage in\nwhich a motion encoder is trained to recover the underlying 3D motion from\nnoisy partial 2D observations. The motion representations acquired in this way\nincorporate geometric, kinematic, and physical knowledge about human motion,\nwhich can be easily transferred to multiple downstream tasks. We implement the\nmotion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer)\nneural network. It could capture long-range spatio-temporal relationships among\nthe skeletal joints comprehensively and adaptively, exemplified by the lowest\n3D pose estimation error so far when trained from scratch. Furthermore, our\nproposed framework achieves state-of-the-art performance on all three\ndownstream tasks by simply finetuning the pretrained motion encoder with a\nsimple regression head (1-2 layers), which demonstrates the versatility of the\nlearned motion representations. Code and models are available at\nhttps://motionbert.github.io/\n","authors":["Wentao Zhu","Xiaoxuan Ma","Zhaoyang Liu","Libin Liu","Wayne Wu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2210.06551v4.pdf","comment":"ICCV 2023 version"},{"id":"http://arxiv.org/abs/2307.10584v1","updated":"2023-07-20T04:51:10Z","published":"2023-07-20T04:51:10Z","title":"Reference-based Painterly Inpainting via Diffusion: Crossing the Wild\n Reference Domain Gap","summary":" Have you ever imagined how it would look if we placed new objects into\npaintings? For example, what would it look like if we placed a basketball into\nClaude Monet's ``Water Lilies, Evening Effect''? We propose Reference-based\nPainterly Inpainting, a novel task that crosses the wild reference domain gap\nand implants novel objects into artworks. Although previous works have examined\nreference-based inpainting, they are not designed for large domain\ndiscrepancies between the target and the reference, such as inpainting an\nartistic image using a photorealistic reference. This paper proposes a novel\ndiffusion framework, dubbed RefPaint, to ``inpaint more wildly'' by taking such\nreferences with large domain gaps. Built with an image-conditioned diffusion\nmodel, we introduce a ladder-side branch and a masked fusion mechanism to work\nwith the inpainting mask. By decomposing the CLIP image embeddings at inference\ntime, one can manipulate the strength of semantic and style information with\nease. Experiments demonstrate that our proposed RefPaint framework produces\nsignificantly better results than existing methods. Our method enables creative\npainterly image inpainting with reference objects that would otherwise be\ndifficult to achieve. 
Project page: https://vita-group.github.io/RefPaint/\n","authors":["Dejia Xu","Xingqian Xu","Wenyan Cong","Humphrey Shi","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10577v1","updated":"2023-07-20T04:41:39Z","published":"2023-07-20T04:41:39Z","title":"Ethosight: A Joint-Embedding Based System for Nuanced Perception Using\n Contextual Label Affinity Metric and Reasoning Based Iterative Learning","summary":" Traditional computer vision models often require extensive manual effort for\ndata acquisition and validation, particularly when detecting subtle behavioral\nnuances or events. The difficulty in distinguishing routine behaviors from\npotential risks in real-world applications, like differentiating routine\nshopping from potential shoplifting, further complicates the process.\n We present Ethosight, a novel zero-shot computer vision algorithm. Ethosight\neradicates the need for pre-existing symbolic knowledge, initiating from a\nclean slate based on user requirements and semantic knowledge of interest.\nUsing localized label affinity calculations and a reasoning-guided iterative\nlearning loop, Ethosight infers scene details and iteratively refines the label\nset. Reasoning mechanisms can be derived from large language models like GPT4,\nsymbolic reasoners like OpenNARS, or hybrid systems.\n Ethosight further capitalizes on the capabilities of a pre-trained\nmulti-modal model, ImageBind, generating accurate semantic knowledge of images\nwithin a few cycles. It successfully captures both explicit and nuanced\nelements efficiently. We also introduce the implementation of Korzybski's\n\"time-binding\" concept in machines, which allows for generational learning and\nknowledge sharing across deployments.\n Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases.\nIt has exhibited an exceptional ability to discern new areas of interest,\nconsistently generating high-affinity scores within the top five labels from a\nset of a thousand. Tests conducted across diverse environments attest to\nEthosight's robust performance. Detailed results and case studies within the\nmain body of this paper and an appendix underscore a promising trajectory\ntowards enhancing the adaptability and resilience of computer vision models in\ndetecting and extracting subtle and nuanced behaviors.\n","authors":["Hugo Latapie","Kristinn R. Thorisson","Shan Yu","Vahagn Petrosyan","Patrick Hammer","Pei Wang","Brandon Kynoch","Hanning Chen","Tangrui Li"],"pdf_url":"https://arxiv.org/pdf/2307.10577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10575v1","updated":"2023-07-20T04:35:50Z","published":"2023-07-20T04:35:50Z","title":"Boosting Federated Learning Convergence with Prototype Regularization","summary":" As a distributed machine learning technique, federated learning (FL) requires\nclients to collaboratively train a shared model with an edge server without\nleaking their local data. However, the heterogeneous data distribution among\nclients often leads to a decrease in model performance. To tackle this issue,\nthis paper introduces a prototype-based regularization strategy to address the\nheterogeneity in the data distribution. Specifically, the regularization\nprocess involves the server aggregating local prototypes from distributed\nclients to generate a global prototype, which is then sent back to the\nindividual clients to guide their local training. 
The experimental results on\nMNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3%\nand 8.9% in average test accuracy, respectively, compared to the most popular\nbaseline FedAvg. Furthermore, our approach has a fast convergence rate in\nheterogeneous settings.\n","authors":["Yu Qiao","Huy Q. Le","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2307.10575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04247v2","updated":"2023-07-20T04:28:36Z","published":"2023-05-07T11:18:39Z","title":"Estimation of control area in badminton doubles with pose information\n from top and back view drone videos","summary":" The application of visual tracking to the performance analysis of sports\nplayers in dynamic competitions is vital for effective coaching. In doubles\nmatches, coordinated positioning is crucial for maintaining control of the\ncourt and minimizing opponents' scoring opportunities. The analysis of such\nteamwork plays a vital role in understanding the dynamics of the game. However,\nprevious studies have primarily focused on analyzing and assessing singles\nplayers without considering occlusion in broadcast videos. These studies have\nrelied on discrete representations, which involve the analysis and\nrepresentation of specific actions (e.g., strokes) or events that occur during\nthe game while overlooking the meaningful spatial distribution. In this work,\nwe present the first annotated drone dataset from top and back views in\nbadminton doubles and propose a framework to estimate the control area\nprobability map, which can be used to evaluate teamwork performance. We present\nan efficient framework of deep neural networks that enables the calculation of\nfull probability surfaces. This framework utilizes the embedding of a Gaussian\nmixture map of players' positions and employs graph convolution on their poses.\nIn the experiment, we verify our approach by comparing various baselines and\ndiscovering the correlations between the score and control area. Additionally,\nwe propose a practical application for assessing optimal positioning to provide\ninstructions during a game. Our approach offers both visual and quantitative\nevaluations of players' movements, thereby providing valuable insights into\ndoubles teamwork. The dataset and related project code is available at\nhttps://github.com/Ning-D/Drone_BD_ControlArea\n","authors":["Ning Ding","Kazuya Takeda","Wenhui Jin","Yingjiu Bei","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2305.04247v2.pdf","comment":"15 pages, 10 figures, to appear in Multimedia Tools and Applications"},{"id":"http://arxiv.org/abs/2307.10036v2","updated":"2023-07-20T04:26:46Z","published":"2023-07-19T15:19:02Z","title":"Class Attention to Regions of Lesion for Imbalanced Medical Image\n Recognition","summary":" Automated medical image classification is the key component in intelligent\ndiagnosis systems. However, most medical image datasets contain plenty of\nsamples of common diseases and just a handful of rare ones, leading to major\nclass imbalances. Currently, it is an open problem in intelligent diagnosis to\neffectively learn from imbalanced training data. In this paper, we propose a\nsimple yet effective framework, named \\textbf{C}lass \\textbf{A}ttention to\n\\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by\nembedding attention into the training process of \\textbf{C}onvolutional\n\\textbf{N}eural \\textbf{N}etworks (CNNs). 
The proposed attention module helps\nCNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn\ntheir characteristics more effectively. In addition, this attention module\nworks only during the training phase and does not change the architecture of\nthe original network, so it can be directly combined with any existing CNN\narchitecture. The CARE framework needs bounding boxes to represent the lesion\nregions of rare diseases. To alleviate the need for manual annotation, we\nfurther developed variants of CARE by leveraging the traditional saliency\nmethods or a pretrained segmentation model for bounding box generation. Results\nshow that the CARE variants with automated bounding box generation are\ncomparable to the original CARE framework with \\textit{manual} bounding box\nannotations. A series of experiments on an imbalanced skin image dataset and a\npneumonia dataset indicates that our method can effectively help the network\nfocus on the lesion regions of rare diseases and remarkably improves the\nclassification performance of rare diseases.\n","authors":["Jia-Xin Zhuang","Jiabin Cai","Jianguo Zhang","Wei-shi Zheng","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10036v2.pdf","comment":"Accepted by Neurocomputing on July 2023. 37 pages"},{"id":"http://arxiv.org/abs/2307.09724v2","updated":"2023-07-20T04:14:01Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v2.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2307.10567v1","updated":"2023-07-20T04:12:10Z","published":"2023-07-20T04:12:10Z","title":"No-frills Temporal Video Grounding: Multi-Scale Neighboring Attention\n and Zoom-in Boundary Detection","summary":" Temporal video grounding (TVG) aims to retrieve the time interval of a\nlanguage query from an untrimmed video. 
A significant challenge in TVG is the\nlow \"Semantic Noise Ratio (SNR)\", which results in worse performance with lower\nSNR. Prior works have addressed this challenge using sophisticated techniques.\nIn this paper, we propose a no-frills TVG model that consists of two core\nmodules, namely multi-scale neighboring attention and zoom-in boundary\ndetection. The multi-scale neighboring attention restricts each video token to\nonly aggregate visual contexts from its neighbor, enabling the extraction of\nthe most distinguishing information with multi-scale feature hierarchies from\nhigh-ratio noises. The zoom-in boundary detection then focuses on local-wise\ndiscrimination of the selected top candidates for fine-grained grounding\nadjustment. With an end-to-end training strategy, our model achieves\ncompetitive performance on different TVG benchmarks, while also having the\nadvantage of faster inference speed and lighter model parameters, thanks to its\nlightweight architecture.\n","authors":["Qi Zhang","Sipeng Zheng","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2307.10567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14795v2","updated":"2023-07-20T03:39:19Z","published":"2023-06-26T15:53:02Z","title":"MotionGPT: Human Motion as a Foreign Language","summary":" Though the advancement of pre-trained large language models unfolds, the\nexploration of building a unified model for language and other multi-modal\ndata, such as motion, remains challenging and untouched so far. Fortunately,\nhuman motion displays a semantic coupling akin to human language, often\nperceived as a form of body language. By fusing language data with large-scale\nmotion models, motion-language pre-training that can enhance the performance of\nmotion-related tasks becomes feasible. Driven by this insight, we propose\nMotionGPT, a unified, versatile, and user-friendly motion-language model to\nhandle multiple motion-relevant tasks. Specifically, we employ the discrete\nvector quantization for human motion and transfer 3D motion into motion tokens,\nsimilar to the generation process of word tokens. Building upon this \"motion\nvocabulary\", we perform language modeling on both motion and text in a unified\nmanner, treating human motion as a specific language. Moreover, inspired by\nprompt learning, we pre-train MotionGPT with a mixture of motion-language data\nand fine-tune it on prompt-based question-and-answer tasks. Extensive\nexperiments demonstrate that MotionGPT achieves state-of-the-art performances\non multiple motion tasks including text-driven motion generation, motion\ncaptioning, motion prediction, and motion in-between.\n","authors":["Biao Jiang","Xin Chen","Wen Liu","Jingyi Yu","Gang Yu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14795v2.pdf","comment":"Project Page: https://github.com/OpenMotionLab/MotionGPT"},{"id":"http://arxiv.org/abs/2307.10554v1","updated":"2023-07-20T03:36:13Z","published":"2023-07-20T03:36:13Z","title":"EMQ: Evolving Training-free Proxies for Automated Mixed Precision\n Quantization","summary":" Mixed-Precision Quantization~(MQ) can achieve a competitive\naccuracy-complexity trade-off for models. Conventional training-based search\nmethods require time-consuming candidate training to search optimized per-layer\nbit-width configurations in MQ. Recently, some training-free approaches have\npresented various MQ proxies and significantly improve search efficiency.\nHowever, the correlation between these proxies and quantization accuracy is\npoorly understood. 
To address the gap, we first build the MQ-Bench-101, which\ninvolves different bit configurations and quantization results. Then, we\nobserve that the existing training-free proxies perform weak correlations on\nthe MQ-Bench-101. To efficiently seek superior proxies, we develop an automatic\nsearch of proxies framework for MQ via evolving algorithms. In particular, we\ndevise an elaborate search space involving the existing proxies and perform an\nevolution search to discover the best correlated MQ proxy. We proposed a\ndiversity-prompting selection strategy and compatibility screening protocol to\navoid premature convergence and improve search efficiency. In this way, our\nEvolving proxies for Mixed-precision Quantization~(EMQ) framework allows the\nauto-generation of proxies without heavy tuning and expert knowledge. Extensive\nexperiments on ImageNet with various ResNet and MobileNet families demonstrate\nthat our EMQ obtains superior performance than state-of-the-art mixed-precision\nmethods at a significantly reduced cost. The code will be released.\n","authors":["Peijie Dong","Lujun Li","Zimian Wei","Xin Niu","Zhiliang Tian","Hengyue Pan"],"pdf_url":"https://arxiv.org/pdf/2307.10554v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.10549v1","updated":"2023-07-20T03:26:57Z","published":"2023-07-20T03:26:57Z","title":"Dynamic Large Language Models on Blockchains","summary":" Training and deploying the large language models requires a large mount of\ncomputational resource because the language models contain billions of\nparameters and the text has thousands of tokens. Another problem is that the\nlarge language models are static. They are fixed after the training process. To\ntackle these issues, in this paper, we propose to train and deploy the dynamic\nlarge language model on blockchains, which have high computation performance\nand are distributed across a network of computers. A blockchain is a secure,\ndecentralized, and transparent system that allows for the creation of a\ntamper-proof ledger for transactions without the need for intermediaries. The\ndynamic large language models can continuously learn from the user input after\nthe training process. Our method provides a new way to develop the large\nlanguage models and also sheds a light on the next generation artificial\nintelligence systems.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2307.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.04550v5","updated":"2023-07-20T02:00:22Z","published":"2021-06-08T17:39:14Z","title":"DETReg: Unsupervised Pretraining with Region Priors for Object Detection","summary":" Recent self-supervised pretraining methods for object detection largely focus\non pretraining the backbone of the object detector, neglecting key parts of\ndetection architecture. Instead, we introduce DETReg, a new self-supervised\nmethod that pretrains the entire object detection network, including the object\nlocalization and embedding components. During pretraining, DETReg predicts\nobject localizations to match the localizations from an unsupervised region\nproposal generator and simultaneously aligns the corresponding feature\nembeddings with embeddings from a self-supervised image encoder. We implement\nDETReg using the DETR family of detectors and show that it improves over\ncompetitive baselines when finetuned on COCO, PASCAL VOC, and Airbus Ship\nbenchmarks. 
In low-data regimes DETReg achieves improved performance, e.g.,\nwhen training with only 1% of the labels and in the few-shot learning settings.\n","authors":["Amir Bar","Xin Wang","Vadim Kantorov","Colorado J Reed","Roei Herzig","Gal Chechik","Anna Rohrbach","Trevor Darrell","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2106.04550v5.pdf","comment":"Project page: https://www.amirbar.net/detreg/"},{"id":"http://arxiv.org/abs/2307.10518v1","updated":"2023-07-20T01:37:32Z","published":"2023-07-20T01:37:32Z","title":"Interactive Segmentation for Diverse Gesture Types Without Context","summary":" Interactive segmentation entails a human marking an image to guide how a\nmodel either creates or edits a segmentation. Our work addresses limitations of\nexisting methods: they either only support one gesture type for marking an\nimage (e.g., either clicks or scribbles) or require knowledge of the gesture\ntype being employed, and require specifying whether marked regions should be\nincluded versus excluded in the final segmentation. We instead propose a\nsimplified interactive segmentation task where a user only must mark an image,\nwhere the input can be of any gesture type without specifying the gesture type.\nWe support this new task by introducing the first interactive segmentation\ndataset with multiple gesture types as well as a new evaluation metric capable\nof holistically evaluating interactive segmentation algorithms. We then analyze\nnumerous interactive segmentation algorithms, including ones adapted for our\nnovel task. While we observe promising performance overall, we also highlight\nareas for future improvement. To facilitate further extensions of this work, we\npublicly share our new dataset at https://github.com/joshmyersdean/dig.\n","authors":["Josh Myers-Dean","Yifei Fan","Brian Price","Wilson Chan","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2307.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08015v3","updated":"2023-07-20T01:11:21Z","published":"2023-07-16T11:52:27Z","title":"Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via\n Geometry-Guided Cross-View Transformer","summary":" Image retrieval-based cross-view localization methods often lead to very\ncoarse camera pose estimation, due to the limited sampling density of the\ndatabase satellite images. In this paper, we propose a method to increase the\naccuracy of a ground camera's location and orientation by estimating the\nrelative rotation and translation between the ground-level image and its\nmatched/retrieved satellite image. Our approach designs a geometry-guided\ncross-view transformer that combines the benefits of conventional geometry and\nlearnable cross-view transformers to map the ground-view observations to an\noverhead view. Given the synthesized overhead view and observed satellite\nfeature maps, we construct a neural pose optimizer with strong global\ninformation embedding ability to estimate the relative rotation between them.\nAfter aligning their rotations, we develop an uncertainty-guided spatial\ncorrelation to generate a probability map of the vehicle locations, from which\nthe relative translation can be determined. Experimental results demonstrate\nthat our method significantly outperforms the state-of-the-art. 
Notably, the\nlikelihood of restricting the vehicle lateral pose to be within 1m of its\nGround Truth (GT) value on the cross-view KITTI dataset has been improved from\n$35.54\\%$ to $76.44\\%$, and the likelihood of restricting the vehicle\norientation to be within $1^{\\circ}$ of its GT value has been improved from\n$19.64\\%$ to $99.10\\%$.\n","authors":["Yujiao Shi","Fei Wu","Akhil Perincherry","Ankit Vora","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08015v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2106.04066v6","updated":"2023-07-20T00:24:58Z","published":"2021-06-08T02:51:33Z","title":"Semantically Adversarial Scenario Generation with Explicit Knowledge\n Guidance","summary":" Generating adversarial scenarios, which have the potential to fail autonomous\ndriving systems, provides an effective way to improve robustness. Extending\npurely data-driven generative models, recent specialized models satisfy\nadditional controllable requirements such as embedding a traffic sign in a\ndriving scene by manipulating patterns implicitly in the neuron level. In this\npaper, we introduce a method to incorporate domain knowledge explicitly in the\ngeneration process to achieve the Semantically Adversarial Generation (SAG). To\nbe consistent with the composition of driving scenes, we first categorize the\nknowledge into two types, the property of objects and the relationship among\nobjects. We then propose a tree-structured variational auto-encoder (T-VAE) to\nlearn hierarchical scene representation. By imposing semantic rules on the\nproperties of nodes and edges in the tree structure, explicit knowledge\nintegration enables controllable generation. We construct a synthetic example\nto illustrate the controllability and explainability of our method in a\nsuccinct setting. We further extend to realistic environments for autonomous\nvehicles: our method efficiently identifies adversarial driving scenes against\ndifferent state-of-the-art 3D point cloud segmentation models and satisfies the\ntraffic rules specified as the explicit knowledge.\n","authors":["Wenhao Ding","Haohong Lin","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2106.04066v6.pdf","comment":"20 pages, 13 figures"},{"id":"http://arxiv.org/abs/2307.10507v1","updated":"2023-07-20T00:07:29Z","published":"2023-07-20T00:07:29Z","title":"FedSoup: Improving Generalization and Personalization in Federated\n Learning via Selective Model Interpolation","summary":" Cross-silo federated learning (FL) enables the development of machine\nlearning models on datasets distributed across data centers such as hospitals\nand clinical research laboratories. However, recent research has found that\ncurrent FL algorithms face a trade-off between local and global performance\nwhen confronted with distribution shifts. Specifically, personalized FL methods\nhave a tendency to overfit to local data, leading to a sharp valley in the\nlocal model and inhibiting its ability to generalize to out-of-distribution\ndata. In this paper, we propose a novel federated model soup method (i.e.,\nselective interpolation of model parameters) to optimize the trade-off between\nlocal and global performance. Specifically, during the federated training\nphase, each client maintains its own global model pool by monitoring the\nperformance of the interpolated model between the local and global models. This\nallows us to alleviate overfitting and seek flat minima, which can\nsignificantly improve the model's generalization performance. 
We evaluate our\nmethod on retinal and pathological image classification tasks, and our proposed\nmethod achieves significant improvements for out-of-distribution\ngeneralization. Our code is available at https://github.com/ubc-tea/FedSoup.\n","authors":["Minghui Chen","Meirui Jiang","Qi Dou","Zehua Wang","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2307.10507v1.pdf","comment":"Accepted by MICCAI2023"},{"id":"http://arxiv.org/abs/2307.10506v1","updated":"2023-07-20T00:06:46Z","published":"2023-07-20T00:06:46Z","title":"Is Grad-CAM Explainable in Medical Images?","summary":" Explainable Deep Learning has gained significant attention in the field of\nartificial intelligence (AI), particularly in domains such as medical imaging,\nwhere accurate and interpretable machine learning models are crucial for\neffective diagnosis and treatment planning. Grad-CAM is a baseline that\nhighlights the most critical regions of an image used in a deep learning\nmodel's decision-making process, increasing interpretability and trust in the\nresults. It is applied in many computer vision (CV) tasks such as\nclassification and explanation. This study explores the principles of\nExplainable Deep Learning and its relevance to medical imaging, discusses\nvarious explainability techniques and their limitations, and examines medical\nimaging applications of Grad-CAM. The findings highlight the potential of\nExplainable Deep Learning and Grad-CAM in improving the accuracy and\ninterpretability of deep learning models in medical imaging. The code is\navailable in (will be available).\n","authors":["Subhashis Suara","Aayush Jha","Pratik Sinha","Arif Ahmed Sekh"],"pdf_url":"https://arxiv.org/pdf/2307.10506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10504v1","updated":"2023-07-20T00:02:24Z","published":"2023-07-20T00:02:24Z","title":"Identifying Interpretable Subspaces in Image Representations","summary":" We propose Automatic Feature Explanation using Contrasting Concepts (FALCON),\nan interpretability framework to explain features of image representations. For\na target feature, FALCON captions its highly activating cropped images using a\nlarge captioning dataset (like LAION-400m) and a pre-trained vision-language\nmodel like CLIP. Each word among the captions is scored and ranked leading to a\nsmall number of shared, human-understandable concepts that closely describe the\ntarget feature. FALCON also applies contrastive interpretation using lowly\nactivating (counterfactual) images, to eliminate spurious concepts. Although\nmany existing approaches interpret features independently, we observe in\nstate-of-the-art self-supervised and supervised models, that less than 20% of\nthe representation space can be explained by individual features. We show that\nfeatures in larger spaces become more interpretable when studied in groups and\ncan be explained with high-order scoring concepts through FALCON. We discuss\nhow extracted concepts can be used to explain and debug failures in downstream\ntasks. 
Finally, we present a technique to transfer concepts from one\n(explainable) representation space to another unseen representation space by\nlearning a simple linear transformation.\n","authors":["Neha Kalibhat","Shweta Bhardwaj","Bayan Bruss","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.10504v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition\n in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11261v1","updated":"2023-07-20T22:41:23Z","published":"2023-07-20T22:41:23Z","title":"SimCol3D -- 3D Reconstruction during Colonoscopy Challenge","summary":" Colorectal cancer is one of the most common cancers in the world. While\ncolonoscopy is an effective screening technique, navigating an endoscope\nthrough the colon to detect polyps is challenging. A 3D map of the observed\nsurfaces could enhance the identification of unscreened colon tissue and serve\nas a training platform. However, reconstructing the colon from video footage\nremains unsolved due to numerous factors such as self-occlusion, reflective\nsurfaces, lack of texture, and tissue deformation that limit feature-based\nmethods. Learning-based approaches hold promise as robust alternatives, but\nnecessitate extensive datasets. By establishing a benchmark, the 2022 EndoVis\nsub-challenge SimCol3D aimed to facilitate data-driven depth and pose\nprediction during colonoscopy. The challenge was hosted as part of MICCAI 2022\nin Singapore. Six teams from around the world and representatives from academia\nand industry participated in the three sub-challenges: synthetic depth\nprediction, synthetic pose prediction, and real pose prediction. This paper\ndescribes the challenge, the submitted methods, and their results. We show that\ndepth prediction in virtual colonoscopy is robustly solvable, while pose\nestimation remains an open research question.\n","authors":["Anita Rau","Sophia Bano","Yueming Jin","Pablo Azagra","Javier Morlana","Edward Sanderson","Bogdan J. 
Matuszewski","Jae Young Lee","Dong-Jae Lee","Erez Posner","Netanel Frank","Varshini Elangovan","Sista Raviteja","Zhengwen Li","Jiquan Liu","Seenivasan Lalithkumar","Mobarakol Islam","Hongliang Ren","José M. M. Montiel","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2307.11261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11259v1","updated":"2023-07-20T22:35:27Z","published":"2023-07-20T22:35:27Z","title":"Towards Non-Parametric Models for Confidence Aware Image Prediction from\n Low Data using Gaussian Processes","summary":" The ability to envision future states is crucial to informed decision making\nwhile interacting with dynamic environments. With cameras providing a prevalent\nand information rich sensing modality, the problem of predicting future states\nfrom image sequences has garnered a lot of attention. Current state of the art\nmethods typically train large parametric models for their predictions. Though\noften able to predict with accuracy, these models rely on the availability of\nlarge training datasets to converge to useful solutions. In this paper we focus\non the problem of predicting future images of an image sequence from very\nlittle training data. To approach this problem, we use non-parametric models to\ntake a probabilistic approach to image prediction. We generate probability\ndistributions over sequentially predicted images and propagate uncertainty\nthrough time to generate a confidence metric for our predictions. Gaussian\nProcesses are used for their data efficiency and ability to readily incorporate\nnew training data online. We showcase our method by successfully predicting\nfuture frames of a smooth fluid simulation environment.\n","authors":["Nikhil U. Shinde","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2307.11259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11253v1","updated":"2023-07-20T22:09:04Z","published":"2023-07-20T22:09:04Z","title":"Joint one-sided synthetic unpaired image translation and segmentation\n for colorectal cancer prevention","summary":" Deep learning has shown excellent performance in analysing medical images.\nHowever, datasets are difficult to obtain due privacy issues, standardization\nproblems, and lack of annotations. We address these problems by producing\nrealistic synthetic images using a combination of 3D technologies and\ngenerative adversarial networks. We propose CUT-seg, a joint training where a\nsegmentation model and a generative model are jointly trained to produce\nrealistic images while learning to segment polyps. We take advantage of recent\none-sided translation models because they use significantly less memory,\nallowing us to add a segmentation model in the training loop. CUT-seg performs\nbetter, is computationally less expensive, and requires less real images than\nother memory-intensive image translation approaches that require two stage\ntraining. Promising results are achieved on five real polyp segmentation\ndatasets using only one real image and zero real annotations. As a part of this\nstudy we release Synth-Colon, an entirely synthetic dataset that includes 20000\nrealistic colon images and additional details about depth and 3D geometry:\nhttps://enric1994.github.io/synth-colon\n","authors":["Enric Moreu","Eric Arazo","Kevin McGuinness","Noel E. 
O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11253v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2202.08680"},{"id":"http://arxiv.org/abs/2307.11227v1","updated":"2023-07-20T20:45:13Z","published":"2023-07-20T20:45:13Z","title":"UP-DP: Unsupervised Prompt Learning for Data Pre-Selection with\n Vision-Language Models","summary":" In this study, we investigate the task of data pre-selection, which aims to\nselect instances for labeling from an unlabeled dataset through a single pass,\nthereby optimizing performance for undefined downstream tasks with a limited\nannotation budget. Previous approaches to data pre-selection relied solely on\nvisual features extracted from foundation models, such as CLIP and BLIP-2, but\nlargely ignored the powerfulness of text features. In this work, we argue that,\nwith proper design, the joint feature space of both vision and text can yield a\nbetter representation for data pre-selection. To this end, we introduce UP-DP,\na simple yet effective unsupervised prompt learning approach that adapts\nvision-language models, like BLIP-2, for data pre-selection. Specifically, with\nthe BLIP-2 parameters frozen, we train text prompts to extract the joint\nfeatures with improved representation, ensuring a diverse cluster structure\nthat covers the entire dataset. We extensively compare our method with the\nstate-of-the-art using seven benchmark datasets in different settings,\nachieving up to a performance gain of 20%. Interestingly, the prompts learned\nfrom one dataset demonstrate significant generalizability and can be applied\ndirectly to enhance the feature extraction of BLIP-2 from other datasets. To\nthe best of our knowledge, UP-DP is the first work to incorporate unsupervised\nprompt learning in a vision-language model for data pre-selection.\n","authors":["Xin Li","Sima Behpour","Thang Doan","Wenbin He","Liang Gou","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2307.11227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03297v2","updated":"2023-07-20T19:28:22Z","published":"2022-10-07T03:10:34Z","title":"Preprocessors Matter! Realistic Decision-Based Attacks on Machine\n Learning Systems","summary":" Decision-based attacks construct adversarial examples against a machine\nlearning (ML) model by making only hard-label queries. These attacks have\nmainly been applied directly to standalone neural networks. However, in\npractice, ML models are just one component of a larger learning system. We find\nthat by adding a single preprocessor in front of a classifier, state-of-the-art\nquery-based attacks are up to 7$\\times$ less effective at attacking a\nprediction pipeline than at attacking the model alone. We explain this\ndiscrepancy by the fact that most preprocessors introduce some notion of\ninvariance to the input space. Hence, attacks that are unaware of this\ninvariance inevitably waste a large number of queries to re-discover or\novercome it. We, therefore, develop techniques to (i) reverse-engineer the\npreprocessor and then (ii) use this extracted information to attack the\nend-to-end system. Our preprocessors extraction method requires only a few\nhundred queries, and our preprocessor-aware attacks recover the same efficacy\nas when attacking the model alone. The code can be found at\nhttps://github.com/google-research/preprocessor-aware-black-box-attack.\n","authors":["Chawin Sitawarin","Florian Tramèr","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.03297v2.pdf","comment":"ICML 2023. 
Code can be found at\n https://github.com/google-research/preprocessor-aware-black-box-attack"},{"id":"http://arxiv.org/abs/2302.11827v2","updated":"2023-07-20T19:21:51Z","published":"2023-02-23T07:26:50Z","title":"Open Challenges for Monocular Single-shot 6D Object Pose Estimation","summary":" Object pose estimation is a non-trivial task that enables robotic\nmanipulation, bin picking, augmented reality, and scene understanding, to name\na few use cases. Monocular object pose estimation gained considerable momentum\nwith the rise of high-performing deep learning-based solutions and is\nparticularly interesting for the community since sensors are inexpensive and\ninference is fast. Prior works establish the comprehensive state of the art for\ndiverse pose estimation problems. Their broad scopes make it difficult to\nidentify promising future directions. We narrow down the scope to the problem\nof single-shot monocular 6D object pose estimation, which is commonly used in\nrobotics, and thus are able to identify such trends. By reviewing recent\npublications in robotics and computer vision, the state of the art is\nestablished at the union of both fields. Following that, we identify promising\nresearch directions in order to help researchers to formulate relevant research\nideas and effectively advance the state of the art. Findings include that\nmethods are sophisticated enough to overcome the domain shift and that\nocclusion handling is a fundamental challenge. We also highlight problems such\nas novel object pose estimation and challenging materials handling as central\nchallenges to advance robotics.\n","authors":["Stefan Thalhammer","Peter Hönig","Jean-Baptiste Weibel","Markus Vincze"],"pdf_url":"https://arxiv.org/pdf/2302.11827v2.pdf","comment":"Revised version in the making"},{"id":"http://arxiv.org/abs/2307.11197v1","updated":"2023-07-20T19:20:35Z","published":"2023-07-20T19:20:35Z","title":"Heuristic Hyperparameter Choice for Image Anomaly Detection","summary":" Anomaly detection (AD) in images is a fundamental computer vision problem by\ndeep learning neural network to identify images deviating significantly from\nnormality. The deep features extracted from pretrained models have been proved\nto be essential for AD based on multivariate Gaussian distribution analysis.\nHowever, since models are usually pretrained on a large dataset for\nclassification tasks such as ImageNet, they might produce lots of redundant\nfeatures for AD, which increases computational cost and degrades the\nperformance. We aim to do the dimension reduction of Negated Principal\nComponent Analysis (NPCA) for these features. So we proposed some heuristic to\nchoose hyperparameter of NPCA algorithm for getting as fewer components of\nfeatures as possible while ensuring a good performance.\n","authors":["Zeyu Jiang","João P. C. Bertoldo","Etienne Decencière"],"pdf_url":"https://arxiv.org/pdf/2307.11197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03512v2","updated":"2023-07-20T18:27:42Z","published":"2023-07-07T11:00:44Z","title":"Tranfer Learning of Semantic Segmentation Methods for Identifying Buried\n Archaeological Structures on LiDAR Data","summary":" When applying deep learning to remote sensing data in archaeological\nresearch, a notable obstacle is the limited availability of suitable datasets\nfor training models. The application of transfer learning is frequently\nemployed to mitigate this drawback. 
However, there is still a need to explore\nits effectiveness when applied across different archaeological datasets. This\npaper compares the performance of various transfer learning configurations\nusing two semantic segmentation deep neural networks on two LiDAR datasets. The\nexperimental results indicate that transfer learning-based approaches in\narchaeology can lead to performance improvements, although a systematic\nenhancement has not yet been observed. We provide specific insights about the\nvalidity of such techniques that can serve as a baseline for future works.\n","authors":["Paolo Soleni","Wouter B. Verschoof-van der Vaart","Žiga Kokalj","Arianna Traviglia","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.03512v2.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2023 (IGARSS 2023) @IEEE copyright"},{"id":"http://arxiv.org/abs/2307.11141v1","updated":"2023-07-20T17:53:04Z","published":"2023-07-20T17:53:04Z","title":"Towards General Game Representations: Decomposing Games Pixels into\n Content and Style","summary":" On-screen game footage contains rich contextual information that players\nprocess when playing and experiencing a game. Learning pixel representations of\ngames can benefit artificial intelligence across several downstream tasks\nincluding game-playing agents, procedural content generation, and player\nmodelling. The generalizability of these methods, however, remains a challenge,\nas learned representations should ideally be shared across games with similar\ngame mechanics. This could allow, for instance, game-playing agents trained on\none game to perform well in similar games with no re-training. This paper\nexplores how generalizable pre-trained computer vision encoders can be for such\ntasks, by decomposing the latent space into content embeddings and style\nembeddings. The goal is to minimize the domain gap between games of the same\ngenre when it comes to game content critical for downstream tasks, and ignore\ndifferences in graphical style. We employ a pre-trained Vision Transformer\nencoder and a decomposition technique based on game genres to obtain separate\ncontent and style embeddings. Our findings show that the decomposed embeddings\nachieve style invariance across multiple games while still maintaining strong\ncontent extraction capabilities. We argue that the proposed decomposition of\ncontent and style offers better generalization capacities across game\nenvironments independently of the downstream task.\n","authors":["Chintan Trivedi","Konstantinos Makantasis","Antonios Liapis","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2307.11141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.10495v2","updated":"2023-07-20T16:33:52Z","published":"2022-10-19T12:04:47Z","title":"ADPS: Asymmetric Distillation Post-Segmentation Method for Image Anomaly\n Detection","summary":" Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the\nteacher-student paradigm to detect and segment anomalous regions by contrasting\nthe unique features extracted by both networks. However, existing KDAD methods\nsuffer from two main limitations: 1) the student network can effortlessly\nreplicate the teacher network's representations, and 2) the features of the\nteacher network serve solely as a ``reference standard\" and are not fully\nleveraged. Toward this end, we depart from the established paradigm and instead\npropose an innovative approach called Asymmetric Distillation Post-Segmentation\n(ADPS). 
Our ADPS employs an asymmetric distillation paradigm that takes\ndistinct forms of the same image as the input of the teacher-student networks,\ndriving the student network to learn discriminating representations for\nanomalous regions.\n Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a\ncoarse anomaly localization mask that transfers the distilled knowledge\nacquired from the asymmetric paradigm to the teacher network. Equipped with\nWMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect\nand segment abnormal regions with fine structures and clear boundaries.\nExperimental results demonstrate that the proposed ADPS outperforms the\nstate-of-the-art methods in detecting and segmenting anomalies. Surprisingly,\nADPS significantly improves Average Precision (AP) metric by 9% and 20% on the\nMVTec AD and KolektorSDD2 datasets, respectively.\n","authors":["Peng Xing","Hao Tang","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2210.10495v2.pdf","comment":"11pages,9 figures"},{"id":"http://arxiv.org/abs/2307.11130v1","updated":"2023-07-20T16:07:02Z","published":"2023-07-20T16:07:02Z","title":"Frequency-aware optical coherence tomography image super-resolution via\n conditional generative adversarial neural network","summary":" Optical coherence tomography (OCT) has stimulated a wide range of medical\nimage-based diagnosis and treatment in fields such as cardiology and\nophthalmology. Such applications can be further facilitated by deep\nlearning-based super-resolution technology, which improves the capability of\nresolving morphological structures. However, existing deep learning-based\nmethod only focuses on spatial distribution and disregard frequency fidelity in\nimage reconstruction, leading to a frequency bias. To overcome this limitation,\nwe propose a frequency-aware super-resolution framework that integrates three\ncritical frequency-based modules (i.e., frequency transformation, frequency\nskip connection, and frequency alignment) and frequency-based loss function\ninto a conditional generative adversarial network (cGAN). We conducted a\nlarge-scale quantitative study from an existing coronary OCT dataset to\ndemonstrate the superiority of our proposed framework over existing deep\nlearning frameworks. In addition, we confirmed the generalizability of our\nframework by applying it to fish corneal images and rat retinal images,\ndemonstrating its capability to super-resolve morphological details in eye\nimaging.\n","authors":["Xueshen Li","Zhenxing Dong","Hongshan Liu","Jennifer J. Kang-Mieler","Yuye Ling","Yu Gan"],"pdf_url":"https://arxiv.org/pdf/2307.11130v1.pdf","comment":"13 pages, 7 figures, submitted to Biomedical Optics Express special\n issue"},{"id":"http://arxiv.org/abs/2307.11118v1","updated":"2023-07-20T14:37:30Z","published":"2023-07-20T14:37:30Z","title":"Diffusion Sampling with Momentum for Mitigating Divergence Artifacts","summary":" Despite the remarkable success of diffusion models in image generation, slow\nsampling remains a persistent issue. To accelerate the sampling process, prior\nstudies have reformulated diffusion sampling as an ODE/SDE and introduced\nhigher-order numerical methods. However, these methods often produce divergence\nartifacts, especially with a low number of sampling steps, which limits the\nachievable acceleration. In this paper, we investigate the potential causes of\nthese artifacts and suggest that the small stability regions of these methods\ncould be the principal cause. 
To address this issue, we propose two novel\ntechniques. The first technique involves the incorporation of Heavy Ball (HB)\nmomentum, a well-known technique for improving optimization, into existing\ndiffusion numerical methods to expand their stability regions. We also prove\nthat the resulting methods have first-order convergence. The second technique,\ncalled Generalized Heavy Ball (GHVB), constructs a new high-order method that\noffers a variable trade-off between accuracy and artifact suppression.\nExperimental results show that our techniques are highly effective in reducing\nartifacts and improving image quality, surpassing state-of-the-art diffusion\nsolvers on both pixel-based and latent-based diffusion models for low-step\nsampling. Our research provides novel insights into the design of numerical\nmethods for future diffusion work.\n","authors":["Suttisak Wizadwongsa","Worameth Chinchuthakun","Pramook Khungurn","Amit Raj","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2307.11118v1.pdf","comment":"Project page: https://github.com/sWizad/momentum-diffusion"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.11019v1","updated":"2023-07-20T16:46:10Z","published":"2023-07-20T16:46:10Z","title":"Investigating the Factual Knowledge Boundary of Large Language Models\n with Retrieval Augmentation","summary":" Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require\na substantial amount of factual knowledge and often rely on external\ninformation for assistance. Recently, large language models (LLMs) (e.g.,\nChatGPT), have demonstrated impressive prowess in solving a wide range of tasks\nwith world knowledge, including knowledge-intensive tasks. However, it remains\nunclear how well LLMs are able to perceive their factual knowledge boundaries,\nparticularly how they behave when incorporating retrieval augmentation. In this\nstudy, we present an initial analysis of the factual knowledge boundaries of\nLLMs and how retrieval augmentation affects LLMs on open-domain QA. Specially,\nwe focus on three primary research questions and analyze them by examining QA\nperformance, priori judgement and posteriori judgement of LLMs. We show\nevidence that LLMs possess unwavering confidence in their capabilities to\nrespond to questions and the accuracy of their responses. Furthermore,\nretrieval augmentation proves to be an effective approach in enhancing LLMs'\nawareness of knowledge boundaries, thereby improving their judgemental\nabilities. Additionally, we also find that LLMs have a propensity to rely on\nthe provided retrieval results when formulating answers, while the quality of\nthese results significantly impacts their reliance. The code to reproduce this\nwork is available at https://github.com/RUCAIBox/LLM-Knowledge-Boundary.\n","authors":["Ruiyang Ren","Yuhao Wang","Yingqi Qu","Wayne Xin Zhao","Jing Liu","Hao Tian","Hua Wu","Ji-Rong Wen","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.11876v3","updated":"2023-07-20T10:42:36Z","published":"2021-05-25T12:23:24Z","title":"Criterion-based Heterogeneous Collaborative Filtering for Multi-behavior\n Implicit Recommendation","summary":" Recent years have witnessed the explosive growth of interaction behaviors in\nmultimedia information systems, where multi-behavior recommender systems have\nreceived increasing attention by leveraging data from various auxiliary\nbehaviors such as tip and collect. 
Among various multi-behavior recommendation\nmethods, non-sampling methods have shown superiority over negative sampling\nmethods. However, two observations are usually ignored in existing\nstate-of-the-art non-sampling methods based on binary regression: (1) users\nhave different preference strengths for different items, so they cannot be\nmeasured simply by binary implicit data; (2) the dependency across multiple\nbehaviors varies for different users and items. To tackle the above issue, we\npropose a novel non-sampling learning framework named Criterion-guided\nHeterogeneous Collaborative Filtering (CHCF). CHCF introduces both upper and\nlower thresholds to indicate selection criteria, which will guide user\npreference learning. Besides, CHCF integrates criterion learning and user\npreference learning into a unified framework, which can be trained jointly for\nthe interaction prediction of the target behavior. We further theoretically\ndemonstrate that the optimization of Collaborative Metric Learning can be\napproximately achieved by the CHCF learning framework in a non-sampling form\neffectively. Extensive experiments on three real-world datasets show the\neffectiveness of CHCF in heterogeneous scenarios.\n","authors":["Xiao Luo","Daqing Wu","Yiyang Gu","Chong Chen","Luchen Liu","Jinwen Ma","Ming Zhang","Minghua Deng","Jianqiang Huang","Xian-Sheng Hua"],"pdf_url":"https://arxiv.org/pdf/2105.11876v3.pdf","comment":"Accepted by ACM Transactions on Knowledge Discovery from Data (TKDD)"},{"id":"http://arxiv.org/abs/2307.10747v1","updated":"2023-07-20T10:19:47Z","published":"2023-07-20T10:19:47Z","title":"Enhancing Job Recommendation through LLM-based Generative Adversarial\n Networks","summary":" Recommending suitable jobs to users is a critical task in online recruitment\nplatforms, as it can enhance users' satisfaction and the platforms'\nprofitability. While existing job recommendation methods encounter challenges\nsuch as the low quality of users' resumes, which hampers their accuracy and\npractical effectiveness. With the rapid development of large language models\n(LLMs), utilizing the rich external knowledge encapsulated within them, as well\nas their powerful capabilities of text processing and reasoning, is a promising\nway to complete users' resumes for more accurate recommendations. However,\ndirectly leveraging LLMs to enhance recommendation results is not a\none-size-fits-all solution, as LLMs may suffer from fabricated generation and\nfew-shot problems, which degrade the quality of resume completion. In this\npaper, we propose a novel LLM-based approach for job recommendation. To\nalleviate the limitation of fabricated generation for LLMs, we extract accurate\nand valuable information beyond users' self-description, which helps the LLMs\nbetter profile users for resume completion. Specifically, we not only extract\nusers' explicit properties (e.g., skills, interests) from their\nself-description but also infer users' implicit characteristics from their\nbehaviors for more accurate and meaningful resume completion. Nevertheless,\nsome users still suffer from few-shot problems, which arise due to scarce\ninteraction records, leading to limited guidance for the models in generating\nhigh-quality resumes. To address this issue, we propose aligning unpaired\nlow-quality with high-quality generated resumes by Generative Adversarial\nNetworks (GANs), which can refine the resume representations for better\nrecommendation results. 
Extensive experiments on three large real-world\nrecruitment datasets demonstrate the effectiveness of our proposed method.\n","authors":["Yingpeng Du","Di Luo","Rui Yan","Hongzhi Liu","Yang Song","Hengshu Zhu","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10747v1.pdf","comment":"13 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2011.00696v2","updated":"2023-07-20T08:56:26Z","published":"2020-11-02T03:07:38Z","title":"ABNIRML: Analyzing the Behavior of Neural IR Models","summary":" Pretrained contextualized language models such as BERT and T5 have\nestablished a new state-of-the-art for ad-hoc search. However, it is not yet\nwell-understood why these methods are so effective, what makes some variants\nmore effective than others, and what pitfalls they may have. We present a new\ncomprehensive framework for Analyzing the Behavior of Neural IR ModeLs\n(ABNIRML), which includes new types of diagnostic probes that allow us to test\nseveral characteristics -- such as writing styles, factuality, sensitivity to\nparaphrasing and word order -- that are not addressed by previous techniques.\nTo demonstrate the value of the framework, we conduct an extensive empirical\nstudy that yields insights into the factors that contribute to the neural\nmodel's gains, and identify potential unintended biases the models exhibit.\nSome of our results confirm conventional wisdom, like that recent neural\nranking models rely less on exact term overlap with the query, and instead\nleverage richer linguistic information, evidenced by their higher sensitivity\nto word and sentence order. Other results are more surprising, such as that\nsome models (e.g., T5 and ColBERT) are biased towards factually correct (rather\nthan simply relevant) texts. Further, some characteristics vary even for the\nsame base language model, and other characteristics can appear due to random\nvariations during model training.\n","authors":["Sean MacAvaney","Sergey Feldman","Nazli Goharian","Doug Downey","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2011.00696v2.pdf","comment":"TACL version"},{"id":"http://arxiv.org/abs/2307.10702v1","updated":"2023-07-20T08:47:54Z","published":"2023-07-20T08:47:54Z","title":"A Constraint-based Recommender System via RDF Knowledge Graphs","summary":" Knowledge graphs, represented in RDF, are able to model entities and their\nrelations by means of ontologies. The use of knowledge graphs for information\nmodeling has attracted interest in recent years. In recommender systems, items\nand users can be mapped and integrated into the knowledge graph, which can\nrepresent more links and relationships between users and items.\nConstraint-based recommender systems are based on the idea of explicitly\nexploiting deep recommendation knowledge through constraints to identify\nrelevant recommendations. When combined with knowledge graphs, a\nconstraint-based recommender system gains several benefits in terms of\nconstraint sets. In this paper, we investigate and propose the construction of\na constraint-based recommender system via RDF knowledge graphs applied to the\nvehicle purchase/sale domain. 
The results of our experiments show that the\nproposed approach is able to efficiently identify recommendations in accordance\nwith user preferences.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10680v1","updated":"2023-07-20T08:14:06Z","published":"2023-07-20T08:14:06Z","title":"A Personalized Recommender System Based-on Knowledge Graph Embeddings","summary":" Knowledge graphs have proven to be effective for modeling entities and their\nrelationships through the use of ontologies. The recent emergence in interest\nfor using knowledge graphs as a form of information modeling has led to their\nincreased adoption in recommender systems. By incorporating users and items\ninto the knowledge graph, these systems can better capture the implicit\nconnections between them and provide more accurate recommendations. In this\npaper, we investigate and propose the construction of a personalized\nrecommender system via knowledge graphs embedding applied to the vehicle\npurchase/sale domain. The results of our experimentation demonstrate the\nefficacy of the proposed method in providing relevant recommendations that are\nconsistent with individual users.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10650v1","updated":"2023-07-20T07:30:27Z","published":"2023-07-20T07:30:27Z","title":"Language-Enhanced Session-Based Recommendation with Decoupled\n Contrastive Learning","summary":" Session-based recommendation techniques aim to capture dynamic user behavior\nby analyzing past interactions. However, existing methods heavily rely on\nhistorical item ID sequences to extract user preferences, leading to challenges\nsuch as popular bias and cold-start problems. In this paper, we propose a\nhybrid multimodal approach for session-based recommendation to address these\nchallenges. Our approach combines different modalities, including textual\ncontent and item IDs, leveraging the complementary nature of these modalities\nusing CatBoost. To learn universal item representations, we design a language\nrepresentation-based item retrieval architecture that extracts features from\nthe textual content utilizing pre-trained language models. Furthermore, we\nintroduce a novel Decoupled Contrastive Learning method to enhance the\neffectiveness of the language representation. This technique decouples the\nsequence representation and item representation space, facilitating\nbidirectional alignment through dual-queue contrastive learning.\nSimultaneously, the momentum queue provides a large number of negative samples,\neffectively enhancing the effectiveness of contrastive learning. Our approach\nyielded competitive results, securing a 5th place ranking in KDD CUP 2023 Task\n1. 
We have released the source code and pre-trained models associated with this\nwork.\n","authors":["Zhipeng Zhang","Piao Tong","Yingwei Ma","Qiao Liu","Xujiang Liu","Xu Luo"],"pdf_url":"https://arxiv.org/pdf/2307.10650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10639v1","updated":"2023-07-20T07:08:25Z","published":"2023-07-20T07:08:25Z","title":"Improving Semantic Similarity Measure Within a Recommender System\n Based-on RDF Graphs","summary":" In today's era of information explosion, more users are becoming more reliant\nupon recommender systems to have better advice, suggestions, or inspire them.\nThe measure of the semantic relatedness or likeness between terms, words, or\ntext data plays an important role in different applications dealing with\ntextual data, as in a recommender system. Over the past few years, many\nontologies have been developed and used as a form of structured representation\nof knowledge bases for information systems. The measure of semantic similarity\nfrom ontology has developed by several methods. In this paper, we propose and\ncarry on an approach for the improvement of semantic similarity calculations\nwithin a recommender system based-on RDF graphs.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10617v1","updated":"2023-07-20T06:35:43Z","published":"2023-07-20T06:35:43Z","title":"Detecting deceptive reviews using text classification","summary":" In recent years, online reviews play a vital role for promoting any kind of\nproduct or services. Businesses may embed fake reviews in order to attract\ncustomers to purchase their products. They may even highlight the benefits of\ntheir own product or criticize the competition's product. Marketers,\nadvertisers, and other online business users have incentive to create fake\npositive reviews for products which they want to promote or give fake negative\nreviews for products which they really don't like. So now-a-days writing a\ndeceptive review is inevitable thing for promoting their own business or\ndegrading competitor's reputation. Thus, identifying deceptive reviews is an\nintense and on-going research area. This research paper proposes machine\nlearning model approach to identify deceptive reviews. The paper investigates\nthe performance of the several experiments done on a Deceptive Opinion Spam\nCorpus dataset of restaurants reviews. We developed a n-gram model and max\nfeatures to identify deceptive contents with a particular focus on fake\nreviews. Further, we conduct a benchmark study to investigate the performance\nof two different features extraction techniques and apply five machine learning\nclassification techniques. The experimental results show that passive\naggressive classifier outperforms other algorithms, and it reaches the highest\naccuracy not only in text classification but also to fake reviews. 
We also\nstudy the data augmentation and implement different deep learning techniques.\n","authors":["Anusuya Baby"],"pdf_url":"https://arxiv.org/pdf/2307.10617v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2306.11296v2","updated":"2023-07-20T02:20:35Z","published":"2023-06-20T05:20:29Z","title":"ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF\n Synthesis","summary":" We use prompt engineering to guide ChatGPT in the automation of text mining\nof metal-organic frameworks (MOFs) synthesis conditions from diverse formats\nand styles of the scientific literature. This effectively mitigates ChatGPT's\ntendency to hallucinate information -- an issue that previously made the use of\nLarge Language Models (LLMs) in scientific fields challenging. Our approach\ninvolves the development of a workflow implementing three different processes\nfor text mining, programmed by ChatGPT itself. All of them enable parsing,\nsearching, filtering, classification, summarization, and data unification with\ndifferent tradeoffs between labor, speed, and accuracy. We deploy this system\nto extract 26,257 distinct synthesis parameters pertaining to approximately 800\nMOFs sourced from peer-reviewed research articles. This process incorporates\nour ChemPrompt Engineering strategy to instruct ChatGPT in text mining,\nresulting in impressive precision, recall, and F1 scores of 90-99%.\nFurthermore, with the dataset built by text mining, we constructed a\nmachine-learning model with over 86% accuracy in predicting MOF experimental\ncrystallization outcomes and preliminarily identifying important factors in MOF\ncrystallization. We also developed a reliable data-grounded MOF chatbot to\nanswer questions on chemical reactions and synthesis procedures. Given that the\nprocess of using ChatGPT reliably mines and tabulates diverse MOF synthesis\ninformation in a unified format, while using only narrative language requiring\nno coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be\nvery useful across various other chemistry sub-disciplines.\n","authors":["Zhiling Zheng","Oufan Zhang","Christian Borgs","Jennifer T. Chayes","Omar M. Yaghi"],"pdf_url":"https://arxiv.org/pdf/2306.11296v2.pdf","comment":"Published on Journal of the American Chemical Society (2023); 102\n pages (18-page manuscript, 84 pages of supporting information)"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.11140v1","updated":"2023-07-20T17:52:47Z","published":"2023-07-20T17:52:47Z","title":"RCVaR: an Economic Approach to Estimate Cyberattacks Costs using Data\n from Industry Reports","summary":" Digitization increases business opportunities and the risk of companies being\nvictims of devastating cyberattacks. Therefore, managing risk exposure and\ncybersecurity strategies is essential for digitized companies that want to\nsurvive in competitive markets. However, understanding company-specific risks\nand quantifying their associated costs is not trivial. Current approaches fail\nto provide individualized and quantitative monetary estimations of\ncybersecurity impacts. Due to limited resources and technical expertise, SMEs\nand even large companies are affected and struggle to quantify their\ncyberattack exposure. Therefore, novel approaches must be placed to support the\nunderstanding of the financial loss due to cyberattacks. This article\nintroduces the Real Cyber Value at Risk (RCVaR), an economical approach for\nestimating cybersecurity costs using real-world information from public\ncybersecurity reports. RCVaR identifies the most significant cyber risk factors\nfrom various sources and combines their quantitative results to estimate\nspecific cyberattacks costs for companies. Furthermore, RCVaR extends current\nmethods to achieve cost and risk estimations based on historical real-world\ndata instead of only probability-based simulations. The evaluation of the\napproach on unseen data shows the accuracy and efficiency of the RCVaR in\npredicting and managing cyber risks. Thus, it shows that the RCVaR is a\nvaluable addition to cybersecurity planning and risk management processes.\n","authors":["Muriel Figueredo Franco","Fabian Künzler","Jan von der Assen","Chao Feng","Burkhard Stiller"],"pdf_url":"https://arxiv.org/pdf/2307.11140v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.11091v1","updated":"2023-07-20T17:59:59Z","published":"2023-07-20T17:59:59Z","title":"Data-driven criteria for quantum correlations","summary":" We build a machine learning model to detect correlations in a three-qubit\nsystem using a neural network trained in an unsupervised manner on randomly\ngenerated states. The network is forced to recognize separable states, and\ncorrelated states are detected as anomalies. Quite surprisingly, we find that\nthe proposed detector performs much better at distinguishing a weaker form of\nquantum correlations, namely, the quantum discord, than entanglement. In fact,\nit has a tendency to grossly overestimate the set of entangled states even at\nthe optimal threshold for entanglement detection, while it underestimates the\nset of discordant states to a much lesser extent. In order to illustrate the\nnature of states classified as quantum-correlated, we construct a diagram\ncontaining various types of states -- entangled, as well as separable, both\ndiscordant and non-discordant. 
We find that the near-zero value of the\nrecognition loss reproduces the shape of the non-discordant separable states\nwith high accuracy, especially considering the non-trivial shape of this set on\nthe diagram. The network architecture is designed carefully: it preserves\nseparability, and its output is equivariant with respect to qubit permutations.\nWe show that the choice of architecture is important to get the highest\ndetection accuracy, much better than for a baseline model that just utilizes a\npartial trace operation.\n","authors":["Mateusz Krawczyk","Jarosław Pawłowski","Maciej M. Maśka","Katarzyna Roszak"],"pdf_url":"https://arxiv.org/pdf/2307.11091v1.pdf","comment":"7 pages, 3 figures, 3 tables, and extra 5 pages of supplementary\n materials"},{"id":"http://arxiv.org/abs/2307.11086v1","updated":"2023-07-20T17:59:33Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\nforeground score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: geometry editing, object manipulation,\ntexture transfer, and exposure control. More results and code are available on\nour project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07269v2","updated":"2023-07-20T17:59:25Z","published":"2023-07-14T10:50:43Z","title":"Frequency Domain Adversarial Training for Robust Volumetric Medical\n Segmentation","summary":" It is imperative to ensure the robustness of deep learning models in critical\napplications such as healthcare. While recent advances in deep learning have\nimproved the performance of volumetric medical image segmentation models, these\nmodels cannot be deployed for real-world applications immediately due to their\nvulnerability to adversarial attacks. We present a 3D frequency domain\nadversarial attack for volumetric medical image segmentation models and\ndemonstrate its advantages over conventional input or voxel domain attacks.\nUsing our proposed attack, we introduce a novel frequency domain adversarial\ntraining approach for optimizing a robust model against voxel and frequency\ndomain attacks. Moreover, we propose a frequency consistency loss to regulate our\nfrequency domain adversarial training that achieves a better tradeoff between the\nmodel's performance on clean and adversarial samples. 
Code is publicly\navailable at https://github.com/asif-hanif/vafa.\n","authors":["Asif Hanif","Muzammal Naseer","Salman Khan","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.07269v2.pdf","comment":"This paper has been accepted in MICCAI 2023 conference"},{"id":"http://arxiv.org/abs/2301.13867v2","updated":"2023-07-20T17:59:14Z","published":"2023-01-31T18:59:03Z","title":"Mathematical Capabilities of ChatGPT","summary":" We investigate the mathematical capabilities of two iterations of ChatGPT\n(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on\npublicly available datasets, as well as hand-crafted ones, using a novel\nmethodology. In contrast to formal mathematics, where large databases of formal\nproofs are available (e.g., the Lean Mathematical Library), current datasets of\nnatural-language mathematics, used to benchmark language models, either cover\nonly elementary mathematics or are very small. We address this by publicly\nreleasing two new datasets: GHOSTS and miniGHOSTS. These are the first\nnatural-language datasets curated by working researchers in mathematics that\n(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of\nthe mathematical capabilities of language models, and (3) distinguish multiple\ndimensions of mathematical reasoning. These datasets also test whether ChatGPT\nand GPT-4 can be helpful assistants to professional mathematicians by emulating\nuse cases that arise in the daily professional activities of mathematicians. We\nbenchmark the models on a range of fine-grained performance metrics. For\nadvanced mathematics, this is the most detailed evaluation effort to date. We\nfind that ChatGPT can be used most successfully as a mathematical assistant for\nquerying facts, acting as a mathematical search engine and knowledge base\ninterface. GPT-4 can additionally be used for undergraduate-level mathematics\nbut fails on graduate-level difficulty. Contrary to many positive reports in\nthe media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of\nselection bias), their overall mathematical performance is well below the level\nof a graduate student. Hence, if your goal is to use ChatGPT to pass a\ngraduate-level math exam, you would be better off copying from your average\npeer!\n","authors":["Simon Frieder","Luca Pinchetti","Alexis Chevalier","Ryan-Rhys Griffiths","Tommaso Salvatori","Thomas Lukasiewicz","Philipp Christian Petersen","Julius Berner"],"pdf_url":"https://arxiv.org/pdf/2301.13867v2.pdf","comment":"Added further evaluations on another ChatGPT version and on GPT-4.\n The GHOSTS and miniGHOSTS datasets are available at\n https://github.com/xyfrieder/science-GHOSTS"},{"id":"http://arxiv.org/abs/2307.11085v1","updated":"2023-07-20T17:59:11Z","published":"2023-07-20T17:59:11Z","title":"Representation Learning in Anomaly Detection: Successes, Limits and a\n Grand Challenge","summary":" In this perspective paper, we argue that the dominant paradigm in anomaly\ndetection cannot scale indefinitely and will eventually hit fundamental limits.\nThis is due to a no free lunch principle for anomaly detection. These\nlimitations can be overcome when there are strong task priors, as is the case\nfor many industrial tasks. When such priors do not exist, the task is much\nharder for anomaly detection. 
We pose two such tasks as grand challenges for\nanomaly detection: i) scientific discovery by anomaly detection and ii) a\n\"mini-grand\" challenge of detecting the most anomalous image in the ImageNet\ndataset. We believe new anomaly detection tools and ideas would need to be\ndeveloped to overcome these challenges.\n","authors":["Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2307.11085v1.pdf","comment":"Keynote talk at the Visual Anomaly and Novelty Detection Workshop,\n CVPR'23"},{"id":"http://arxiv.org/abs/2205.09208v2","updated":"2023-07-20T17:57:36Z","published":"2022-05-18T20:34:25Z","title":"Torchhd: An Open Source Python Library to Support Research on\n Hyperdimensional Computing and Vector Symbolic Architectures","summary":" Hyperdimensional computing (HD), also known as vector symbolic architectures\n(VSA), is a framework for computing with distributed representations by\nexploiting properties of random high-dimensional vector spaces. The commitment\nof the scientific community to aggregate and disseminate research in this\nparticularly multidisciplinary area has been fundamental for its advancement.\nJoining these efforts, we present Torchhd, a high-performance open source\nPython library for HD/VSA. Torchhd seeks to make HD/VSA more accessible and\nserves as an efficient foundation for further research and application\ndevelopment. The easy-to-use library builds on top of PyTorch and features\nstate-of-the-art HD/VSA functionality, clear documentation, and implementation\nexamples from well-known publications. Comparing publicly available code with\ntheir corresponding Torchhd implementation shows that experiments can run up to\n100x faster. Torchhd is available at:\nhttps://github.com/hyperdimensional-computing/torchhd.\n","authors":["Mike Heddes","Igor Nunes","Pere Vergés","Denis Kleyko","Danny Abraham","Tony Givargis","Alexandru Nicolau","Alexander Veidenbaum"],"pdf_url":"https://arxiv.org/pdf/2205.09208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer : Gated - Long, Short Sequence Transformer for Step\n Recognition in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken into account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from a sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11078v1","updated":"2023-07-20T17:55:17Z","published":"2023-07-20T17:55:17Z","title":"Brain2Music: Reconstructing Music from Human Brain Activity","summary":" The process of reconstructing experiences from human brain activity offers a\nunique lens into how the brain interprets and represents the world. In this\npaper, we introduce a method for reconstructing music from brain activity,\ncaptured using functional magnetic resonance imaging (fMRI). Our approach uses\neither music retrieval or the MusicLM music generation model conditioned on\nembeddings derived from fMRI data. The generated music resembles the musical\nstimuli that human subjects experienced, with respect to semantic properties\nlike genre, instrumentation, and mood. We investigate the relationship between\ndifferent components of MusicLM and brain activity through a voxel-wise\nencoding modeling analysis. Furthermore, we discuss which brain regions\nrepresent information derived from purely textual descriptions of music\nstimuli. We provide supplementary material including examples of the\nreconstructed music at https://google-research.github.io/seanet/brain2music\n","authors":["Timo I. Denk","Yu Takagi","Takuya Matsuyama","Andrea Agostinelli","Tomoya Nakai","Christian Frank","Shinji Nishimoto"],"pdf_url":"https://arxiv.org/pdf/2307.11078v1.pdf","comment":"Preprint; 21 pages; supplementary material:\n https://google-research.github.io/seanet/brain2music"},{"id":"http://arxiv.org/abs/2307.11077v1","updated":"2023-07-20T17:55:14Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v1.pdf","comment":"Accepted by ICCV 2023. 
Code and Models are publicly available.\n Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2307.11069v1","updated":"2023-07-20T17:52:19Z","published":"2023-07-20T17:52:19Z","title":"Effectiveness and predictability of in-network storage cache for\n scientific workflows","summary":" Large scientific collaborations often have multiple scientists accessing the\nsame set of files while doing different analyses, which create repeated\naccesses to the large amounts of shared data located far away. These data\naccesses have long latency due to distance and occupy the limited bandwidth\navailable over the wide-area network. To reduce the wide-area network traffic\nand the data access latency, regional data storage caches have been installed\nas a new networking service. To study the effectiveness of such a cache system\nin scientific applications, we examine the Southern California Petabyte Scale\nCache for a high-energy physics experiment. By examining about 3TB of\noperational logs, we show that this cache removed 67.6% of file requests from\nthe wide-area network and reduced the traffic volume on wide-area network by\n12.3TB (or 35.4%) an average day. The reduction in the traffic volume (35.4%)\nis less than the reduction in file counts (67.6%) because the larger files are\nless likely to be reused. Due to this difference in data access patterns, the\ncache system has implemented a policy to avoid evicting smaller files when\nprocessing larger files. We also build a machine learning model to study the\npredictability of the cache behavior. Tests show that this model is able to\naccurately predict the cache accesses, cache misses, and network throughput,\nmaking the model useful for future studies on resource provisioning and\nplanning.\n","authors":["Caitlin Sim","Kesheng Wu","Alex Sim","Inder Monga","Chin Guok","Frank Wurthwein","Diego Davila","Harvey Newman","Justas Balcas"],"pdf_url":"https://arxiv.org/pdf/2307.11069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11049v1","updated":"2023-07-20T17:30:37Z","published":"2023-07-20T17:30:37Z","title":"Breadcrumbs to the Goal: Goal-Conditioned Exploration from\n Human-in-the-Loop Feedback","summary":" Exploration and reward specification are fundamental and intertwined\nchallenges for reinforcement learning. Solving sequential decision-making tasks\nrequiring expansive exploration requires either careful design of reward\nfunctions or the use of novelty-seeking exploration bonuses. Human supervisors\ncan provide effective guidance in the loop to direct the exploration process,\nbut prior methods to leverage this guidance require constant synchronous\nhigh-quality human feedback, which is expensive and impractical to obtain. In\nthis work, we present a technique called Human Guided Exploration (HuGE), which\nuses low-quality feedback from non-expert users that may be sporadic,\nasynchronous, and noisy. HuGE guides exploration for reinforcement learning not\nonly in simulation but also in the real world, all without meticulous reward\nspecification. The key concept involves bifurcating human feedback and policy\nlearning: human feedback steers exploration, while self-supervised learning\nfrom the exploration data yields unbiased policies. This procedure can leverage\nnoisy, asynchronous human feedback to learn policies with no hand-crafted\nreward design or exploration bonuses. 
HuGE is able to learn a variety of\nchallenging multi-stage robotic navigation and manipulation tasks in simulation\nusing crowdsourced feedback from non-expert users. Moreover, this paradigm can\nbe scaled to learning directly on real-world robots, using occasional,\nasynchronous feedback from human supervisors.\n","authors":["Marcel Torne","Max Balsells","Zihan Wang","Samedh Desai","Tao Chen","Pulkit Agrawal","Abhishek Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.11049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11046v1","updated":"2023-07-20T17:28:01Z","published":"2023-07-20T17:28:01Z","title":"A Definition of Continual Reinforcement Learning","summary":" In this paper we develop a foundation for continual reinforcement learning.\n","authors":["David Abel","André Barreto","Benjamin Van Roy","Doina Precup","Hado van Hasselt","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11044v1","updated":"2023-07-20T17:27:29Z","published":"2023-07-20T17:27:29Z","title":"On the Convergence of Bounded Agents","summary":" When has an agent converged? Standard models of the reinforcement learning\nproblem give rise to a straightforward definition of convergence: An agent\nconverges when its behavior or performance in each environment state stops\nchanging. However, as we shift the focus of our learning problem from the\nenvironment's state to the agent's state, the concept of an agent's convergence\nbecomes significantly less clear. In this paper, we propose two complementary\naccounts of agent convergence in a framing of the reinforcement learning\nproblem that centers around bounded agents. The first view says that a bounded\nagent has converged when the minimal number of states needed to describe the\nagent's future behavior cannot decrease. The second view says that a bounded\nagent has converged just when the agent's performance only changes if the\nagent's internal state changes. We establish basic properties of these two\ndefinitions, show that they accommodate typical views of convergence in\nstandard settings, and prove several facts about their nature and relationship.\nWe take these perspectives, definitions, and analysis to bring clarity to a\ncentral idea of the field.\n","authors":["David Abel","André Barreto","Hado van Hasselt","Benjamin Van Roy","Doina Precup","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02719v3","updated":"2023-07-20T17:17:54Z","published":"2023-07-06T01:57:37Z","title":"Understanding Uncertainty Sampling","summary":" Uncertainty sampling is a prevalent active learning algorithm that queries\nsequentially the annotations of data samples which the current prediction model\nis uncertain about. However, the usage of uncertainty sampling has been largely\nheuristic: (i) There is no consensus on the proper definition of \"uncertainty\"\nfor a specific task under a specific loss; (ii) There is no theoretical\nguarantee that prescribes a standard protocol to implement the algorithm, for\nexample, how to handle the sequentially arrived annotated data under the\nframework of optimization algorithms such as stochastic gradient descent. In\nthis work, we systematically examine uncertainty sampling algorithms under both\nstream-based and pool-based active learning. 
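[Editorial illustration] For orientation, a minimal pool-based uncertainty-sampling loop of the kind the "Understanding Uncertainty Sampling" entry above analyzes might look as follows; the logistic model and the entropy-based uncertainty measure are placeholder choices for illustration, not the paper's equivalent-loss construction.

import numpy as np
from sklearn.linear_model import LogisticRegression

def uncertainty_sampling(X_pool, y_oracle, n_init=10, n_queries=50, seed=0):
    # Start from a small random labeled set, then repeatedly query the label
    # of the pool point whose predicted class distribution has highest entropy.
    rng = np.random.default_rng(seed)
    labeled = list(rng.choice(len(X_pool), size=n_init, replace=False))
    model = LogisticRegression(max_iter=1000)
    for _ in range(n_queries):
        model.fit(X_pool[labeled], y_oracle[labeled])
        probs = model.predict_proba(X_pool)
        entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
        entropy[labeled] = -np.inf            # never re-query an already labeled point
        labeled.append(int(entropy.argmax()))  # query the most uncertain sample
    return model, labeled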
We propose a notion of equivalent\nloss which depends on the used uncertainty measure and the original loss\nfunction and establish that an uncertainty sampling algorithm essentially\noptimizes against such an equivalent loss. The perspective verifies the\nproperness of existing uncertainty measures from two aspects: surrogate\nproperty and loss convexity. Furthermore, we propose a new notion for designing\nuncertainty measures called \\textit{loss as uncertainty}. The idea is to use\nthe conditional expected loss given the features as the uncertainty measure.\nSuch an uncertainty measure has nice analytical properties and generality to\ncover both classification and regression problems, which enable us to provide\nthe first generalization bound for uncertainty sampling algorithms under both\nstream-based and pool-based settings, in the full generality of the underlying\nmodel and problem. Lastly, we establish connections between certain variants of\nthe uncertainty sampling algorithms with risk-sensitive objectives and\ndistributional robustness, which can partly explain the advantage of\nuncertainty sampling algorithms when the sample size is small.\n","authors":["Shang Liu","Xiaocheng Li"],"pdf_url":"https://arxiv.org/pdf/2307.02719v3.pdf","comment":"Update: add numerical illustrations and experiments; correct some\n typos and modify the numbering"},{"id":"http://arxiv.org/abs/2307.11031v1","updated":"2023-07-20T17:07:28Z","published":"2023-07-20T17:07:28Z","title":"Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot\n Classification","summary":" Recent work has shown that language models' (LMs) prompt-based learning\ncapabilities make them well suited for automating data labeling in domains\nwhere manual annotation is expensive. The challenge is that while writing an\ninitial prompt is cheap, improving a prompt is costly -- practitioners often\nrequire significant labeled data in order to evaluate the impact of prompt\nmodifications. Our work asks whether it is possible to improve prompt-based\nlearning without additional labeled data. We approach this problem by\nattempting to modify the predictions of a prompt, rather than the prompt\nitself. Our intuition is that accurate predictions should also be consistent:\nsamples which are similar under some feature representation should receive the\nsame prompt prediction. We propose Embroid, a method which computes multiple\nrepresentations of a dataset under different embedding functions, and uses the\nconsistency between the LM predictions for neighboring samples to identify\nmispredictions. Embroid then uses these neighborhoods to create additional\npredictions for each sample, and combines these predictions with a simple\nlatent variable graphical model in order to generate a final corrected\nprediction. In addition to providing a theoretical analysis of Embroid, we\nconduct a rigorous empirical evaluation across six different LMs and up to 95\ndifferent tasks. We find that (1) Embroid substantially improves performance\nover original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also\nrealizes improvements for more sophisticated prompting strategies (e.g.,\nchain-of-thought), and (3) can be specialized to domains like law through the\nembedding functions.\n","authors":["Neel Guha","Mayee F. 
Chen","Kush Bhatia","Azalia Mirhoseini","Frederic Sala","Christopher Ré"],"pdf_url":"https://arxiv.org/pdf/2307.11031v1.pdf","comment":"38 pages, 22 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.11030v1","updated":"2023-07-20T17:05:51Z","published":"2023-07-20T17:05:51Z","title":"Cluster-aware Semi-supervised Learning: Relational Knowledge\n Distillation Provably Learns Clustering","summary":" Despite the empirical success and practical significance of (relational)\nknowledge distillation that matches (the relations of) features between teacher\nand student models, the corresponding theoretical interpretations remain\nlimited for various knowledge distillation paradigms. In this work, we take an\ninitial step toward a theoretical understanding of relational knowledge\ndistillation (RKD), with a focus on semi-supervised classification problems. We\nstart by casting RKD as spectral clustering on a population-induced graph\nunveiled by a teacher model. Via a notion of clustering error that quantifies\nthe discrepancy between the predicted and ground truth clusterings, we\nillustrate that RKD over the population provably leads to low clustering error.\nMoreover, we provide a sample complexity bound for RKD with limited unlabeled\nsamples. For semi-supervised learning, we further demonstrate the label\nefficiency of RKD through a general framework of cluster-aware semi-supervised\nlearning that assumes low clustering errors. Finally, by unifying data\naugmentation consistency regularization into this cluster-aware framework, we\nshow that despite the common effect of learning accurate clusterings, RKD\nfacilitates a \"global\" perspective through spectral clustering, whereas\nconsistency regularization focuses on a \"local\" perspective via expansion.\n","authors":["Yijun Dong","Kevin Miller","Qi Lei","Rachel Ward"],"pdf_url":"https://arxiv.org/pdf/2307.11030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05610v2","updated":"2023-07-20T16:46:36Z","published":"2023-05-09T17:01:17Z","title":"Can point cloud networks learn statistical shape models of anatomies?","summary":" Statistical Shape Modeling (SSM) is a valuable tool for investigating and\nquantifying anatomical variations within populations of anatomies. However,\ntraditional correspondence-based SSM generation methods have a prohibitive\ninference process and require complete geometric proxies (e.g., high-resolution\nbinary volumes or surface meshes) as input shapes to construct the SSM.\nUnordered 3D point cloud representations of shapes are more easily acquired\nfrom various medical imaging practices (e.g., thresholded images and surface\nscanning). Point cloud deep networks have recently achieved remarkable success\nin learning permutation-invariant features for different point cloud tasks\n(e.g., completion, semantic segmentation, classification). However, their\napplication to learning SSM from point clouds is to-date unexplored. In this\nwork, we demonstrate that existing point cloud encoder-decoder-based completion\nnetworks can provide an untapped potential for SSM, capturing population-level\nstatistical representations of shapes while reducing the inference burden and\nrelaxing the input requirement. We discuss the limitations of these techniques\nto the SSM application and suggest future improvements. 
Our work paves the way\nfor further exploration of point cloud deep learning for SSM, a promising\navenue for advancing shape analysis literature and broadening SSM to diverse\nuse cases.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05610v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 5 figures, appendix"},{"id":"http://arxiv.org/abs/2307.11018v1","updated":"2023-07-20T16:45:22Z","published":"2023-07-20T16:45:22Z","title":"Amortized Variational Inference: When and Why?","summary":" Amortized variational inference (A-VI) is a method for approximating the\nintractable posterior distributions that arise in probabilistic models. The\ndefining feature of A-VI is that it learns a global inference function that\nmaps each observation to its local latent variable's approximate posterior.\nThis stands in contrast to the more classical factorized (or mean-field)\nvariational inference (F-VI), which directly learns the parameters of the\napproximating distribution for each latent variable. In deep generative models,\nA-VI is used as a computational trick to speed up inference for local latent\nvariables. In this paper, we study A-VI as a general alternative to F-VI for\napproximate posterior inference. A-VI cannot produce an approximation with a\nlower Kullback-Leibler divergence than F-VI's optimal solution, because the\namortized family is a subset of the factorized family. Thus a central\ntheoretical problem is to characterize when A-VI still attains F-VI's optimal\nsolution. We derive conditions on both the model and the inference function\nunder which A-VI can theoretically achieve F-VI's optimum. We show that for a\nbroad class of hierarchical models, including deep generative models, it is\npossible to close the gap between A-VI and F-VI. Further, for an even broader\nclass of models, we establish when and how to expand the domain of the\ninference function to make amortization a feasible strategy. Finally, we prove\nthat for certain models -- including hidden Markov models and Gaussian\nprocesses -- A-VI cannot match F-VI's solution, no matter how expressive the\ninference function is. We also study A-VI empirically [...]\n","authors":["Charles C. Margossian","David M. Blei"],"pdf_url":"https://arxiv.org/pdf/2307.11018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11017v1","updated":"2023-07-20T16:45:16Z","published":"2023-07-20T16:45:16Z","title":"Multi-objective point cloud autoencoders for explainable myocardial\n infarction prediction","summary":" Myocardial infarction (MI) is one of the most common causes of death in the\nworld. Image-based biomarkers commonly used in the clinic, such as ejection\nfraction, fail to capture more complex patterns in the heart's 3D anatomy and\nthus limit diagnostic accuracy. In this work, we present the multi-objective\npoint cloud autoencoder as a novel geometric deep learning approach for\nexplainable infarction prediction, based on multi-class 3D point cloud\nrepresentations of cardiac anatomy and function. Its architecture consists of\nmultiple task-specific branches connected by a low-dimensional latent space to\nallow for effective multi-objective learning of both reconstruction and MI\nprediction, while capturing pathology-specific 3D shape information in an\ninterpretable latent space. Furthermore, its hierarchical branch design with\npoint cloud-based deep learning operations enables efficient multi-scale\nfeature learning directly on high-resolution anatomy point clouds. 
In our\nexperiments on a large UK Biobank dataset, the multi-objective point cloud\nautoencoder is able to accurately reconstruct multi-temporal 3D shapes with\nChamfer distances between predicted and input anatomies below the underlying\nimages' pixel resolution. Our method outperforms multiple machine learning and\ndeep learning benchmarks for the task of incident MI prediction by 19% in terms\nof Area Under the Receiver Operating Characteristic curve. In addition, its\ntask-specific compact latent space exhibits easily separable control and MI\nclusters with clinically plausible associations between subject encodings and\ncorresponding 3D shapes, thus demonstrating the explainability of the\nprediction.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.11017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01110v3","updated":"2023-07-20T16:44:47Z","published":"2022-07-03T20:07:00Z","title":"Data-Driven Modeling of Noise Time Series with Convolutional Generative\n Adversarial Networks","summary":" Random noise arising from physical processes is an inherent characteristic of\nmeasurements and a limiting factor for most signal processing and data analysis\ntasks. Given the recent interest in generative adversarial networks (GANs) for\ndata-driven modeling, it is important to determine to what extent GANs can\nfaithfully reproduce noise in target data sets. In this paper, we present an\nempirical investigation that aims to shed light on this issue for time series.\nNamely, we assess two general-purpose GANs for time series that are based on\nthe popular deep convolutional GAN (DCGAN) architecture, a direct time-series\nmodel and an image-based model that uses a short-time Fourier transform (STFT)\ndata representation. The GAN models are trained and quantitatively evaluated\nusing distributions of simulated noise time series with known ground-truth\nparameters. Target time series distributions include a broad range of noise\ntypes commonly encountered in physical measurements, electronics, and\ncommunication systems: band-limited thermal noise, power law noise, shot noise,\nand impulsive noise. We find that GANs are capable of learning many noise\ntypes, although they predictably struggle when the GAN architecture is not well\nsuited to some aspects of the noise, e.g., impulsive time-series with extreme\noutliers. Our findings provide insights into the capabilities and potential\nlimitations of current approaches to time-series GANs and highlight areas for\nfurther research. In addition, our battery of tests provides a useful benchmark\nto aid the development of deep generative models for time series.\n","authors":["Adam Wunderlich","Jack Sklar"],"pdf_url":"https://arxiv.org/pdf/2207.01110v3.pdf","comment":"27 pages, 20 figures"},{"id":"http://arxiv.org/abs/2302.06223v3","updated":"2023-07-20T16:40:14Z","published":"2023-02-13T09:54:50Z","title":"Variational Mixture of HyperGenerators for Learning Distributions Over\n Functions","summary":" Recent approaches build on implicit neural representations (INRs) to propose\ngenerative models over function spaces. However, they are computationally\ncostly when dealing with inference tasks, such as missing data imputation, or\ndirectly cannot tackle them. In this work, we propose a novel deep generative\nmodel, named VAMoH. VAMoH combines the capabilities of modeling continuous\nfunctions using INRs and the inference capabilities of Variational Autoencoders\n(VAEs). 
In addition, VAMoH relies on a normalizing flow to define the prior,\nand a mixture of hypernetworks to parametrize the data log-likelihood. This\ngives VAMoH a high expressive capability and interpretability. Through\nexperiments on a diverse range of data types, such as images, voxels, and\nclimate data, we show that VAMoH can effectively learn rich distributions over\ncontinuous functions. Furthermore, it can perform inference-related tasks, such\nas conditional super-resolution generation and in-painting, as well or better\nthan previous approaches, while being less computationally demanding.\n","authors":["Batuhan Koyuncu","Pablo Sanchez-Martin","Ignacio Peis","Pablo M. Olmos","Isabel Valera"],"pdf_url":"https://arxiv.org/pdf/2302.06223v3.pdf","comment":"Accepted at ICML 2023. Camera ready version"},{"id":"http://arxiv.org/abs/2012.07881v2","updated":"2023-07-20T16:38:57Z","published":"2020-12-14T19:02:26Z","title":"Perceptron Theory Can Predict the Accuracy of Neural Networks","summary":" Multilayer neural networks set the current state of the art for many\ntechnical classification problems. But, these networks are still, essentially,\nblack boxes in terms of analyzing them and predicting their performance. Here,\nwe develop a statistical theory for the one-layer perceptron and show that it\ncan predict performances of a surprisingly large variety of neural networks\nwith different architectures. A general theory of classification with\nperceptrons is developed by generalizing an existing theory for analyzing\nreservoir computing models and connectionist models for symbolic reasoning\nknown as vector symbolic architectures. Our statistical theory offers three\nformulas leveraging the signal statistics with increasing detail. The formulas\nare analytically intractable, but can be evaluated numerically. The description\nlevel that captures maximum details requires stochastic sampling methods.\nDepending on the network model, the simpler formulas already yield high\nprediction accuracy. The quality of the theory predictions is assessed in three\nexperimental settings, a memorization task for echo state networks (ESNs) from\nreservoir computing literature, a collection of classification datasets for\nshallow randomly connected networks, and the ImageNet dataset for deep\nconvolutional neural networks. We find that the second description level of the\nperceptron theory can predict the performance of types of ESNs, which could not\nbe described previously. The theory can predict deep multilayer neural networks\nby being applied to their output layer. While other methods for prediction of\nneural networks performance commonly require to train an estimator model, the\nproposed theory requires only the first two moments of the distribution of the\npostsynaptic sums in the output neurons. The perceptron theory compares\nfavorably to other methods that do not rely on training an estimator model.\n","authors":["Denis Kleyko","Antonello Rosato","E. Paxon Frady","Massimo Panella","Friedrich T. Sommer"],"pdf_url":"https://arxiv.org/pdf/2012.07881v2.pdf","comment":"16 pages, 14 figures"},{"id":"http://arxiv.org/abs/2307.11013v1","updated":"2023-07-20T16:38:18Z","published":"2023-07-20T16:38:18Z","title":"Flow Map Learning for Unknown Dynamical Systems: Overview,\n Implementation, and Benchmarks","summary":" Flow map learning (FML), in conjunction with deep neural networks (DNNs), has\nshown promises for data driven modeling of unknown dynamical systems. 
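[Editorial illustration] As a rough sketch of the flow map learning idea introduced in the entry above (not the authors' implementation, and ignoring the partially observed, memory-dependent case the next sentences discuss), a one-step flow-map surrogate can be trained on consecutive-state pairs and then rolled out by iterating x = model(x).

import torch
import torch.nn as nn

class FlowMapNet(nn.Module):
    # Residual one-step model: x_{n+1} is approximated by x_n + net(x_n).
    def __init__(self, dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, dim),
        )

    def forward(self, x):
        return x + self.net(x)

def train_flow_map(x_now, x_next, epochs=2000, lr=1e-3):
    # x_now, x_next: (N, dim) tensors of consecutive states sampled from trajectory data.
    model = FlowMapNet(x_now.shape[1])
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    for _ in range(epochs):
        opt.zero_grad()
        loss = nn.functional.mse_loss(model(x_now), x_next)
        loss.backward()
        opt.step()
    return model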
A\nremarkable feature of FML is that it is capable of producing accurate\npredictive models for partially observed systems, even when their exact\nmathematical models do not exist. In this paper, we present an overview of the\nFML framework, along with the important computational details for its\nsuccessful implementation. We also present a set of well defined benchmark\nproblems for learning unknown dynamical systems. All the numerical details of\nthese problems are presented, along with their FML results, to ensure that the\nproblems are accessible for cross-examination and the results are reproducible.\n","authors":["Victor Churchill","Dongbin Xiu"],"pdf_url":"https://arxiv.org/pdf/2307.11013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11011v1","updated":"2023-07-20T16:36:04Z","published":"2023-07-20T16:36:04Z","title":"Neuron Sensitivity Guided Test Case Selection for Deep Learning Testing","summary":" Deep Neural Networks~(DNNs) have been widely deployed in software to address\nvarious tasks~(e.g., autonomous driving, medical diagnosis). However, they\ncould also produce incorrect behaviors that result in financial losses and even\nthreaten human safety. To reveal the incorrect behaviors in DNN and repair\nthem, DNN developers often collect rich unlabeled datasets from the natural\nworld and label them to test the DNN models. However, properly labeling a large\nnumber of unlabeled datasets is a highly expensive and time-consuming task.\n To address the above-mentioned problem, we propose NSS, Neuron Sensitivity\nguided test case Selection, which can reduce the labeling time by selecting\nvaluable test cases from unlabeled datasets. NSS leverages the internal\nneuron's information induced by test cases to select valuable test cases, which\nhave high confidence in causing the model to behave incorrectly. We evaluate\nNSS with four widely used datasets and four well-designed DNN models compared\nto SOTA baseline methods. The results show that NSS performs well in assessing\nthe test cases' probability of fault triggering and model improvement\ncapabilities. Specifically, compared with baseline approaches, NSS obtains a\nhigher fault detection rate~(e.g., when selecting 5\\% test case from the\nunlabeled dataset in MNIST \\& LeNet1 experiment, NSS can obtain 81.8\\% fault\ndetection rate, 20\\% higher than baselines).\n","authors":["Dong Huang","Qingwen Bu","Yichao Fu","Yuhao Qing","Bocheng Xiao","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.11011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11007v1","updated":"2023-07-20T16:34:58Z","published":"2023-07-20T16:34:58Z","title":"Sharpness Minimization Algorithms Do Not Only Minimize Sharpness To\n Achieve Better Generalization","summary":" Despite extensive studies, the underlying reason as to why overparameterized\nneural networks can generalize remains elusive. Existing theory shows that\ncommon stochastic optimizers prefer flatter minimizers of the training loss,\nand thus a natural potential explanation is that flatness implies\ngeneralization. This work critically examines this explanation. Through\ntheoretical and empirical investigation, we identify the following three\nscenarios for two-layer ReLU networks: (1) flatness provably implies\ngeneralization; (2) there exist non-generalizing flattest models and sharpness\nminimization algorithms fail to generalize, and (3) perhaps most surprisingly,\nthere exist non-generalizing flattest models, but sharpness minimization\nalgorithms still generalize. 
Our results suggest that the relationship between\nsharpness and generalization subtly depends on the data distributions and the\nmodel architectures and sharpness minimization algorithms do not only minimize\nsharpness to achieve better generalization. This calls for the search for other\nexplanations for the generalization of over-parameterized neural networks.\n","authors":["Kaiyue Wen","Tengyu Ma","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2307.11007v1.pdf","comment":"34 pages,11 figures"},{"id":"http://arxiv.org/abs/2307.10999v1","updated":"2023-07-20T16:27:51Z","published":"2023-07-20T16:27:51Z","title":"Private Federated Learning with Autotuned Compression","summary":" We propose new techniques for reducing communication in private federated\nlearning without the need for setting or tuning compression rates. Our\non-the-fly methods automatically adjust the compression rate based on the error\ninduced during training, while maintaining provable privacy guarantees through\nthe use of secure aggregation and differential privacy. Our techniques are\nprovably instance-optimal for mean estimation, meaning that they can adapt to\nthe ``hardness of the problem\" with minimal interactivity. We demonstrate the\neffectiveness of our approach on real-world datasets by achieving favorable\ncompression rates without the need for tuning.\n","authors":["Enayat Ullah","Christopher A. Choquette-Choo","Peter Kairouz","Sewoong Oh"],"pdf_url":"https://arxiv.org/pdf/2307.10999v1.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2307.10997v1","updated":"2023-07-20T16:25:58Z","published":"2023-07-20T16:25:58Z","title":"DREAM: Domain-free Reverse Engineering Attributes of Black-box Model","summary":" Deep learning models are usually black boxes when deployed on machine\nlearning platforms. Prior works have shown that the attributes ($e.g.$, the\nnumber of convolutional layers) of a target black-box neural network can be\nexposed through a sequence of queries. There is a crucial limitation: these\nworks assume the dataset used for training the target model to be known\nbeforehand and leverage this dataset for model attribute attack. However, it is\ndifficult to access the training dataset of the target black-box model in\nreality. Therefore, whether the attributes of a target black-box model could be\nstill revealed in this case is doubtful. In this paper, we investigate a new\nproblem of Domain-agnostic Reverse Engineering the Attributes of a black-box\ntarget Model, called DREAM, without requiring the availability of the target\nmodel's training dataset, and put forward a general and principled framework by\ncasting this problem as an out of distribution (OOD) generalization problem. In\nthis way, we can learn a domain-agnostic model to inversely infer the\nattributes of a target black-box model with unknown training data. 
This makes\nour method one of the kinds that can gracefully apply to an arbitrary domain\nfor model attribute reverse engineering with strong generalization ability.\nExtensive experimental studies are conducted and the results validate the\nsuperiority of our proposed method over the baselines.\n","authors":["Rongqing Li","Jiaqi Yu","Changsheng Li","Wenhan Luo","Ye Yuan","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10994v1","updated":"2023-07-20T16:25:00Z","published":"2023-07-20T16:25:00Z","title":"Progressive distillation diffusion for raw music generation","summary":" This paper aims to apply a new deep learning approach to the task of\ngenerating raw audio files. It is based on diffusion models, a recent type of\ndeep generative model. This new type of method has recently shown outstanding\nresults with image generation. A lot of focus has been given to those models by\nthe computer vision community. On the other hand, really few have been given\nfor other types of applications such as music generation in waveform domain.\n In this paper the model for unconditional generating applied to music is\nimplemented: Progressive distillation diffusion with 1D U-Net. Then, a\ncomparison of different parameters of diffusion and their value in a full\nresult is presented. One big advantage of the methods implemented through this\nwork is the fact that the model is able to deal with progressing audio\nprocessing and generating , using transformation from 1-channel 128 x 384 to\n3-channel 128 x 128 mel-spectrograms and looped generation. The empirical\ncomparisons are realized across different self-collected datasets.\n","authors":["Svetlana Pavlova"],"pdf_url":"https://arxiv.org/pdf/2307.10994v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2207.12395v3","updated":"2023-07-20T16:21:58Z","published":"2022-07-25T17:58:09Z","title":"Tuning Stochastic Gradient Algorithms for Statistical Inference via\n Large-Sample Asymptotics","summary":" The tuning of stochastic gradient algorithms (SGAs) for optimization and\nsampling is often based on heuristics and trial-and-error rather than\ngeneralizable theory. We address this theory--practice gap by characterizing\nthe large-sample statistical asymptotics of SGAs via a joint\nstep-size--sample-size scaling limit. We show that iterate averaging with a\nlarge fixed step size is robust to the choice of tuning parameters and\nasymptotically has covariance proportional to that of the MLE sampling\ndistribution. We also prove a Bernstein--von Mises-like theorem to guide\ntuning, including for generalized posteriors that are robust to model\nmisspecification. Numerical experiments validate our results and\nrecommendations in realistic finite-sample regimes. Our work lays the\nfoundation for a systematic analysis of other stochastic gradient Markov chain\nMonte Carlo algorithms for a wide range of models.\n","authors":["Jeffrey Negrea","Jun Yang","Haoyue Feng","Daniel M. Roy","Jonathan H. Huggins"],"pdf_url":"https://arxiv.org/pdf/2207.12395v3.pdf","comment":"42 pgs"},{"id":"http://arxiv.org/abs/2307.10988v1","updated":"2023-07-20T16:18:33Z","published":"2023-07-20T16:18:33Z","title":"Investigating minimizing the training set fill distance in machine\n learning regression","summary":" Many machine learning regression methods leverage large datasets for training\npredictive models. However, using large datasets may not be feasible due to\ncomputational limitations or high labelling costs. 
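[Editorial illustration] The selection criterion named in the title of the entry that follows, the training set fill distance, is commonly reduced greedily by farthest-point selection; the sketch below is a generic version of that idea, not necessarily the paper's exact procedure, which the next sentences describe.

import numpy as np

def farthest_point_selection(X, k, seed=0):
    # Greedily add the point that is currently farthest from the selected set,
    # which keeps the fill distance (the largest such gap) small.
    rng = np.random.default_rng(seed)
    selected = [int(rng.integers(len(X)))]
    dist = np.linalg.norm(X - X[selected[0]], axis=1)   # distance to the nearest selected point
    for _ in range(k - 1):
        nxt = int(dist.argmax())                        # the worst-covered point so far
        selected.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(X - X[nxt], axis=1))
    return selected                                     # dist.max() approximates the achieved fill distance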
Therefore, sampling small\ntraining sets from large pools of unlabelled data points is essential to\nmaximize model performance while maintaining computational efficiency. In this\nwork, we study a sampling approach aimed to minimize the fill distance of the\nselected set. We derive an upper bound for the maximum expected prediction\nerror that linearly depends on the training set fill distance, conditional to\nthe knowledge of data features. For empirical validation, we perform\nexperiments using two regression models on two datasets. We empirically show\nthat selecting a training set by aiming to minimize the fill distance, thereby\nminimizing the bound, significantly reduces the maximum prediction error of\nvarious regression models, outperforming existing sampling approaches by a\nlarge margin.\n","authors":["Paolo Climaco","Jochen Garcke"],"pdf_url":"https://arxiv.org/pdf/2307.10988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09943v2","updated":"2023-07-20T16:11:39Z","published":"2023-07-19T12:35:16Z","title":"Impatient Bandits: Optimizing Recommendations for the Long-Term Without\n Delay","summary":" Recommender systems are a ubiquitous feature of online platforms.\nIncreasingly, they are explicitly tasked with increasing users' long-term\nsatisfaction. In this context, we study a content exploration task, which we\nformalize as a multi-armed bandit problem with delayed rewards. We observe that\nthere is an apparent trade-off in choosing the learning signal: Waiting for the\nfull reward to become available might take several weeks, hurting the rate at\nwhich learning happens, whereas measuring short-term proxy rewards reflects the\nactual long-term goal only imperfectly. We address this challenge in two steps.\nFirst, we develop a predictive model of delayed rewards that incorporates all\ninformation obtained to date. Full observations as well as partial (short or\nmedium-term) outcomes are combined through a Bayesian filter to obtain a\nprobabilistic belief. Second, we devise a bandit algorithm that takes advantage\nof this new predictive model. The algorithm quickly learns to identify content\naligned with long-term success by carefully balancing exploration and\nexploitation. We apply our approach to a podcast recommendation problem, where\nwe seek to identify shows that users engage with repeatedly over two months. We\nempirically validate that our approach results in substantially better\nperformance compared to approaches that either optimize for short-term proxies,\nor wait for the long-term outcome to be fully realized.\n","authors":["Thomas M. McDonald","Lucas Maystre","Mounia Lalmas","Daniel Russo","Kamil Ciosek"],"pdf_url":"https://arxiv.org/pdf/2307.09943v2.pdf","comment":"Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery\n and Data Mining (KDD '23)"},{"id":"http://arxiv.org/abs/2307.10982v1","updated":"2023-07-20T16:09:57Z","published":"2023-07-20T16:09:57Z","title":"MASR: Metadata Aware Speech Representation","summary":" In the recent years, speech representation learning is constructed primarily\nas a self-supervised learning (SSL) task, using the raw audio signal alone,\nwhile ignoring the side-information that is often available for a given speech\nrecording. In this paper, we propose MASR, a Metadata Aware Speech\nRepresentation learning framework, which addresses the aforementioned\nlimitations. MASR enables the inclusion of multiple external knowledge sources\nto enhance the utilization of meta-data information. 
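[Editorial illustration] As the next sentence of the MASR entry describes, the metadata enters as sample-level pair-wise similarity matrices; a toy construction of such a matrix is sketched below, where the speaker-identity field and the binary same/different rule are illustrative assumptions rather than the paper's actual knowledge sources.

import numpy as np

def metadata_similarity_matrix(speaker_ids):
    # S[i, j] = 1.0 if samples i and j share the metadata value, else 0.0,
    # e.g. metadata_similarity_matrix(["spk1", "spk2", "spk1"]) marks pairs (0, 2).
    ids = np.asarray(speaker_ids)
    return (ids[:, None] == ids[None, :]).astype(np.float32)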
The external knowledge\nsources are incorporated in the form of sample-level pair-wise similarity\nmatrices that are useful in a hard-mining loss. A key advantage of the MASR\nframework is that it can be combined with any choice of SSL method. Using MASR\nrepresentations, we perform evaluations on several downstream tasks such as\nlanguage identification, speech recognition and other non-semantic tasks such\nas speaker and emotion recognition. In these experiments, we illustrate\nsignificant performance improvements for the MASR over other established\nbenchmarks. We perform a detailed analysis on the language identification task\nto provide insights on how the proposed loss function enables the\nrepresentations to separate closely related languages.\n","authors":["Anjali Raj","Shikhar Bharadwaj","Sriram Ganapathy","Min Ma","Shikhar Vashishth"],"pdf_url":"https://arxiv.org/pdf/2307.10982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10981v1","updated":"2023-07-20T16:09:07Z","published":"2023-07-20T16:09:07Z","title":"PATROL: Privacy-Oriented Pruning for Collaborative Inference Against\n Model Inversion Attacks","summary":" Collaborative inference has been a promising solution to enable\nresource-constrained edge devices to perform inference using state-of-the-art\ndeep neural networks (DNNs). In collaborative inference, the edge device first\nfeeds the input to a partial DNN locally and then uploads the intermediate\nresult to the cloud to complete the inference. However, recent research\nindicates model inversion attacks (MIAs) can reconstruct input data from\nintermediate results, posing serious privacy concerns for collaborative\ninference. Existing perturbation and cryptography techniques are inefficient\nand unreliable in defending against MIAs while performing accurate inference.\nThis paper provides a viable solution, named PATROL, which develops\nprivacy-oriented pruning to balance privacy, efficiency, and utility of\ncollaborative inference. PATROL takes advantage of the fact that later layers\nin a DNN can extract more task-specific features. Given limited local resources\nfor collaborative inference, PATROL intends to deploy more layers at the edge\nbased on pruning techniques to enforce task-specific features for inference and\nreduce task-irrelevant but sensitive features for privacy preservation. To\nachieve privacy-oriented pruning, PATROL introduces two key components:\nLipschitz regularization and adversarial reconstruction training, which\nincrease the reconstruction errors by reducing the stability of MIAs and\nenhance the target inference model by adversarial training, respectively.\n","authors":["Shiwei Ding","Lan Zhang","Miao Pan","Xiaoyong Yuan"],"pdf_url":"https://arxiv.org/pdf/2307.10981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03017v3","updated":"2023-07-20T16:05:39Z","published":"2023-05-04T17:43:19Z","title":"Improving Code Example Recommendations on Informal Documentation Using\n BERT and Query-Aware LSH: A Comparative Study","summary":" Our research investigates the recommendation of code examples to aid software\ndevelopers, a practice that saves developers significant time by providing\nready-to-use code snippets. The focus of our study is Stack Overflow, a\ncommonly used resource for coding discussions and solutions, particularly in\nthe context of the Java programming language. 
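[Editorial illustration] The remainder of the code-recommendation entry below describes hashing BERT-derived embedding vectors with Locality-Sensitive Hashing; the sketch here is a generic random-hyperplane LSH index of the kind named as the "RH" baseline, with the 16-bit hash width and bucket layout as illustrative assumptions, and without the paper's Query-Aware variant. Candidates returned from a bucket would then be re-ranked, for example by cosine similarity of the embeddings.

import numpy as np
from collections import defaultdict

class RandomHyperplaneLSH:
    # Hash a vector to the sign pattern of its projections onto random
    # hyperplanes; vectors pointing in similar directions tend to share a bucket.
    def __init__(self, dim, n_bits=16, seed=0):
        rng = np.random.default_rng(seed)
        self.planes = rng.standard_normal((dim, n_bits))
        self.buckets = defaultdict(list)

    def _key(self, v):
        bits = (np.asarray(v) @ self.planes) >= 0
        return bits.astype(np.uint8).tobytes()

    def add(self, idx, v):
        self.buckets[self._key(v)].append(idx)

    def query(self, v):
        return self.buckets[self._key(v)]   # candidate neighbours from the same bucket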
We applied BERT, a powerful Large\nLanguage Model (LLM) that enables us to transform code examples into numerical\nvectors by extracting their semantic information. Once these numerical\nrepresentations are prepared, we identify Approximate Nearest Neighbors (ANN)\nusing Locality-Sensitive Hashing (LSH). Our research employed two variants of\nLSH: Random Hyperplane-based LSH and Query-Aware LSH. We rigorously compared\nthese two approaches across four parameters: HitRate, Mean Reciprocal Rank\n(MRR), Average Execution Time, and Relevance. Our study revealed that the\nQuery-Aware (QA) approach showed superior performance over the Random\nHyperplane-based (RH) method. Specifically, it exhibited a notable improvement\nof 20% to 35% in HitRate for query pairs compared to the RH approach.\nFurthermore, the QA approach proved significantly more time-efficient, with its\nspeed in creating hashing tables and assigning data samples to buckets being at\nleast four times faster. It can return code examples within milliseconds,\nwhereas the RH approach typically requires several seconds to recommend code\nexamples. Due to the superior performance of the QA approach, we tested it\nagainst PostFinder and FaCoY, the state-of-the-art baselines. Our QA method\nshowed comparable efficiency proving its potential for effective code\nrecommendation.\n","authors":["Sajjad Rahmani","AmirHossein Naghshzan","Latifa Guerrouj"],"pdf_url":"https://arxiv.org/pdf/2305.03017v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12619v2","updated":"2023-07-20T16:04:19Z","published":"2023-06-22T01:14:47Z","title":"Class-Incremental Learning based on Label Generation","summary":" Despite the great success of pre-trained language models, it is still a\nchallenge to use these models for continual learning, especially for the\nclass-incremental learning (CIL) setting due to catastrophic forgetting (CF).\nThis paper reports our finding that if we formulate CIL as a continual label\ngeneration problem, CF is drastically reduced and the generalizable\nrepresentations of pre-trained models can be better retained. We thus propose a\nnew CIL method (VAG) that also leverages the sparsity of vocabulary to focus\nthe generation and creates pseudo-replay samples by using label semantics.\nExperimental results show that VAG outperforms baselines by a large margin.\n","authors":["Yijia Shao","Yiduo Guo","Dongyan Zhao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.12619v2.pdf","comment":"12 pages, ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2307.10975v1","updated":"2023-07-20T16:04:07Z","published":"2023-07-20T16:04:07Z","title":"Globally Normalising the Transducer for Streaming Speech Recognition","summary":" The Transducer (e.g. RNN-Transducer or Conformer-Transducer) generates an\noutput label sequence as it traverses the input sequence. It is straightforward\nto use in streaming mode, where it generates partial hypotheses before the\ncomplete input has been seen. This makes it popular in speech recognition.\nHowever, in streaming mode the Transducer has a mathematical flaw which, simply\nput, restricts the model's ability to change its mind. The fix is to replace\nlocal normalisation (e.g. a softmax) with global normalisation, but then the\nloss function becomes impossible to evaluate exactly. 
A recent paper proposes\nto solve this by approximating the model, severely degrading performance.\nInstead, this paper proposes to approximate the loss function, allowing global\nnormalisation to apply to a state-of-the-art streaming model. Global\nnormalisation reduces its word error rate by 9-11% relative, closing almost\nhalf the gap between streaming and lookahead mode.\n","authors":["Rogier van Dalen"],"pdf_url":"https://arxiv.org/pdf/2307.10975v1.pdf","comment":"9 pages plus references and appendices"},{"id":"http://arxiv.org/abs/2210.06089v2","updated":"2023-07-20T16:01:03Z","published":"2022-10-12T11:04:22Z","title":"When are Local Queries Useful for Robust Learning?","summary":" Distributional assumptions have been shown to be necessary for the robust\nlearnability of concept classes when considering the exact-in-the-ball robust\nrisk and access to random examples by Gourdeau et al. (2019). In this paper, we\nstudy learning models where the learner is given more power through the use of\nlocal queries, and give the first distribution-free algorithms that perform\nrobust empirical risk minimization (ERM) for this notion of robustness. The\nfirst learning model we consider uses local membership queries (LMQ), where the\nlearner can query the label of points near the training sample. We show that,\nunder the uniform distribution, LMQs do not increase the robustness threshold\nof conjunctions and any superclass, e.g., decision lists and halfspaces. Faced\nwith this negative result, we introduce the local equivalence query\n($\mathsf{LEQ}$) oracle, which returns whether the hypothesis and target\nconcept agree in the perturbation region around a point in the training sample,\nas well as a counterexample if it exists. We show a separation result: on the\none hand, if the query radius $\lambda$ is strictly smaller than the\nadversary's perturbation budget $\rho$, then distribution-free robust learning\nis impossible for a wide variety of concept classes; on the other hand, the\nsetting $\lambda=\rho$ allows us to develop robust ERM algorithms. We then\nbound the query complexity of these algorithms based on online learning\nguarantees and further improve these bounds for the special case of\nconjunctions. We finish by giving robust learning algorithms for halfspaces on\n$\{0,1\}^n$ and then obtaining robustness guarantees for halfspaces in\n$\mathbb{R}^n$ against precision-bounded adversaries.\n","authors":["Pascale Gourdeau","Varun Kanade","Marta Kwiatkowska","James Worrell"],"pdf_url":"https://arxiv.org/pdf/2210.06089v2.pdf","comment":"Accepted to NeurIPS 2022; V2 contains new results (Section 3.6) and\n an erratum from the previous version (Appendix C)"},{"id":"http://arxiv.org/abs/2204.06362v2","updated":"2023-07-20T15:48:35Z","published":"2022-04-13T13:16:21Z","title":"A Review of Machine Learning Methods Applied to Structural Dynamics and\n Vibroacoustic","summary":" The use of Machine Learning (ML) has rapidly spread across several fields,\nhaving encountered many applications in Structural Dynamics and Vibroacoustic\n(SD\&V). The increasing capabilities of ML to unveil insights from data, driven\nby unprecedented data availability, algorithmic advances and computational\npower, enhance decision making, uncertainty handling, pattern recognition and\nreal-time assessments. Three main applications in SD\&V have taken advantage of\nthese benefits. In Structural Health Monitoring, ML detection and prognosis\nlead to safe operation and optimized maintenance schedules. 
System\nidentification and control design are leveraged by ML techniques in Active\nNoise Control and Active Vibration Control. Finally, the so-called ML-based\nsurrogate models provide fast alternatives to costly simulations, enabling\nrobust and optimized product design. Despite the many works in the area, they\nhave not been reviewed and analyzed. Therefore, to keep track and understand\nthis ongoing integration of fields, this paper presents a survey of ML\napplications in SD\\&V analyses, shedding light on the current state of\nimplementation and emerging opportunities. The main methodologies, advantages,\nlimitations, and recommendations based on scientific knowledge were identified\nfor each of the three applications. Moreover, the paper considers the role of\nDigital Twins and Physics Guided ML to overcome current challenges and power\nfuture research progress. As a result, the survey provides a broad overview of\nthe present landscape of ML applied in SD\\&V and guides the reader to an\nadvanced understanding of progress and prospects in the field.\n","authors":["Barbara Cunha","Christophe Droz","Abdelmalek Zine","Stéphane Foulard","Mohamed Ichchou"],"pdf_url":"https://arxiv.org/pdf/2204.06362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10936v1","updated":"2023-07-20T15:09:06Z","published":"2023-07-20T15:09:06Z","title":"PASTA: Pretrained Action-State Transformer Agents","summary":" Self-supervised learning has brought about a revolutionary paradigm shift in\nvarious computing domains, including NLP, vision, and biology. Recent\napproaches involve pre-training transformer models on vast amounts of unlabeled\ndata, serving as a starting point for efficiently solving downstream tasks. In\nthe realm of reinforcement learning, researchers have recently adapted these\napproaches by developing models pre-trained on expert trajectories, enabling\nthem to address a wide range of tasks, from robotics to recommendation systems.\nHowever, existing methods mostly rely on intricate pre-training objectives\ntailored to specific downstream applications. This paper presents a\ncomprehensive investigation of models we refer to as Pretrained Action-State\nTransformer Agents (PASTA). Our study uses a unified methodology and covers an\nextensive set of general downstream tasks including behavioral cloning, offline\nRL, sensor failure robustness, and dynamics change adaptation. Our goal is to\nsystematically compare various design choices and provide valuable insights to\npractitioners for building robust models. Key highlights of our study include\ntokenization at the action and state component level, using fundamental\npre-training objectives like next token prediction, training models across\ndiverse domains simultaneously, and using parameter efficient fine-tuning\n(PEFT). The developed models in our study contain fewer than 10 million\nparameters and the application of PEFT enables fine-tuning of fewer than 10,000\nparameters during downstream adaptation, allowing a broad community to use\nthese models and reproduce our experiments. 
We hope that this study will\nencourage further research into the use of transformers with first-principles\ndesign choices to represent RL trajectories and contribute to robust policy\nlearning.\n","authors":["Raphael Boige","Yannis Flet-Berliac","Arthur Flajolet","Guillaume Richard","Thomas Pierrot"],"pdf_url":"https://arxiv.org/pdf/2307.10936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10935v1","updated":"2023-07-20T15:07:49Z","published":"2023-07-20T15:07:49Z","title":"Inorganic synthesis-structure maps in zeolites with machine learning and\n crystallographic distances","summary":" Zeolites are inorganic materials known for their diversity of applications,\nsynthesis conditions, and resulting polymorphs. Although their synthesis is\ncontrolled both by inorganic and organic synthesis conditions, computational\nstudies of zeolite synthesis have focused mostly on organic template design. In\nthis work, we use a strong distance metric between crystal structures and\nmachine learning (ML) to create inorganic synthesis maps in zeolites. Starting\nwith 253 known zeolites, we show how the continuous distances between\nframeworks reproduce inorganic synthesis conditions from the literature without\nusing labels such as building units. An unsupervised learning analysis shows\nthat neighboring zeolites according to our metric often share similar inorganic\nsynthesis conditions, even in template-based routes. In combination with ML\nclassifiers, we find synthesis-structure relationships for 14 common inorganic\nconditions in zeolites, namely Al, B, Be, Ca, Co, F, Ga, Ge, K, Mg, Na, P, Si,\nand Zn. By explaining the model predictions, we demonstrate how\n(dis)similarities towards known structures can be used as features for the\nsynthesis space. Finally, we show how these methods can be used to predict\ninorganic synthesis conditions for unrealized frameworks in hypothetical\ndatabases and interpret the outcomes by extracting local structural patterns\nfrom zeolites. In combination with template design, this work can accelerate\nthe exploration of the space of synthesis conditions for zeolites.\n","authors":["Daniel Schwalbe-Koda","Daniel E. Widdowson","Tuan Anh Pham","Vitaliy A. Kurlin"],"pdf_url":"https://arxiv.org/pdf/2307.10935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10927v1","updated":"2023-07-20T14:56:29Z","published":"2023-07-20T14:56:29Z","title":"Modeling 3D cardiac contraction and relaxation with point cloud\n deformation networks","summary":" Global single-valued biomarkers of cardiac function typically used in\nclinical practice, such as ejection fraction, provide limited insight on the\ntrue 3D cardiac deformation process and hence, limit the understanding of both\nhealthy and pathological cardiac mechanics. In this work, we propose the Point\nCloud Deformation Network (PCD-Net) as a novel geometric deep learning approach\nto model 3D cardiac contraction and relaxation between the extreme ends of the\ncardiac cycle. It employs the recent advances in point cloud-based deep\nlearning into an encoder-decoder structure, in order to enable efficient\nmulti-scale feature learning directly on multi-class 3D point cloud\nrepresentations of the cardiac anatomy. We evaluate our approach on a large\ndataset of over 10,000 cases from the UK Biobank study and find average Chamfer\ndistances between the predicted and ground truth anatomies below the pixel\nresolution of the underlying image acquisition. 
Furthermore, we observe similar\nclinical metrics between predicted and ground truth populations and show that\nthe PCD-Net can successfully capture subpopulation-specific differences between\nnormal subjects and myocardial infarction (MI) patients. We then demonstrate\nthat the learned 3D deformation patterns outperform multiple clinical\nbenchmarks by 13% and 7% in terms of area under the receiver operating\ncharacteristic curve for the tasks of prevalent MI detection and incident MI\nprediction and by 7% in terms of Harrell's concordance index for MI survival\nanalysis.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.10927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10926v1","updated":"2023-07-20T14:52:45Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry out experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquax","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.10923v1","updated":"2023-07-20T14:49:58Z","published":"2023-07-20T14:49:58Z","title":"Sequential Multi-Dimensional Self-Supervised Learning for Clinical Time\n Series","summary":" Self-supervised learning (SSL) for clinical time series data has received\nsignificant attention in recent literature, since these data are highly rich\nand provide important information about a patient's physiological state.\nHowever, most existing SSL methods for clinical time series are limited in that\nthey are designed for unimodal time series, such as a sequence of structured\nfeatures (e.g., lab values and vitals signs) or an individual high-dimensional\nphysiological signal (e.g., an electrocardiogram). These existing methods\ncannot be readily extended to model time series that exhibit multimodality,\nwith structured features and high-dimensional data being recorded at each\ntimestep in the sequence. 
In this work, we address this gap and propose a new\nSSL method -- Sequential Multi-Dimensional SSL -- where a SSL loss is applied\nboth at the level of the entire sequence and at the level of the individual\nhigh-dimensional data points in the sequence in order to better capture\ninformation at both scales. Our strategy is agnostic to the specific form of\nloss function used at each level -- it can be contrastive, as in SimCLR, or\nnon-contrastive, as in VICReg. We evaluate our method on two real-world\nclinical datasets, where the time series contains sequences of (1)\nhigh-frequency electrocardiograms and (2) structured data from lab values and\nvitals signs. Our experimental results indicate that pre-training with our\nmethod and then fine-tuning on downstream tasks improves performance over\nbaselines on both datasets, and in several settings, can lead to improvements\nacross different self-supervised loss functions.\n","authors":["Aniruddh Raghu","Payal Chandak","Ridwan Alam","John Guttag","Collin M. Stultz"],"pdf_url":"https://arxiv.org/pdf/2307.10923v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.10922v1","updated":"2023-07-20T14:47:50Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14319v3","updated":"2023-07-20T14:37:12Z","published":"2022-12-29T14:28:32Z","title":"Gaussian Process Priors for Systems of Linear Partial Differential\n Equations with Constant Coefficients","summary":" Partial differential equations (PDEs) are important tools to model physical\nsystems and including them into machine learning models is an important way of\nincorporating physical knowledge. Given any system of linear PDEs with constant\ncoefficients, we propose a family of Gaussian process (GP) priors, which we\ncall EPGP, such that all realizations are exact solutions of this system. We\napply the Ehrenpreis-Palamodov fundamental principle, which works as a\nnon-linear Fourier transform, to construct GP kernels mirroring standard\nspectral methods for GPs. Our approach can infer probable solutions of linear\nPDE systems from any data such as noisy measurements, or pointwise defined\ninitial and boundary conditions. Constructing EPGP-priors is algorithmic,\ngenerally applicable, and comes with a sparse version (S-EPGP) that learns the\nrelevant spectral frequencies and works better for big data sets. 
We\ndemonstrate our approach on three families of systems of PDEs, the heat\nequation, wave equation, and Maxwell's equations, where we improve upon the\nstate of the art in computation time and precision, in some experiments by\nseveral orders of magnitude.\n","authors":["Marc Härkönen","Markus Lange-Hegermann","Bogdan Raiţă"],"pdf_url":"https://arxiv.org/pdf/2212.14319v3.pdf","comment":"26 pages, 8 figures; ICML 2023 (oral); updated with expanded\n appendices and ancillary files. Code available at\n https://github.com/haerski/EPGP. For animations, see\n https://mathrepo.mis.mpg.de/EPGP/index.html"},{"id":"http://arxiv.org/abs/2307.00405v2","updated":"2023-07-20T14:36:11Z","published":"2023-07-01T18:35:21Z","title":"Provably Efficient UCB-type Algorithms For Learning Predictive State\n Representations","summary":" The general sequential decision-making problem, which includes Markov\ndecision processes (MDPs) and partially observable MDPs (POMDPs) as special\ncases, aims at maximizing a cumulative reward by making a sequence of decisions\nbased on a history of observations and actions over time. Recent studies have\nshown that the sequential decision-making problem is statistically learnable if\nit admits a low-rank structure modeled by predictive state representations\n(PSRs). Despite these advancements, existing approaches typically involve\noracles or steps that are not computationally efficient. On the other hand, the\nupper confidence bound (UCB) based approaches, which have served successfully\nas computationally efficient methods in bandits and MDPs, have not been\ninvestigated for more general PSRs, due to the difficulty of optimistic bonus\ndesign in these more challenging settings. This paper proposes the first known\nUCB-type approach for PSRs, featuring a novel bonus term that upper bounds the\ntotal variation distance between the estimated and true models. We further\ncharacterize the sample complexity bounds for our designed UCB-type algorithms\nfor both online and offline PSRs. In contrast to existing approaches for PSRs,\nour UCB-type algorithms enjoy computational efficiency, last-iterate guaranteed\nnear-optimal policy, and guaranteed model accuracy.\n","authors":["Ruiquan Huang","Yingbin Liang","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2307.00405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10907v1","updated":"2023-07-20T14:29:51Z","published":"2023-07-20T14:29:51Z","title":"The Role of Entropy and Reconstruction in Multi-View Self-Supervised\n Learning","summary":" The mechanisms behind the success of multi-view self-supervised learning\n(MVSSL) are not yet fully understood. Contrastive MVSSL methods have been\nstudied through the lens of InfoNCE, a lower bound of the Mutual Information\n(MI). However, the relation between other MVSSL methods and MI remains unclear.\nWe consider a different lower bound on the MI consisting of an entropy and a\nreconstruction term (ER), and analyze the main MVSSL families through its lens.\nThrough this ER bound, we show that clustering-based methods such as\nDeepCluster and SwAV maximize the MI. We also re-interpret the mechanisms of\ndistillation-based approaches such as BYOL and DINO, showing that they\nexplicitly maximize the reconstruction term and implicitly encourage a stable\nentropy, and we confirm this empirically. 
We show that replacing the objectives\nof common MVSSL methods with this ER bound achieves competitive performance,\nwhile making them stable when training with smaller batch sizes or smaller\nexponential moving average (EMA) coefficients.\n Github repo: https://github.com/apple/ml-entropy-reconstruction.\n","authors":["Borja Rodríguez-Gálvez","Arno Blaas","Pau Rodríguez","Adam Goliński","Xavier Suau","Jason Ramapuram","Dan Busbridge","Luca Zappella"],"pdf_url":"https://arxiv.org/pdf/2307.10907v1.pdf","comment":"18 pages: 9 of main text, 2 of references, and 7 of supplementary\n material. Appears in the proceedings of ICML 2023"},{"id":"http://arxiv.org/abs/2110.05216v2","updated":"2023-07-20T14:29:07Z","published":"2021-10-11T12:32:56Z","title":"High-order Tensor Pooling with Attention for Action Recognition","summary":" We aim at capturing high-order statistics of feature vectors formed by a\nneural network, and propose end-to-end second- and higher-order pooling to form\na tensor descriptor. Tensor descriptors require a robust similarity measure due\nto low numbers of aggregated vectors and the burstiness phenomenon, when a\ngiven feature appears more/less frequently than statistically expected. The\nHeat Diffusion Process (HDP) on a graph Laplacian is closely related to the\nEigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix,\nwhose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN\nplay the same role, i.e., to boost or dampen the magnitude of the eigenspectrum\nthus preventing the burstiness. We equip higher-order tensors with EPN which\nacts as a spectral detector of higher-order occurrences to prevent burstiness.\nWe also prove that for a tensor of order r built from d dimensional feature\ndescriptors, such a detector gives the likelihood if at least one higher-order\noccurrence is 'projected' into one of binom(d,r) subspaces represented by the\ntensor; thus forming a tensor power normalization metric endowed with\nbinom(d,r) such 'detectors'. For experimental contributions, we apply several\nsecond- and higher-order pooling variants to action recognition, provide\npreviously not presented comparisons of such pooling variants, and show\nstate-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities.\n","authors":["Piotr Koniusz","Lei Wang","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2110.05216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v1","updated":"2023-07-20T14:18:44Z","published":"2023-07-20T14:18:44Z","title":"Variational Point Encoding Deformation for Dental Modeling","summary":" Digital dentistry has made significant advancements in recent years, yet\nnumerous challenges remain to be addressed. In this study, we release a new\nextensive dataset of tooth meshes to encourage further research. Additionally,\nwe propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable\nprobabilistic learning of point cloud representations. A key challenge in\nexisting latent variable models for point clouds is the lack of a 1-to-1\nmapping between input points and output points. Instead, they must rely on\noptimizing Chamfer distances, a metric that does not have a normalized\ndistributional counterpart, preventing its usage in probabilistic models. We\ndemonstrate that explicit minimization of Chamfer distances can be replaced by\na suitable encoder, which allows us to increase computational efficiency while\nsimplifying the probabilistic extension. 
Our experimental findings present\nempirical evidence demonstrating the superior performance of VF-Net over\nexisting models in terms of dental scan reconstruction and extrapolation.\nAdditionally, our investigation highlights the robustness of VF-Net's latent\nrepresentations. These results underscore the promising prospects of VF-Net as\nan effective and reliable method for point cloud reconstruction and analysis.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10892v1","updated":"2023-07-20T14:11:29Z","published":"2023-07-20T14:11:29Z","title":"Learning and Generalizing Polynomials in Simulation Metamodeling","summary":" The ability to learn polynomials and generalize out-of-distribution is\nessential for simulation metamodels in many disciplines of engineering, where\nthe time step updates are described by polynomials. While feed forward neural\nnetworks can fit any function, they cannot generalize out-of-distribution for\nhigher-order polynomials. Therefore, this paper collects and proposes\nmultiplicative neural network (MNN) architectures that are used as recursive\nbuilding blocks for approximating higher-order polynomials. Our experiments\nshow that MNNs are better than baseline models at generalizing, and their\nperformance in validation is true to their performance in out-of-distribution\ntests. In addition to MNN architectures, a simulation metamodeling approach is\nproposed for simulations with polynomial time step updates. For these\nsimulations, simulating a time interval can be performed in fewer steps by\nincreasing the step size, which entails approximating higher-order polynomials.\nWhile our approach is compatible with any simulation with polynomial time step\nupdates, a demonstration is shown for an epidemiology simulation model, which\nalso shows the inductive bias in MNNs for learning and generalizing\nhigher-order polynomials.\n","authors":["Jesper Hauch","Christoffer Riis","Francisco C. Pereira"],"pdf_url":"https://arxiv.org/pdf/2307.10892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10891v1","updated":"2023-07-20T14:10:40Z","published":"2023-07-20T14:10:40Z","title":"Syntactic vs Semantic Linear Abstraction and Refinement of Neural\n Networks","summary":" Abstraction is a key verification technique to improve scalability. However,\nits use for neural networks is so far extremely limited. Previous approaches\nfor abstracting classification networks replace several neurons with one of\nthem that is similar enough. We can classify the similarity as defined either\nsyntactically (using quantities on the connections between neurons) or\nsemantically (on the activation values of neurons for various inputs).\nUnfortunately, the previous approaches only achieve moderate reductions, when\nimplemented at all. In this work, we provide a more flexible framework where a\nneuron can be replaced with a linear combination of other neurons, improving\nthe reduction. We apply this approach both on syntactic and semantic\nabstractions, and implement and evaluate them experimentally. 
Further, we\nintroduce a refinement method for our abstractions, allowing for finding a\nbetter balance between reduction and precision.\n","authors":["Calvin Chau","Jan Křetínský","Stefanie Mohr"],"pdf_url":"https://arxiv.org/pdf/2307.10891v1.pdf","comment":"Accepted at ATVA 2023"},{"id":"http://arxiv.org/abs/2307.10890v1","updated":"2023-07-20T14:10:33Z","published":"2023-07-20T14:10:33Z","title":"Player-optimal Stable Regret for Bandit Learning in Matching Markets","summary":" The problem of matching markets has been studied for a long time in the\nliterature due to its wide range of applications. Finding a stable matching is\na common equilibrium objective in this problem. Since market participants are\nusually uncertain of their preferences, a rich line of recent works study the\nonline setting where one-side participants (players) learn their unknown\npreferences from iterative interactions with the other side (arms). Most\nprevious works in this line are only able to derive theoretical guarantees for\nplayer-pessimal stable regret, which is defined compared with the players'\nleast-preferred stable matching. However, under the pessimal stable matching,\nplayers only obtain the least reward among all stable matchings. To maximize\nplayers' profits, player-optimal stable matching would be the most desirable.\nThough \\citet{basu21beyond} successfully bring an upper bound for\nplayer-optimal stable regret, their result can be exponentially large if\nplayers' preference gap is small. Whether a polynomial guarantee for this\nregret exists is a significant but still open problem. In this work, we provide\na new algorithm named explore-then-Gale-Shapley (ETGS) and show that the\noptimal stable regret of each player can be upper bounded by $O(K\\log\nT/\\Delta^2)$ where $K$ is the number of arms, $T$ is the horizon and $\\Delta$\nis the players' minimum preference gap among the first $N+1$-ranked arms. This\nresult significantly improves previous works which either have a weaker\nplayer-pessimal stable matching objective or apply only to markets with special\nassumptions. When the preferences of participants satisfy some special\nconditions, our regret upper bound also matches the previously derived lower\nbound.\n","authors":["Fang Kong","Shuai Li"],"pdf_url":"https://arxiv.org/pdf/2307.10890v1.pdf","comment":"SODA 2023"},{"id":"http://arxiv.org/abs/2307.02405v2","updated":"2023-07-20T14:10:24Z","published":"2023-07-05T16:27:33Z","title":"$ν^2$-Flows: Fast and improved neutrino reconstruction in\n multi-neutrino final states with conditional normalizing flows","summary":" In this work we introduce $\\nu^2$-Flows, an extension of the $\\nu$-Flows\nmethod to final states containing multiple neutrinos. The architecture can\nnatively scale for all combinations of object types and multiplicities in the\nfinal state for any desired neutrino multiplicities. In $t\\bar{t}$ dilepton\nevents, the momenta of both neutrinos and correlations between them are\nreconstructed more accurately than when using the most popular standard\nanalytical techniques, and solutions are found for all events. Inference time\nis significantly faster than competing methods, and can be reduced further by\nevaluating in parallel on graphics processing units. We apply $\\nu^2$-Flows to\n$t\\bar{t}$ dilepton events and show that the per-bin uncertainties in unfolded\ndistributions is much closer to the limit of performance set by perfect\nneutrino reconstruction than standard techniques. 
For the chosen double\ndifferential observables $\nu^2$-Flows results in improved statistical\nprecision for each bin by a factor of 1.5 to 2 in comparison to the Neutrino\nWeighting method and up to a factor of four in comparison to the Ellipse\napproach.\n","authors":["John Andrew Raine","Matthew Leigh","Knut Zoch","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2307.02405v2.pdf","comment":"20 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2303.16716v2","updated":"2023-07-20T13:54:48Z","published":"2023-03-29T14:15:38Z","title":"Topological Point Cloud Clustering","summary":" We present Topological Point Cloud Clustering (TPCC), a new method to cluster\npoints in an arbitrary point cloud based on their contribution to global\ntopological features. TPCC synthesizes desirable features from spectral\nclustering and topological data analysis and is based on considering the\nspectral properties of a simplicial complex associated to the considered point\ncloud. As it is based on considering sparse eigenvector computations, TPCC is\nsimilarly easy to interpret and implement as spectral clustering. However, by\nfocusing not just on a single matrix associated to a graph created from the\npoint cloud data, but on a whole set of Hodge-Laplacians associated to an\nappropriately constructed simplicial complex, we can leverage a far richer set\nof topological features to characterize the data points within the point cloud\nand benefit from the relative robustness of topological techniques against\nnoise. We test the performance of TPCC on both synthetic and real-world data\nand compare it with classical spectral clustering.\n","authors":["Vincent P. Grande","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2303.16716v2.pdf","comment":"Accepted at the 40th International Conference on Machine Learning\n (ICML), 2023. Code available at\n https://git.rwth-aachen.de/netsci/publication-2023-topological-point-cloud-clustering"},{"id":"http://arxiv.org/abs/2306.14030v2","updated":"2023-07-20T13:54:05Z","published":"2023-06-24T18:17:38Z","title":"My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models\n and Evaluation Benchmarks","summary":" The research on code-mixed data is limited due to the unavailability of\ndedicated code-mixed datasets and pre-trained language models. In this work, we\nfocus on the low-resource Indian language Marathi which lacks any prior work in\ncode-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English\n(Mr-En) corpus with 10 million social media sentences for pretraining. We also\nrelease L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models\npre-trained on MeCorpus. Furthermore, for benchmarking, we present three\nsupervised datasets MeHate, MeSent, and MeLID for downstream tasks like\ncode-mixed Mr-En hate speech detection, sentiment analysis, and language\nidentification respectively. These evaluation datasets individually consist of\nmanually annotated ~12,000 Marathi-English code-mixed tweets. Ablations\nshow that the models trained on this novel corpus significantly outperform the\nexisting state-of-the-art BERT models. This is the first work that presents\nartifacts for code-mixed Marathi research. 
All datasets and models are publicly\nreleased at https://github.com/l3cube-pune/MarathiNLP .\n","authors":["Tanmay Chavan","Omkar Gokhale","Aditya Kane","Shantanu Patankar","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2306.14030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v1","updated":"2023-07-20T13:47:30Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" The popularity of point cloud deep models for safety-critical purposes has\nincreased, but the reliability and security of these models can be compromised\nby intentional or naturally occurring point cloud noise. To combat this issue,\nwe present a novel point cloud outlier removal method called PointCVaR, which\nempowers standard-trained models to eliminate additional outliers and restore\nthe data. Our approach begins by conducting attribution analysis to determine\nthe influence of each point on the model output, which we refer to as point\nrisk. We then optimize the process of filtering high-risk points using\nConditional Value at Risk (CVaR) as the objective. The rationale for this\napproach is based on the observation that noise points in point clouds tend to\ncluster in the tail of the risk distribution, with a low frequency but a high\nlevel of risk, resulting in significant interference with classification\nresults. Despite requiring no additional training effort, our method produces\nexceptional results in various removal-and-classification experiments for noisy\npoint clouds, which are corrupted by random noise, adversarial noise, and\nbackdoor trigger noise. Impressively, it achieves 87% accuracy in defense\nagainst the backdoor attack by removing triggers. Overall, the proposed\nPointCVaR effectively eliminates noise points and enhances point cloud\nclassification, making it a promising plug-in module for various models in\ndifferent scenarios.\n","authors":["Xinke Li","Junchi Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10870v1","updated":"2023-07-20T13:42:13Z","published":"2023-07-20T13:42:13Z","title":"Nonlinear Meta-Learning Can Guarantee Faster Rates","summary":" Many recent theoretical works on \\emph{meta-learning} aim to achieve\nguarantees in leveraging similar representational structures from related tasks\ntowards simplifying a target task. Importantly, the main aim in theory works on\nthe subject is to understand the extent to which convergence rates -- in\nlearning a common representation -- \\emph{may scale with the number $N$ of\ntasks} (as well as the number of samples per task). First steps in this setting\ndemonstrate this property when both the shared representation amongst tasks,\nand task-specific regression functions, are linear. This linear setting readily\nreveals the benefits of aggregating tasks, e.g., via averaging arguments. In\npractice, however, the representation is often highly nonlinear, introducing\nnontrivial biases in each task that cannot easily be averaged out as in the\nlinear case. In the present work, we derive theoretical guarantees for\nmeta-learning with nonlinear representations. 
In particular, assuming the\nshared nonlinearity maps to an infinite-dimensional RKHS, we show that\nadditional biases can be mitigated with careful regularization that leverages\nthe smoothness of task-specific regression functions.\n","authors":["Dimitri Meunier","Zhu Li","Arthur Gretton","Samory Kpotufe"],"pdf_url":"https://arxiv.org/pdf/2307.10870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10869v1","updated":"2023-07-20T13:41:26Z","published":"2023-07-20T13:41:26Z","title":"Performance Issue Identification in Cloud Systems with\n Relational-Temporal Anomaly Detection","summary":" Performance issues permeate large-scale cloud service systems, which can lead\nto huge revenue losses. To ensure reliable performance, it's essential to\naccurately identify and localize these issues using service monitoring metrics.\nGiven the complexity and scale of modern cloud systems, this task can be\nchallenging and may require extensive expertise and resources beyond the\ncapacity of individual humans. Some existing methods tackle this problem by\nanalyzing each metric independently to detect anomalies. However, this could\nincur overwhelming alert storms that are difficult for engineers to diagnose\nmanually. To pursue better performance, not only the temporal patterns of\nmetrics but also the correlation between metrics (i.e., relational patterns)\nshould be considered, which can be formulated as a multivariate metrics anomaly\ndetection problem. However, most of the studies fall short of extracting these\ntwo types of features explicitly. Moreover, there exist some unlabeled\nanomalies mixed in the training data, which may hinder the detection\nperformance. To address these limitations, we propose the Relational-Temporal\nAnomaly Detection Model (RTAnomaly) that combines the relational and temporal\ninformation of metrics. RTAnomaly employs a graph attention layer to learn the\ndependencies among metrics, which will further help pinpoint the anomalous\nmetrics that may cause the anomaly effectively. In addition, we exploit the\nconcept of positive unlabeled learning to address the issue of potential\nanomalies in the training data. To evaluate our method, we conduct experiments\non a public dataset and two industrial datasets. RTAnomaly outperforms all the\nbaseline models by achieving an average F1 score of 0.929 and Hit@3 of 0.920,\ndemonstrating its superiority.\n","authors":["Wenwei Gu","Jinyang Liu","Zhuangbin Chen","Jianping Zhang","Yuxin Su","Jiazhen Gu","Cong Feng","Zengyin Yang","Michael Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.10869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15] leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF, a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. 
Our framework comprises 1) an automatic method for evaluating\nquality of figure-caption pairs, and 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10865v1","updated":"2023-07-20T13:34:11Z","published":"2023-07-20T13:34:11Z","title":"Addressing caveats of neural persistence with deep graph persistence","summary":" Neural Persistence is a prominent measure for quantifying neural network\ncomplexity, proposed in the emerging field of topological data analysis in deep\nlearning. In this work, however, we find both theoretically and empirically\nthat the variance of network weights and spatial concentration of large weights\nare the main factors that impact neural persistence. Whilst this captures\nuseful information for linear classifiers, we find that no relevant spatial\nstructure is present in later layers of deep neural networks, making neural\npersistence roughly equivalent to the variance of weights. Additionally, the\nproposed averaging procedure across layers for deep neural networks does not\nconsider interaction between layers. Based on our analysis, we propose an\nextension of the filtration underlying neural persistence to the whole neural\nnetwork instead of single layers, which is equivalent to calculating neural\npersistence on one particular matrix. This yields our deep graph persistence\nmeasure, which implicitly incorporates persistent paths through the network and\nalleviates variance-related issues through standardisation. Code is available\nat https://github.com/ExplainableML/Deep-Graph-Persistence .\n","authors":["Leander Girrbach","Anders Christensen","Ole Winther","Zeynep Akata","A. Sophia Koepke"],"pdf_url":"https://arxiv.org/pdf/2307.10865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. 
To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: a novel attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. More videos and updates can be found on the\nproject page \\url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \\url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2307.09206v2","updated":"2023-07-20T13:29:27Z","published":"2023-07-18T12:42:59Z","title":"Context-Conditional Navigation with a Learning-Based Terrain- and\n Robot-Aware Dynamics Model","summary":" In autonomous navigation settings, several quantities can be subject to\nvariations. Terrain properties such as friction coefficients may vary over time\ndepending on the location of the robot. Also, the dynamics of the robot may\nchange due to, e.g., different payloads, changing the system's mass, or wear\nand tear, changing actuator gains or joint friction. An autonomous agent should\nthus be able to adapt to such variations. In this paper, we develop a novel\nprobabilistic, terrain- and robot-aware forward dynamics model, termed TRADYN,\nwhich is able to adapt to the above-mentioned variations. It builds on recent\nadvances in meta-learning forward dynamics models based on Neural Processes. We\nevaluate our method in a simulated 2D navigation setting with a unicycle-like\nrobot and different terrain layouts with spatially varying friction\ncoefficients. In our experiments, the proposed model exhibits lower prediction\nerror for the task of long-horizon trajectory prediction, compared to\nnon-adaptive ablation models. We also evaluate our model on the downstream task\nof navigation planning, which demonstrates improved performance in planning\ncontrol-efficient paths by taking robot and terrain properties into account.\n","authors":["Suresh Guttikonda","Jan Achterhold","Haolong Li","Joschka Boedecker","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2307.09206v2.pdf","comment":"\\copyright 2023 IEEE. Accepted for publication in European Conference\n on Mobile Robots (ECMR), 2023. Updated copyright statement"},{"id":"http://arxiv.org/abs/2211.04974v2","updated":"2023-07-20T13:11:13Z","published":"2022-11-09T15:39:32Z","title":"Leveraging Offline Data in Online Reinforcement Learning","summary":" Two central paradigms have emerged in the reinforcement learning (RL)\ncommunity: online RL and offline RL. In the online RL setting, the agent has no\nprior knowledge of the environment, and must interact with it in order to find\nan $\\epsilon$-optimal policy. In the offline RL setting, the learner instead\nhas access to a fixed dataset to learn from, but is unable to otherwise\ninteract with the environment, and must obtain the best policy it can from this\noffline data. 
Practical scenarios often motivate an intermediate setting: if we\nhave some set of offline data and, in addition, may also interact with the\nenvironment, how can we best use the offline data to minimize the number of\nonline interactions necessary to learn an $\\epsilon$-optimal policy?\n In this work, we consider this setting, which we call the \\textsf{FineTuneRL}\nsetting, for MDPs with linear structure. We characterize the necessary number\nof online samples needed in this setting given access to some offline dataset,\nand develop an algorithm, \\textsc{FTPedel}, which is provably optimal, up to\n$H$ factors. We show through an explicit example that combining offline data\nwith online interactions can lead to a provable improvement over either purely\noffline or purely online RL. Finally, our results illustrate the distinction\nbetween \\emph{verifiable} learning, the typical setting considered in online\nRL, and \\emph{unverifiable} learning, the setting often considered in offline\nRL, and show that there is a formal separation between these regimes.\n","authors":["Andrew Wagenmaker","Aldo Pacchiano"],"pdf_url":"https://arxiv.org/pdf/2211.04974v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10845v1","updated":"2023-07-20T13:07:41Z","published":"2023-07-20T13:07:41Z","title":"Self-paced Weight Consolidation for Continual Learning","summary":" Continual learning algorithms which keep the parameters of new tasks close to\nthat of previous tasks, are popular in preventing catastrophic forgetting in\nsequential task learning settings. However, 1) the performance for the new\ncontinual learner will be degraded without distinguishing the contributions of\npreviously learned tasks; 2) the computational cost will be greatly increased\nwith the number of tasks, since most existing algorithms need to regularize all\nprevious tasks when learning new tasks. To address the above challenges, we\npropose a self-paced Weight Consolidation (spWC) framework to attain robust\ncontinual learning via evaluating the discriminative contributions of previous\ntasks. To be specific, we develop a self-paced regularization to reflect the\npriorities of past tasks via measuring difficulty based on key performance\nindicator (i.e., accuracy). When encountering a new task, all previous tasks\nare sorted from \"difficult\" to \"easy\" based on the priorities. Then the\nparameters of the new continual learner will be learned via selectively\nmaintaining the knowledge amongst more difficult past tasks, which could well\novercome catastrophic forgetting with less computational cost. We adopt an\nalternative convex search to iteratively update the model parameters and\npriority weights in the bi-convex formulation. The proposed spWC framework is\nplug-and-play, which is applicable to most continual learning algorithms (e.g.,\nEWC, MAS and RCIL) in different directions (e.g., classification and\nsegmentation). 
Experimental results on several public benchmark datasets\ndemonstrate that our proposed framework can effectively improve performance\nwhen compared with other popular continual learning algorithms.\n","authors":["Wei Cong","Yang Cong","Gan Sun","Yuyang Liu","Jiahua Dong"],"pdf_url":"https://arxiv.org/pdf/2307.10845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10843v1","updated":"2023-07-20T13:04:26Z","published":"2023-07-20T13:04:26Z","title":"Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals\n for GPM: A U-Net Convolutional LSTM Architecture","summary":" This paper presents a deep learning architecture for nowcasting of\nprecipitation almost globally every 30 min with a 4-hour lead time. The\narchitecture fuses a U-Net and a convolutional long short-term memory (LSTM)\nneural network and is trained using data from the Integrated MultisatellitE\nRetrievals for GPM (IMERG) and a few key precipitation drivers from the Global\nForecast System (GFS). The impacts of different training loss functions,\nincluding the mean-squared error (regression) and the focal-loss\n(classification), on the quality of precipitation nowcasts are studied. The\nresults indicate that the regression network performs well in capturing light\nprecipitation (below 1.6 mm/hr), but the classification network can outperform\nthe regression network for nowcasting of precipitation extremes (>8 mm/hr), in\nterms of the critical success index (CSI). Using the Wasserstein distance, it\nis shown that the predicted precipitation by the classification network has a\ncloser class probability distribution to the IMERG than the regression network.\nIt is uncovered that the inclusion of the physical variables can improve\nprecipitation nowcasting, especially at longer lead times in both networks.\nTaking IMERG as a relative reference, a multi-scale analysis in terms of\nfractions skill score (FSS) shows that the nowcasting machine remains skillful\n(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For\nprecipitation rates greater than 4~mm/hr, only the classification network\nremains FSS-skillful on scales greater than 50 km within a 2-hour lead time.\n","authors":["Reyhaneh Rahimi","Ardeshir Ebtehaj","Ali Behrangi","Jackson Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10842v1","updated":"2023-07-20T13:02:45Z","published":"2023-07-20T13:02:45Z","title":"Label Calibration for Semantic Segmentation Under Domain Shift","summary":" Performance of a pre-trained semantic segmentation model is likely to\nsubstantially decrease on data from a new domain. We show a pre-trained model\ncan be adapted to unlabelled target domain data by calculating soft-label\nprototypes under the domain shift and making predictions according to the\nprototype closest to the vector with predicted class probabilities. The\nproposed adaptation procedure is fast, comes almost for free in terms of\ncomputational resources and leads to considerable performance improvements. 
We\ndemonstrate the benefits of such label calibration on the highly-practical\nsynthetic-to-real semantic segmentation problem.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10842v1.pdf","comment":"ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for\n Trustworthy ML"},{"id":"http://arxiv.org/abs/2207.02575v2","updated":"2023-07-20T12:59:44Z","published":"2022-07-06T10:42:57Z","title":"Instance-Dependent Near-Optimal Policy Identification in Linear MDPs via\n Online Experiment Design","summary":" While much progress has been made in understanding the minimax sample\ncomplexity of reinforcement learning (RL) -- the complexity of learning on the\n\"worst-case\" instance -- such measures of complexity often do not capture the\ntrue difficulty of learning. In practice, on an \"easy\" instance, we might hope\nto achieve a complexity far better than that achievable on the worst-case\ninstance. In this work we seek to understand the \"instance-dependent\"\ncomplexity of learning near-optimal policies (PAC RL) in the setting of RL with\nlinear function approximation. We propose an algorithm, \\textsc{Pedel}, which\nachieves a fine-grained instance-dependent measure of complexity, the first of\nits kind in the RL with function approximation setting, thereby capturing the\ndifficulty of learning on each particular problem instance. Through an explicit\nexample, we show that \\textsc{Pedel} yields provable gains over low-regret,\nminimax-optimal algorithms and that such algorithms are unable to hit the\ninstance-optimal rate. Our approach relies on a novel online experiment\ndesign-based procedure which focuses the exploration budget on the \"directions\"\nmost relevant to learning a near-optimal policy, and may be of independent\ninterest.\n","authors":["Andrew Wagenmaker","Kevin Jamieson"],"pdf_url":"https://arxiv.org/pdf/2207.02575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06092v2","updated":"2023-07-20T12:54:32Z","published":"2023-07-12T11:35:37Z","title":"Quantitative CLTs in Deep Neural Networks","summary":" We study the distribution of a fully connected neural network with random\nGaussian weights and biases in which the hidden layer widths are proportional\nto a large constant $n$. Under mild assumptions on the non-linearity, we obtain\nquantitative bounds on normal approximations valid at large but finite $n$ and\nany fixed network depth. Our theorems show both for the finite-dimensional\ndistributions and the entire process, that the distance between a random fully\nconnected network (and its derivatives) to the corresponding infinite width\nGaussian process scales like $n^{-\\gamma}$ for $\\gamma>0$, with the exponent\ndepending on the metric used to measure discrepancy. Our bounds are strictly\nstronger in terms of their dependence on network width than any previously\navailable in the literature; in the one-dimensional case, we also prove that\nthey are optimal, i.e., we establish matching lower bounds.\n","authors":["Stefano Favaro","Boris Hanin","Domenico Marinucci","Ivan Nourdin","Giovanni Peccati"],"pdf_url":"https://arxiv.org/pdf/2307.06092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10810v1","updated":"2023-07-20T12:20:18Z","published":"2023-07-20T12:20:18Z","title":"On Combining Expert Demonstrations in Imitation Learning via Optimal\n Transport","summary":" Imitation learning (IL) seeks to teach agents specific tasks through expert\ndemonstrations. 
One of the key approaches to IL is to define a distance between\nagent and expert and to find an agent policy that minimizes that distance.\nOptimal transport methods have been widely used in imitation learning as they\nprovide ways to measure meaningful distances between agent and expert\ntrajectories. However, the problem of how to optimally combine multiple expert\ndemonstrations has not been widely studied. The standard method is to simply\nconcatenate state (-action) trajectories, which is problematic when\ntrajectories are multi-modal. We propose an alternative method that uses a\nmulti-marginal optimal transport distance and enables the combination of\nmultiple and diverse state-trajectories in the OT sense, providing a more\nsensible geometric average of the demonstrations. Our approach enables an agent\nto learn from several experts, and its efficiency is analyzed on OpenAI Gym\ncontrol environments and demonstrates that the standard method is not always\noptimal.\n","authors":["Ilana Sebag","Samuel Cohen","Marc Peter Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2307.10810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15382v2","updated":"2023-07-20T12:18:49Z","published":"2022-11-24T13:21:36Z","title":"Neural Network Complexity of Chaos and Turbulence","summary":" Chaos and turbulence are complex physical phenomena, yet a precise definition\nof the complexity measure that quantifies them is still lacking. In this work\nwe consider the relative complexity of chaos and turbulence from the\nperspective of deep neural networks. We analyze a set of classification\nproblems, where the network has to distinguish images of fluid profiles in the\nturbulent regime from other classes of images such as fluid profiles in the\nchaotic regime, various constructions of noise and real world images. We\nanalyze incompressible as well as weakly compressible fluid flows. We quantify\nthe complexity of the computation performed by the network via the intrinsic\ndimensionality of the internal feature representations, and calculate the\neffective number of independent features which the network uses in order to\ndistinguish between classes. In addition to providing a numerical estimate of\nthe complexity of the computation, the measure also characterizes the neural\nnetwork processing at intermediate and final stages. We construct adversarial\nexamples and use them to identify the two point correlation spectra for the\nchaotic and turbulent vorticity as the feature used by the network for\nclassification.\n","authors":["Tim Whittaker","Romuald A. Janik","Yaron Oz"],"pdf_url":"https://arxiv.org/pdf/2211.15382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.01001v3","updated":"2023-07-20T12:17:50Z","published":"2021-06-02T07:53:54Z","title":"Warming up recurrent neural networks to maximise reachable\n multistability greatly improves learning","summary":" Training recurrent neural networks is known to be difficult when time\ndependencies become long. In this work, we show that most standard cells only\nhave one stable equilibrium at initialisation, and that learning on tasks with\nlong time dependencies generally occurs once the number of network stable\nequilibria increases; a property known as multistability. Multistability is\noften not easily attained by initially monostable networks, making learning of\nlong time dependencies between inputs and outputs difficult. 
This insight leads\nto the design of a novel way to initialise any recurrent cell connectivity\nthrough a procedure called \"warmup\" to improve its capability to learn\narbitrarily long time dependencies. This initialisation procedure is designed\nto maximise network reachable multistability, i.e., the number of equilibria\nwithin the network that can be reached through relevant input trajectories, in\nfew gradient steps. We show on several information restitution, sequence\nclassification, and reinforcement learning benchmarks that warming up greatly\nimproves learning speed and performance, for multiple recurrent cells, but\nsometimes impedes precision. We therefore introduce a double-layer architecture\ninitialised with a partial warmup that is shown to greatly improve learning of\nlong time dependencies while maintaining high levels of precision. This\napproach provides a general framework for improving learning abilities of any\nrecurrent cell when long time dependencies are present. We also show\nempirically that other initialisation and pretraining procedures from the\nliterature implicitly foster reachable multistability of recurrent cells.\n","authors":["Gaspard Lambrechts","Florent De Geeter","Nicolas Vecoven","Damien Ernst","Guillaume Drion"],"pdf_url":"https://arxiv.org/pdf/2106.01001v3.pdf","comment":"20 pages, 35 pages total, 38 figures"},{"id":"http://arxiv.org/abs/2307.10805v1","updated":"2023-07-20T12:16:26Z","published":"2023-07-20T12:16:26Z","title":"Communication-Efficient Split Learning via Adaptive Feature-Wise\n Compression","summary":" This paper proposes a novel communication-efficient split learning (SL)\nframework, named SplitFC, which reduces the communication overhead required for\ntransmitting intermediate feature and gradient vectors during the SL training\nprocess. The key idea of SplitFC is to leverage different dispersion degrees\nexhibited in the columns of the matrices. SplitFC incorporates two compression\nstrategies: (i) adaptive feature-wise dropout and (ii) adaptive feature-wise\nquantization. In the first strategy, the intermediate feature vectors are\ndropped with adaptive dropout probabilities determined based on the standard\ndeviation of these vectors. Then, by the chain rule, the intermediate gradient\nvectors associated with the dropped feature vectors are also dropped. In the\nsecond strategy, the non-dropped intermediate feature and gradient vectors are\nquantized using adaptive quantization levels determined based on the ranges of\nthe vectors. To minimize the quantization error, the optimal quantization\nlevels of this strategy are derived in a closed-form expression. Simulation\nresults on the MNIST, CIFAR-10, and CelebA datasets demonstrate that SplitFC\nprovides more than a 5.6% increase in classification accuracy compared to\nstate-of-the-art SL frameworks, while they require 320 times less communication\noverhead compared to the vanilla SL framework without compression.\n","authors":["Yongjeong Oh","Jaeho Lee","Christopher G. 
Brinton","Yo-Seb Jeon"],"pdf_url":"https://arxiv.org/pdf/2307.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10803v1","updated":"2023-07-20T12:12:05Z","published":"2023-07-20T12:12:05Z","title":"Spatial-Temporal Data Mining for Ocean Science: Data, Methodologies, and\n Opportunities","summary":" With the increasing amount of spatial-temporal~(ST) ocean data, numerous\nspatial-temporal data mining (STDM) studies have been conducted to address\nvarious oceanic issues, e.g., climate forecasting and disaster warning.\nCompared with typical ST data (e.g., traffic data), ST ocean data is more\ncomplicated with some unique characteristics, e.g., diverse regionality and\nhigh sparsity. These characteristics make it difficult to design and train STDM\nmodels. Unfortunately, an overview of these studies is still missing, hindering\ncomputer scientists to identify the research issues in ocean while discouraging\nresearchers in ocean science from applying advanced STDM techniques. To remedy\nthis situation, we provide a comprehensive survey to summarize existing STDM\nstudies in ocean. Concretely, we first summarize the widely-used ST ocean\ndatasets and identify their unique characteristics. Then, typical ST ocean data\nquality enhancement techniques are discussed. Next, we classify existing STDM\nstudies for ocean into four types of tasks, i.e., prediction, event detection,\npattern mining, and anomaly detection, and elaborate the techniques for these\ntasks. Finally, promising research opportunities are highlighted. This survey\nwill help scientists from the fields of both computer science and ocean science\nhave a better understanding of the fundamental concepts, key techniques, and\nopen challenges of STDM in ocean.\n","authors":["Hanchen Yang","Wengen Li","Shuyu Wang","Hui Li","Jihong Guan","Shuigeng Zhou","Jiannong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.10803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.03259v2","updated":"2023-07-20T12:11:56Z","published":"2020-09-07T17:27:27Z","title":"Implicit Multidimensional Projection of Local Subspaces","summary":" We propose a visualization method to understand the effect of\nmultidimensional projection on local subspaces, using implicit function\ndifferentiation. Here, we understand the local subspace as the multidimensional\nlocal neighborhood of data points. Existing methods focus on the projection of\nmultidimensional data points, and the neighborhood information is ignored. Our\nmethod is able to analyze the shape and directional information of the local\nsubspace to gain more insights into the global structure of the data through\nthe perception of local structures. Local subspaces are fitted by\nmultidimensional ellipses that are spanned by basis vectors. An accurate and\nefficient vector transformation method is proposed based on analytical\ndifferentiation of multidimensional projections formulated as implicit\nfunctions. The results are visualized as glyphs and analyzed using a full set\nof specifically-designed interactions supported in our efficient web-based\nvisualization tool. The usefulness of our method is demonstrated using various\nmulti- and high-dimensional benchmark datasets. 
Our implicit differentiation\nvector transformation is evaluated through numerical comparisons; the overall\nmethod is evaluated through exploration examples and use cases.\n","authors":["Rongzheng Bian","Yumeng Xue","Liang Zhou","Jian Zhang","Baoquan Chen","Daniel Weiskopf","Yunhai Wang"],"pdf_url":"https://arxiv.org/pdf/2009.03259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2205.12900v4","updated":"2023-07-20T12:10:09Z","published":"2022-05-25T16:46:01Z","title":"Pre-trained Perceptual Features Improve Differentially Private Image\n Generation","summary":" Training even moderately-sized generative models with differentially-private\nstochastic gradient descent (DP-SGD) is difficult: the required level of noise\nfor reasonable levels of privacy is simply too large. We advocate instead\nbuilding off a good, relevant representation on an informative public dataset,\nthen learning to model the private data with that representation. In\nparticular, we minimize the maximum mean discrepancy (MMD) between private\ntarget data and a generator's distribution, using a kernel based on perceptual\nfeatures learned from a public dataset. With the MMD, we can simply privatize\nthe data-dependent term once and for all, rather than introducing noise at each\nstep of optimization as in DP-SGD. Our algorithm allows us to generate\nCIFAR10-level images with $\\epsilon \\approx 2$ which capture distinctive\nfeatures in the distribution, far surpassing the current state of the art,\nwhich mostly focuses on datasets such as MNIST and FashionMNIST at a large\n$\\epsilon \\approx 10$. 
Our work introduces simple yet powerful foundations for\nreducing the gap between private and non-private deep generative models. Our\ncode is available at \\url{https://github.com/ParkLabML/DP-MEPF}.\n","authors":["Fredrik Harder","Milad Jalali Asadabadi","Danica J. Sutherland","Mijung Park"],"pdf_url":"https://arxiv.org/pdf/2205.12900v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.03455v2","updated":"2023-07-20T11:54:22Z","published":"2021-07-07T19:35:31Z","title":"Model Selection for Generic Contextual Bandits","summary":" We consider the problem of model selection for the general stochastic\ncontextual bandits under the realizability assumption. We propose a successive\nrefinement based algorithm called Adaptive Contextual Bandit ({\\ttfamily ACB}),\nthat works in phases and successively eliminates model classes that are too\nsimple to fit the given instance. We prove that this algorithm is adaptive,\ni.e., the regret rate order-wise matches that of any provable contextual bandit\nalgorithm (ex. \\cite{falcon}), that needs the knowledge of the true model\nclass. The price of not knowing the correct model class turns out to be only an\nadditive term contributing to the second order term in the regret bound. This\ncost possess the intuitive property that it becomes smaller as the model class\nbecomes easier to identify, and vice-versa. We also show that a much simpler\nexplore-then-commit (ETC) style algorithm also obtains similar regret bound,\ndespite not knowing the true model class. However, the cost of model selection\nis higher in ETC as opposed to in {\\ttfamily ACB}, as expected. Furthermore,\nfor the special case of linear contextual bandits, we propose specialized\nalgorithms that obtain sharper guarantees compared to the generic setup.\n","authors":["Avishek Ghosh","Abishek Sankararaman","Kannan Ramchandran"],"pdf_url":"https://arxiv.org/pdf/2107.03455v2.pdf","comment":"Accepted at IEEE Transactions on Information Theory. arXiv admin\n note: text overlap with arXiv:2006.02612"},{"id":"http://arxiv.org/abs/2307.10792v1","updated":"2023-07-20T11:45:38Z","published":"2023-07-20T11:45:38Z","title":"Optimizing PatchCore for Few/many-shot Anomaly Detection","summary":" Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and\ntries to distinguish between normal and anomalous data using only few selected\nsamples. While newly proposed few-shot AD methods do compare against\npre-existing algorithms developed for the full-shot domain as baselines, they\ndo not dedicatedly optimize them for the few-shot setting. It thus remains\nunclear if the performance of such pre-existing algorithms can be further\nimproved. We address said question in this work. Specifically, we present a\nstudy on the AD/anomaly segmentation (AS) performance of PatchCore, the current\nstate-of-the-art full-shot AD/AS algorithm, in both the few-shot and the\nmany-shot settings. We hypothesize that further performance improvements can be\nrealized by (I) optimizing its various hyperparameters, and by (II)\ntransferring techniques known to improve few-shot supervised learning to the AD\ndomain. 
Exhaustive experiments on the public VisA and MVTec AD datasets reveal\nthat (I) significant performance improvements can be realized by optimizing\nhyperparameters such as the underlying feature extractor, and that (II)\nimage-level augmentations can, but are not guaranteed, to improve performance.\nBased on these findings, we achieve a new state of the art in few-shot AD on\nVisA, further demonstrating the merit of adapting pre-existing AD/AS methods to\nthe few-shot setting. Last, we identify the investigation of feature extractors\nwith a strong inductive bias as a potential future research direction for\n(few-shot) AD/AS.\n","authors":["João Santos","Triet Tran","Oliver Rippel"],"pdf_url":"https://arxiv.org/pdf/2307.10792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10788v1","updated":"2023-07-20T11:38:55Z","published":"2023-07-20T11:38:55Z","title":"Adversarial attacks for mixtures of classifiers","summary":" Mixtures of classifiers (a.k.a. randomized ensembles) have been proposed as a\nway to improve robustness against adversarial attacks. However, it has been\nshown that existing attacks are not well suited for this kind of classifiers.\nIn this paper, we discuss the problem of attacking a mixture in a principled\nway and introduce two desirable properties of attacks based on a geometrical\nanalysis of the problem (effectiveness and maximality). We then show that\nexisting attacks do not meet both of these properties. Finally, we introduce a\nnew attack called lattice climber attack with theoretical guarantees on the\nbinary linear setting, and we demonstrate its performance by conducting\nexperiments on synthetic and real datasets.\n","authors":["Lucas Gnecco Heredia","Benjamin Negrevergne","Yann Chevaleyre"],"pdf_url":"https://arxiv.org/pdf/2307.10788v1.pdf","comment":"7 pages + 4 pages of appendix. 5 figures in main text"},{"id":"http://arxiv.org/abs/2307.09614v2","updated":"2023-07-20T11:36:52Z","published":"2023-07-13T19:03:06Z","title":"Multi-view self-supervised learning for multivariate variable-channel\n time series","summary":" Labeling of multivariate biomedical time series data is a laborious and\nexpensive process. Self-supervised contrastive learning alleviates the need for\nlarge, labeled datasets through pretraining on unlabeled data. However, for\nmultivariate time series data, the set of input channels often varies between\napplications, and most existing work does not allow for transfer between\ndatasets with different sets of input channels. We propose learning one encoder\nto operate on all input channels individually. We then use a message passing\nneural network to extract a single representation across channels. We\ndemonstrate the potential of this method by pretraining our model on a dataset\nwith six EEG channels and then fine-tuning it on a dataset with two different\nEEG channels. We compare models with and without the message passing neural\nnetwork across different contrastive loss functions. We show that our method,\ncombined with the TS2Vec loss, outperforms all other methods in most settings.\n","authors":["Thea Brüsch","Mikkel N. Schmidt","Tommy S. 
Alstrøm"],"pdf_url":"https://arxiv.org/pdf/2307.09614v2.pdf","comment":"To appear in proceedings of 2023 IEEE International workshop on\n Machine Learning for Signal Processing"},{"id":"http://arxiv.org/abs/2307.10787v1","updated":"2023-07-20T11:36:45Z","published":"2023-07-20T11:36:45Z","title":"Feed-Forward Source-Free Domain Adaptation via Class Prototypes","summary":" Source-free domain adaptation has become popular because of its practical\nusefulness and no need to access source data. However, the adaptation process\nstill takes a considerable amount of time and is predominantly based on\noptimization that relies on back-propagation. In this work we present a simple\nfeed-forward approach that challenges the need for back-propagation based\nadaptation. Our approach is based on computing prototypes of classes under the\ndomain shift using a pre-trained model. It achieves strong improvements in\naccuracy compared to the pre-trained model and requires only a small fraction\nof time of existing domain adaptation methods.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10787v1.pdf","comment":"ECCV 2022 Workshop on Out of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2307.10779v1","updated":"2023-07-20T11:29:17Z","published":"2023-07-20T11:29:17Z","title":"Efficient Beam Tree Recursion","summary":" Beam Tree Recursive Neural Network (BT-RvNN) was recently proposed as a\nsimple extension of Gumbel Tree RvNN and it was shown to achieve\nstate-of-the-art length generalization performance in ListOps while maintaining\ncomparable performance on other tasks. However, although not the worst in its\nkind, BT-RvNN can be still exorbitantly expensive in memory usage. In this\npaper, we identify the main bottleneck in BT-RvNN's memory usage to be the\nentanglement of the scorer function and the recursive cell function. We propose\nstrategies to remove this bottleneck and further simplify its memory usage.\nOverall, our strategies not only reduce the memory usage of BT-RvNN by\n$10$-$16$ times but also create a new state-of-the-art in ListOps while\nmaintaining similar performance in other tasks. In addition, we also propose a\nstrategy to utilize the induced latent-tree node representations produced by\nBT-RvNN to turn BT-RvNN from a sentence encoder of the form $f:\\mathbb{R}^{n\n\\times d} \\rightarrow \\mathbb{R}^{d}$ into a sequence contextualizer of the\nform $f:\\mathbb{R}^{n \\times d} \\rightarrow \\mathbb{R}^{n \\times d}$. Thus, our\nproposals not only open up a path for further scalability of RvNNs but also\nstandardize a way to use BT-RvNNs as another building block in the deep\nlearning toolkit that can be easily stacked or interfaced with other popular\nmodels such as Transformers and Structured State Space models.\n","authors":["Jishnu Ray Chowdhury","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2307.10779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10774v1","updated":"2023-07-20T11:14:24Z","published":"2023-07-20T11:14:24Z","title":"Assessing the Use of AutoML for Data-Driven Software Engineering","summary":" Background. 
Due to the widespread adoption of Artificial Intelligence (AI)\nand Machine Learning (ML) for building software applications, companies are\nstruggling to recruit employees with a deep understanding of such technologies.\nIn this scenario, AutoML is soaring as a promising solution to fill the AI/ML\nskills gap since it promises to automate the building of end-to-end AI/ML\npipelines that would normally be engineered by specialized team members. Aims.\nDespite the growing interest and high expectations, there is a dearth of\ninformation about the extent to which AutoML is currently adopted by teams\ndeveloping AI/ML-enabled systems and how it is perceived by practitioners and\nresearchers. Method. To fill these gaps, in this paper, we present a\nmixed-method study comprising a benchmark of 12 end-to-end AutoML tools on two\nSE datasets and a user survey with follow-up interviews to further our\nunderstanding of AutoML adoption and perception. Results. We found that AutoML\nsolutions can generate models that outperform those trained and optimized by\nresearchers to perform classification tasks in the SE domain. Also, our\nfindings show that the currently available AutoML solutions do not live up to\ntheir names as they do not equally support automation across the stages of the\nML development workflow and for all the team members. Conclusions. We derive\ninsights to inform the SE research community on how AutoML can facilitate their\nactivities and tool builders on how to design the next generation of AutoML\ntechnologies.\n","authors":["Fabio Calefato","Luigi Quaranta","Filippo Lanubile","Marcos Kalinowski"],"pdf_url":"https://arxiv.org/pdf/2307.10774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10773v1","updated":"2023-07-20T11:10:06Z","published":"2023-07-20T11:10:06Z","title":"Music Genre Classification with ResNet and Bi-GRU Using Visual\n Spectrograms","summary":" Music recommendation systems have emerged as a vital component to enhance\nuser experience and satisfaction for the music streaming services, which\ndominates music consumption. The key challenge in improving these recommender\nsystems lies in comprehending the complexity of music data, specifically for\nthe underpinning music genre classification. The limitations of manual genre\nclassification have highlighted the need for a more advanced system, namely the\nAutomatic Music Genre Classification (AMGC) system. While traditional machine\nlearning techniques have shown potential in genre classification, they heavily\nrely on manually engineered features and feature selection, failing to capture\nthe full complexity of music data. On the other hand, deep learning\nclassification architectures like the traditional Convolutional Neural Networks\n(CNN) are effective in capturing the spatial hierarchies but struggle to\ncapture the temporal dynamics inherent in music data. To address these\nchallenges, this study proposes a novel approach using visual spectrograms as\ninput, and propose a hybrid model that combines the strength of the Residual\nneural Network (ResNet) and the Gated Recurrent Unit (GRU). 
This model is\ndesigned to provide a more comprehensive analysis of music data, offering the\npotential to improve the music recommender systems through achieving a more\ncomprehensive analysis of music data and hence potentially more accurate genre\nclassification.\n","authors":["Junfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10768v1","updated":"2023-07-20T10:57:02Z","published":"2023-07-20T10:57:02Z","title":"Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of\n Working Memory","summary":" Working memory (WM), a fundamental cognitive process facilitating the\ntemporary storage, integration, manipulation, and retrieval of information,\nplays a vital role in reasoning and decision-making tasks. Robust benchmark\ndatasets that capture the multifaceted nature of WM are crucial for the\neffective development and evaluation of AI WM models. Here, we introduce a\ncomprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM\ncomprises 10 tasks and a total of 1 million trials, assessing 4\nfunctionalities, 3 domains, and 11 behavioral and neural characteristics of WM.\nWe jointly trained and tested state-of-the-art recurrent neural networks and\ntransformers on all these tasks. We also include human behavioral benchmarks as\nan upper bound for comparison. Our results suggest that AI models replicate\nsome characteristics of WM in the brain, most notably primacy and recency\neffects, and neural clusters and correlates specialized for different domains\nand functionalities of WM. In the experiments, we also reveal some limitations\nin existing models to approximate human behavior. This dataset serves as a\nvaluable resource for communities in cognitive psychology, neuroscience, and\nAI, offering a standardized framework to compare and enhance WM models,\ninvestigate WM's neural underpinnings, and develop WM models with human-like\ncapabilities. Our source code and data are available at\nhttps://github.com/ZhangLab-DeepNeuroCogLab/WorM.\n","authors":["Ankur Sikarwar","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10763v1","updated":"2023-07-20T10:53:12Z","published":"2023-07-20T10:53:12Z","title":"MSQNet: Actor-agnostic Action Recognition with Multi-modal Query","summary":" Existing action recognition methods are typically actor-specific due to the\nintrinsic topological and apparent differences among the actors. This requires\nactor-specific pose estimation (e.g., humans vs. animals), leading to\ncumbersome model design complexity and high maintenance costs. Moreover, they\noften focus on learning the visual modality alone and single-label\nclassification whilst neglecting other available information sources (e.g.,\nclass name text) and the concurrent occurrence of multiple actions. To overcome\nthese limitations, we propose a new approach called 'actor-agnostic multi-modal\nmulti-label action recognition,' which offers a unified solution for various\ntypes of actors, including humans and animals. We further formulate a novel\nMulti-modal Semantic Query Network (MSQNet) model in a transformer-based object\ndetection framework (e.g., DETR), characterized by leveraging visual and\ntextual modalities to represent the action classes better. The elimination of\nactor-specific model designs is a key advantage, as it removes the need for\nactor pose estimation altogether. 
Extensive experiments on five publicly\navailable benchmarks show that our MSQNet consistently outperforms the prior\narts of actor-specific alternatives on human and animal single- and multi-label\naction recognition tasks by up to 50%. Code will be released at\nhttps://github.com/mondalanindya/MSQNet.\n","authors":["Anindya Mondal","Sauradip Nag","Joaquin M Prada","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13960v2","updated":"2023-07-20T10:26:56Z","published":"2023-06-24T13:29:54Z","title":"Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis","summary":" Regular group convolutional neural networks (G-CNNs) have been shown to\nincrease model performance and improve equivariance to different geometrical\nsymmetries. This work addresses the problem of SE(3), i.e., roto-translation\nequivariance, on volumetric data. Volumetric image data is prevalent in many\nmedical settings. Motivated by the recent work on separable group convolutions,\nwe devise a SE(3) group convolution kernel separated into a continuous SO(3)\n(rotation) kernel and a spatial kernel. We approximate equivariance to the\ncontinuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel\nis parameterized via RBF interpolation on similarly uniform grids. We\ndemonstrate the advantages of our approach in volumetric medical image\nanalysis. Our SE(3) equivariant models consistently outperform CNNs and regular\ndiscrete G-CNNs on challenging medical classification tasks and show\nsignificantly improved generalization capabilities. Our approach achieves up to\na 16.5% gain in accuracy over regular CNNs.\n","authors":["Thijs P. Kuipers","Erik J. Bekkers"],"pdf_url":"https://arxiv.org/pdf/2306.13960v2.pdf","comment":"10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated\n version to camera ready version 1"},{"id":"http://arxiv.org/abs/2307.10749v1","updated":"2023-07-20T10:24:18Z","published":"2023-07-20T10:24:18Z","title":"Mitigating Voter Attribute Bias for Fair Opinion Aggregation","summary":" The aggregation of multiple opinions plays a crucial role in decision-making,\nsuch as in hiring and loan review, and in labeling data for supervised\nlearning. Although majority voting and existing opinion aggregation models are\neffective for simple tasks, they are inappropriate for tasks without\nobjectively true labels in which disagreements may occur. In particular, when\nvoter attributes such as gender or race introduce bias into opinions, the\naggregation results may vary depending on the composition of voter attributes.\nA balanced group of voters is desirable for fair aggregation results but may be\ndifficult to prepare. In this study, we consider methods to achieve fair\nopinion aggregation based on voter attributes and evaluate the fairness of the\naggregated results. To this end, we consider an approach that combines opinion\naggregation models such as majority voting and the Dawid and Skene model (D&S\nmodel) with fairness options such as sample weighting. To evaluate the fairness\nof opinion aggregation, probabilistic soft labels are preferred over discrete\nclass labels. First, we address the problem of soft label estimation without\nconsidering voter attributes and identify some issues with the D&S model. To\naddress these limitations, we propose a new Soft D&S model with improved\naccuracy in estimating soft labels. 
Moreover, we evaluated the fairness of an\nopinion aggregation model, including Soft D&S, in combination with different\nfairness options using synthetic and semi-synthetic data. The experimental\nresults suggest that the combination of Soft D&S and data splitting as a\nfairness option is effective for dense data, whereas weighted majority voting\nis effective for sparse data. These findings should prove particularly valuable\nin supporting decision-making by human and machine-learning models with\nbalanced opinion aggregation.\n","authors":["Ryosuke Ueda","Koh Takeuchi","Hisashi Kashima"],"pdf_url":"https://arxiv.org/pdf/2307.10749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10738v1","updated":"2023-07-20T10:04:55Z","published":"2023-07-20T10:04:55Z","title":"Fairness-Aware Client Selection for Federated Learning","summary":" Federated learning (FL) has enabled multiple data owners (a.k.a. FL clients)\nto train machine learning models collaboratively without revealing private\ndata. Since the FL server can only engage a limited number of clients in each\ntraining round, FL client selection has become an important research problem.\nExisting approaches generally focus on either enhancing FL model performance or\nenhancing the fair treatment of FL clients. The problem of balancing\nperformance and fairness considerations when selecting FL clients remains open.\nTo address this problem, we propose the Fairness-aware Federated Client\nSelection (FairFedCS) approach. Based on Lyapunov optimization, it dynamically\nadjusts FL clients' selection probabilities by jointly considering their\nreputations, times of participation in FL tasks and contributions to the\nresulting model performance. By not using threshold-based reputation filtering,\nit provides FL clients with opportunities to redeem their reputations after a\nperceived poor performance, thereby further enhancing fair client treatment.\nExtensive experiments based on real-world multimedia datasets show that\nFairFedCS achieves 19.6% higher fairness and 0.73% higher test accuracy on\naverage than the best-performing state-of-the-art approach.\n","authors":["Yuxin Shi","Zelei Liu","Zhuan Shi","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2307.10738v1.pdf","comment":"Accepted by ICME 2023"},{"id":"http://arxiv.org/abs/2307.10736v1","updated":"2023-07-20T10:03:50Z","published":"2023-07-20T10:03:50Z","title":"Long-Tail Theory under Gaussian Mixtures","summary":" We suggest a simple Gaussian mixture model for data generation that complies\nwith Feldman's long tail theory (2020). We demonstrate that a linear classifier\ncannot decrease the generalization error below a certain level in the proposed\nmodel, whereas a nonlinear classifier with a memorization capacity can. This\nconfirms that for long-tailed distributions, rare training examples must be\nconsidered for optimal generalization to new data. 
Finally, we show that the\nperformance gap between linear and nonlinear models can be lessened as the tail\nbecomes shorter in the subpopulation frequency distribution, as confirmed by\nexperiments on synthetic and real data.\n","authors":["Arman Bolatov","Maxat Tezekbayev","Igor Melnykov","Artur Pak","Vassilina Nikoulina","Zhenisbek Assylbekov"],"pdf_url":"https://arxiv.org/pdf/2307.10736v1.pdf","comment":"accepted to ECAI 2023"},{"id":"http://arxiv.org/abs/2307.10718v1","updated":"2023-07-20T09:24:23Z","published":"2023-07-20T09:24:23Z","title":"Differences Between Hard and Noisy-labeled Samples: An Empirical Study","summary":" Extracting noisy or incorrectly labeled samples from a labeled dataset with\nhard/difficult samples is an important yet under-explored topic. Two general\nand often independent lines of work exist, one focuses on addressing noisy\nlabels, and another deals with hard samples. However, when both types of data\nare present, most existing methods treat them equally, which results in a\ndecline in the overall performance of the model. In this paper, we first design\nvarious synthetic datasets with custom hardness and noisiness levels for\ndifferent samples. Our proposed systematic empirical study enables us to better\nunderstand the similarities and more importantly the differences between\nhard-to-learn samples and incorrectly-labeled samples. These controlled\nexperiments pave the way for the development of methods that distinguish\nbetween hard and noisy samples. Through our study, we introduce a simple yet\neffective metric that filters out noisy-labeled samples while keeping the hard\nsamples. We study various data partitioning methods in the presence of label\nnoise and observe that filtering out noisy samples from hard samples with this\nproposed metric results in the best datasets as evidenced by the high test\naccuracy achieved after models are trained on the filtered datasets. We\ndemonstrate this for both our created synthetic datasets and for datasets with\nreal-world label noise. Furthermore, our proposed data partitioning method\nsignificantly outperforms other methods when employed within a semi-supervised\nlearning framework.\n","authors":["Mahsa Forouzesh","Patrick Thiran"],"pdf_url":"https://arxiv.org/pdf/2307.10718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10710v1","updated":"2023-07-20T09:05:46Z","published":"2023-07-20T09:05:46Z","title":"Reparameterized Policy Learning for Multimodal Trajectory Optimization","summary":" We investigate the challenge of parametrizing policies for reinforcement\nlearning (RL) in high-dimensional continuous action spaces. Our objective is to\ndevelop a multimodal policy that overcomes limitations inherent in the\ncommonly-used Gaussian parameterization. To achieve this, we propose a\nprincipled framework that models the continuous RL policy as a generative model\nof optimal trajectories. By conditioning the policy on a latent variable, we\nderive a novel variational bound as the optimization objective, which promotes\nexploration of the environment. We then present a practical model-based RL\nmethod, called Reparameterized Policy Gradient (RPG), which leverages the\nmultimodal policy parameterization and learned world model to achieve strong\nexploration capabilities and high data efficiency. Empirical results\ndemonstrate that our method can help agents evade local optima in tasks with\ndense rewards and solve challenging sparse-reward environments by incorporating\nan object-centric intrinsic reward. 
Our method consistently outperforms\nprevious approaches across a range of tasks. Code and supplementary materials\nare available on the project page https://haosulab.github.io/RPG/\n","authors":["Zhiao Huang","Litian Liang","Zhan Ling","Xuanlin Li","Chuang Gan","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10705v1","updated":"2023-07-20T08:53:47Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. Code is available:\nurl{https://github.com/chequanghuy/TwinLiteNet}.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10704v1","updated":"2023-07-20T08:53:16Z","published":"2023-07-20T08:53:16Z","title":"Decentralized Smart Charging of Large-Scale EVs using Adaptive\n Multi-Agent Multi-Armed Bandits","summary":" The drastic growth of electric vehicles and photovoltaics can introduce new\nchallenges, such as electrical current congestion and voltage limit violations\ndue to peak load demands. These issues can be mitigated by controlling the\noperation of electric vehicles i.e., smart charging. Centralized smart charging\nsolutions have already been proposed in the literature. But such solutions may\nlack scalability and suffer from inherent drawbacks of centralization, such as\na single point of failure, and data privacy concerns. Decentralization can help\ntackle these challenges. In this paper, a fully decentralized smart charging\nsystem is proposed using the philosophy of adaptive multi-agent systems. The\nproposed system utilizes multi-armed bandit learning to handle uncertainties in\nthe system. The presented system is decentralized, scalable, real-time,\nmodel-free, and takes fairness among different players into account. 
A detailed\ncase study is also presented for performance evaluation.\n","authors":["Sharyal Zafar","Raphaël Feraud","Anne Blavette","Guy Camilleri","Hamid Ben"],"pdf_url":"https://arxiv.org/pdf/2307.10704v1.pdf","comment":"CIRED 2023 International Conference & Exhibition on Electricity\n Distribution, Jun 2023, Rome, Italy"},{"id":"http://arxiv.org/abs/2307.10703v1","updated":"2023-07-20T08:50:16Z","published":"2023-07-20T08:50:16Z","title":"Graphs in State-Space Models for Granger Causality in Climate Science","summary":" Granger causality (GC) is often considered not an actual form of causality.\nStill, it is arguably the most widely used method to assess the predictability\nof a time series from another one. Granger causality has been widely used in\nmany applied disciplines, from neuroscience and econometrics to Earth sciences.\nWe revisit GC under a graphical perspective of state-space models. For that, we\nuse GraphEM, a recently presented expectation-maximisation algorithm for\nestimating the linear matrix operator in the state equation of a\nlinear-Gaussian state-space model. Lasso regularisation is included in the\nM-step, which is solved using a proximal splitting Douglas-Rachford algorithm.\nExperiments in toy examples and challenging climate problems illustrate the\nbenefits of the proposed model and inference technique over standard Granger\ncausality methods.\n","authors":["Víctor Elvira","Émilie Chouzenoux","Jordi Cerdà","Gustau Camps-Valls"],"pdf_url":"https://arxiv.org/pdf/2307.10703v1.pdf","comment":"4 pages, 2 figures, 3 tables, CausalStats23: When Causal Inference\n meets Statistical Analysis, April 17-21, 2023, Paris, France"},{"id":"http://arxiv.org/abs/2205.09753v2","updated":"2023-07-20T08:41:46Z","published":"2022-04-30T07:08:30Z","title":"HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory\n Prediction via Scene Encoding","summary":" Encoding a driving scene into vector representations has been an essential\ntask for autonomous driving that can benefit downstream tasks e.g. trajectory\nprediction. The driving scene often involves heterogeneous elements such as the\ndifferent types of objects (agents, lanes, traffic signs) and the semantic\nrelations between objects are rich and diverse. Meanwhile, there also exist\nrelativity across elements, which means that the spatial relation is a relative\nconcept and need be encoded in a ego-centric manner instead of in a global\ncoordinate system. Based on these observations, we propose Heterogeneous\nDriving Graph Transformer (HDGT), a backbone modelling the driving scene as a\nheterogeneous graph with different types of nodes and edges. For heterogeneous\ngraph construction, we connect different types of nodes according to diverse\nsemantic relations. For spatial relation encoding, the coordinates of the node\nas well as its in-edges are in the local node-centric coordinate system. For\nthe aggregation module in the graph neural network (GNN), we adopt the\ntransformer structure in a hierarchical way to fit the heterogeneous nature of\ninputs. Experimental results show that HDGT achieves state-of-the-art\nperformance for the task of trajectory prediction, on INTERACTION Prediction\nChallenge and Waymo Open Motion Challenge.\n","authors":["Xiaosong Jia","Penghao Wu","Li Chen","Yu Liu","Hongyang Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2205.09753v2.pdf","comment":"Accepted at IEEE TPAMI in 2023. 
Code url:\n https://github.com/OpenDriveLab/HDGT"},{"id":"http://arxiv.org/abs/2307.10695v1","updated":"2023-07-20T08:38:01Z","published":"2023-07-20T08:38:01Z","title":"Self2Self+: Single-Image Denoising with Self-Supervised Learning and\n Image Quality Assessment Loss","summary":" Recently, denoising methods based on supervised learning have exhibited\npromising performance. However, their reliance on external datasets containing\nnoisy-clean image pairs restricts their applicability. To address this\nlimitation, researchers have focused on training denoising networks using\nsolely a set of noisy inputs. To improve the feasibility of denoising\nprocedures, in this study, we proposed a single-image self-supervised learning\nmethod in which only the noisy input image is used for network training. Gated\nconvolution was used for feature extraction and no-reference image quality\nassessment was used for guiding the training process. Moreover, the proposed\nmethod sampled instances from the input image dataset using Bernoulli sampling\nwith a certain dropout rate for training. The corresponding result was produced\nby averaging the generated predictions from various instances of the trained\nnetwork with dropouts. The experimental results indicated that the proposed\nmethod achieved state-of-the-art denoising performance on both synthetic and\nreal-world datasets. This highlights the effectiveness and practicality of our\nmethod as a potential solution for various noise removal tasks.\n","authors":["Jaekyun Ko","Sanghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10695v1.pdf","comment":"Technical report and supplemantry materials are combined into one\n paper. - Technical report: Page 1~7 - Supplemantry materials : Page 8~18"},{"id":"http://arxiv.org/abs/2302.08292v3","updated":"2023-07-20T08:35:26Z","published":"2023-02-16T13:41:19Z","title":"Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation\n for autonomous vehicles","summary":" Autonomous driving (AD) perception today relies heavily on deep learning\nbased architectures requiring large scale annotated datasets with their\nassociated costs for curation and annotation. The 3D semantic data are useful\nfor core perception tasks such as obstacle detection and ego-vehicle\nlocalization. We propose a new dataset, Navya 3D Segmentation (Navya3DSeg),\nwith a diverse label space corresponding to a large scale production grade\noperational domain, including rural, urban, industrial sites and universities\nfrom 13 countries. It contains 23 labeled sequences and 25 supplementary\nsequences without labels, designed to explore self-supervised and\nsemi-supervised semantic segmentation benchmarks on point clouds. We also\npropose a novel method for sequential dataset split generation based on\niterative multi-label stratification, and demonstrated to achieve a +1.2% mIoU\nimprovement over the original split proposed by SemanticKITTI dataset. A\ncomplete benchmark for semantic segmentation task was performed, with state of\nthe art methods. Finally, we demonstrate an Active Learning (AL) based dataset\ndistillation framework. We introduce a novel heuristic-free sampling method\ncalled ego-pose distance based sampling in the context of AL. A detailed\npresentation on the dataset is available here\nhttps://www.youtube.com/watch?v=5m6ALIs-s20.\n","authors":["Alexandre Almin","Léo Lemarié","Anh Duong","B Ravi Kiran"],"pdf_url":"https://arxiv.org/pdf/2302.08292v3.pdf","comment":"Accepted version to IEEE RA-L. 
Version with supplementary materials"},{"id":"http://arxiv.org/abs/2307.10683v1","updated":"2023-07-20T08:20:12Z","published":"2023-07-20T08:20:12Z","title":"Fractional Denoising for 3D Molecular Pre-training","summary":" Coordinate denoising is a promising 3D molecular pre-training method, which\nhas achieved remarkable performance in various downstream drug discovery tasks.\nTheoretically, the objective is equivalent to learning the force field, which\nis revealed helpful for downstream tasks. Nevertheless, there are two\nchallenges for coordinate denoising to learn an effective force field, i.e. low\ncoverage samples and isotropic force field. The underlying reason is that\nmolecular distributions assumed by existing denoising methods fail to capture\nthe anisotropic characteristic of molecules. To tackle these challenges, we\npropose a novel hybrid noise strategy, including noises on both dihedral angle\nand coordinate. However, denoising such hybrid noise in a traditional way is no\nlonger equivalent to learning the force field. Through theoretical deductions, we\nfind that the problem is caused by the dependency of the input conformation for\ncovariance. To this end, we propose to decouple the two types of noise and\ndesign a novel fractional denoising method (Frad), which only denoises the\nlatter coordinate part. In this way, Frad enjoys both the merits of sampling\nmore low-energy structures and the force field equivalence. Extensive\nexperiments show the effectiveness of Frad in molecular representation, with a\nnew state-of-the-art on 9 out of 12 tasks of QM9 and on 7 out of 8 targets of\nMD17.\n","authors":["Shikun Feng","Yuyan Ni","Yanyan Lan","Zhi-Ming Ma","Wei-Ying Ma"],"pdf_url":"https://arxiv.org/pdf/2307.10683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10677v1","updated":"2023-07-20T07:57:14Z","published":"2023-07-20T07:57:14Z","title":"Deep learning for classification of noisy QR codes","summary":" We wish to define the limits of a classical classification model based on\ndeep learning when applied to abstract images, which do not represent visually\nidentifiable objects. QR codes (Quick Response codes) fall into this category of\nabstract images: with one bit corresponding to one encoded character, QR codes were\nnot designed to be decoded manually. To understand the limitations of a deep\nlearning-based model for abstract image classification, we train an image\nclassification model on QR codes generated from information obtained when\nreading a health pass. We compare a classification model with a classical\n(deterministic) decoding method in the presence of noise. This study allows us\nto conclude that a model based on deep learning can be relevant for the\nunderstanding of abstract images.\n","authors":["Rebecca Leygonie","Sylvain Lobry","Laurent Wendling (LIPADE)"],"pdf_url":"https://arxiv.org/pdf/2307.10677v1.pdf","comment":"in French language. RFIAP 2022 - Reconnaissance des Formes, Image,\n Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France"},{"id":"http://arxiv.org/abs/2307.07666v2","updated":"2023-07-20T07:55:04Z","published":"2023-07-15T00:26:51Z","title":"Efficient Action Robust Reinforcement Learning with Probabilistic Policy\n Execution Uncertainty","summary":" Robust reinforcement learning (RL) aims to find a policy that optimizes the\nworst-case performance in the face of uncertainties. 
In this paper, we focus on\naction robust RL with the probabilistic policy execution uncertainty, in which,\ninstead of always carrying out the action specified by the policy, the agent\nwill take the action specified by the policy with probability $1-\\rho$ and an\nalternative adversarial action with probability $\\rho$. We establish the\nexistence of an optimal policy on the action robust MDPs with probabilistic\npolicy execution uncertainty and provide the action robust Bellman optimality\nequation for its solution. Furthermore, we develop the Action Robust Reinforcement\nLearning with Certificates (ARRLC) algorithm that achieves minimax optimal\nregret and sample complexity. Finally, we conduct numerical experiments to\nvalidate our approach's robustness, demonstrating that ARRLC outperforms\nnon-robust RL algorithms and converges faster than the robust TD algorithm in\nthe presence of action perturbations.\n","authors":["Guanlin Liu","Zhihan Zhou","Han Liu","Lifeng Lai"],"pdf_url":"https://arxiv.org/pdf/2307.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10655v1","updated":"2023-07-20T07:35:42Z","published":"2023-07-20T07:35:42Z","title":"A Survey of What to Share in Federated Learning: Perspectives on Model\n Utility, Privacy Leakage, and Communication Efficiency","summary":" Federated learning (FL) has emerged as a highly effective paradigm for\nprivacy-preserving collaborative training among different parties. Unlike\ntraditional centralized learning, which requires collecting data from each\nparty, FL allows clients to share privacy-preserving information without\nexposing private datasets. This approach not only guarantees enhanced privacy\nprotection but also facilitates more efficient and secure collaboration among\nmultiple participants. Therefore, FL has gained considerable attention from\nresearchers, prompting numerous surveys to summarize the related works.\nHowever, the majority of these surveys concentrate on methods sharing model\nparameters during the training process, while overlooking the potential of\nsharing other forms of local information. In this paper, we present a\nsystematic survey from a new perspective, i.e., what to share in FL, with an\nemphasis on the model utility, privacy leakage, and communication efficiency.\nThis survey differs from previous ones due to four distinct contributions.\nFirst, we present a new taxonomy of FL methods in terms of the sharing methods,\nwhich includes three categories of shared information: model sharing, synthetic\ndata sharing, and knowledge sharing. Second, we analyze the vulnerability of\ndifferent sharing methods to privacy attacks and review the defense mechanisms\nthat provide certain privacy guarantees. Third, we conduct extensive\nexperiments to compare the performance and communication overhead of various\nsharing methods in FL. Besides, we assess the potential privacy leakage through\nmodel inversion and membership inference attacks, while comparing the\neffectiveness of various defense approaches. 
Finally, we discuss potential\ndeficiencies in current methods and outline future directions for improvement.\n","authors":["Jiawei Shao","Zijian Li","Wenqiang Sun","Tailin Zhou","Yuchang Sun","Lumin Liu","Zehong Lin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10654v1","updated":"2023-07-20T07:35:15Z","published":"2023-07-20T07:35:15Z","title":"Conditional expectation network for SHAP","summary":" A very popular model-agnostic technique for explaining predictive models is\nthe SHapley Additive exPlanation (SHAP). The two most popular versions of SHAP\nare a conditional expectation version and an unconditional expectation version\n(the latter is also known as interventional SHAP). Except for tree-based\nmethods, usually the unconditional version is used (for computational reasons).\nWe provide a (surrogate) neural network approach which allows us to efficiently\ncalculate the conditional version for both neural networks and other regression\nmodels, and which properly considers the dependence structure in the feature\ncomponents. This proposal is also useful to provide drop1 and anova analyses in\ncomplex regression models which are similar to their generalized linear model\n(GLM) counterparts, and we provide a partial dependence plot (PDP) counterpart\nthat considers the right dependence structure in the feature components.\n","authors":["Ronald Richman","Mario V. Wüthrich"],"pdf_url":"https://arxiv.org/pdf/2307.10654v1.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10653v1","updated":"2023-07-20T07:33:36Z","published":"2023-07-20T07:33:36Z","title":"Refining the Optimization Target for Automatic Univariate Time Series\n Anomaly Detection in Monitoring Services","summary":" Time series anomaly detection is crucial for industrial monitoring services\nthat handle a large volume of data, aiming to ensure reliability and optimize\nsystem performance. Existing methods often require extensive labeled resources\nand manual parameter selection, highlighting the need for automation. This\npaper proposes a comprehensive framework for automatic parameter optimization\nin time series anomaly detection models. The framework introduces three\noptimization targets: prediction score, shape score, and sensitivity score,\nwhich can be easily adapted to different model backbones without prior\nknowledge or manual labeling efforts. The proposed framework has been\nsuccessfully applied online for over six months, serving more than 50,000 time\nseries every minute. It simplifies the user's experience by requiring only an\nexpected sensitive value, offering a user-friendly interface, and achieving\ndesired detection results. Extensive evaluations conducted on public datasets\nand comparison with other methods further confirm the effectiveness of the\nproposed framework.\n","authors":["Manqing Dong","Zhanxiang Zhao","Yitong Geng","Wentao Li","Wei Wang","Huai Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.10653v1.pdf","comment":"Accepted by 2023 IJCAI Workshop"},{"id":"http://arxiv.org/abs/2307.10648v1","updated":"2023-07-20T07:23:15Z","published":"2023-07-20T07:23:15Z","title":"Data-Driven Latency Probability Prediction for Wireless Networks:\n Focusing on Tail Probabilities","summary":" With the emergence of new application areas, such as cyber-physical systems\nand human-in-the-loop applications, there is a need to guarantee a certain\nlevel of end-to-end network latency with extremely high reliability, e.g.,\n99.999%. 
While mechanisms specified under IEEE 802.1as time-sensitive\nnetworking (TSN) can be used to achieve these requirements for switched\nEthernet networks, implementing TSN mechanisms in wireless networks is\nchallenging due to their stochastic nature. To conform the wireless link to a\nreliability level of 99.999%, the behavior of extremely rare outliers in the\nlatency probability distribution, or the tail of the distribution, must be\nanalyzed and controlled. This work proposes predicting the tail of the latency\ndistribution using state-of-the-art data-driven approaches, such as mixture\ndensity networks (MDN) and extreme value mixture models, to estimate the\nlikelihood of rare latencies conditioned on the network parameters, which can\nbe used to make more informed decisions in wireless transmission. Actual\nlatency measurements of IEEE 802.11g (WiFi), commercial private and a\nsoftware-defined 5G network are used to benchmark the proposed approaches and\nevaluate their sensitivities concerning the tail probabilities.\n","authors":["Samie Mostafavi","Gourav Prateek Sharma","James Gross"],"pdf_url":"https://arxiv.org/pdf/2307.10648v1.pdf","comment":"Submitted to IEEE Global Communications (GLOBECOM) 2023 conference"},{"id":"http://arxiv.org/abs/2305.15776v2","updated":"2023-07-20T07:20:20Z","published":"2023-05-25T06:43:42Z","title":"AUC Optimization from Multiple Unlabeled Datasets","summary":" Weakly supervised learning aims to empower machine learning when the perfect\nsupervision is unavailable, which has drawn great attention from researchers.\nAmong various types of weak supervision, one of the most challenging cases is\nto learn from multiple unlabeled (U) datasets with only a little knowledge of\nthe class priors, or U$^m$ learning for short. In this paper, we study the\nproblem of building an AUC (area under ROC curve) optimization model from\nmultiple unlabeled datasets, which maximizes the pairwise ranking ability of\nthe classifier. We propose U$^m$-AUC, an AUC optimization approach that\nconverts the U$^m$ data into a multi-label AUC optimization problem, and can be\ntrained efficiently. We show that the proposed U$^m$-AUC is effective\ntheoretically and empirically.\n","authors":["Yu Liu","Zheng Xie","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2305.15776v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10644v1","updated":"2023-07-20T07:14:58Z","published":"2023-07-20T07:14:58Z","title":"Fisher-Rao distance and pullback SPD cone distances between multivariate\n normal distributions","summary":" Data sets of multivariate normal distributions abound in many scientific\nareas like diffusion tensor imaging, structure tensor computer vision, radar\nsignal processing, machine learning, just to name a few. In order to process\nthose normal data sets for downstream tasks like filtering, classification or\nclustering, one needs to define proper notions of dissimilarities between\nnormals and paths joining them. The Fisher-Rao distance defined as the\nRiemannian geodesic distance induced by the Fisher information metric is such a\nprincipled metric distance which however is not known in closed-form excepts\nfor a few particular cases. In this work, we first report a fast and robust\nmethod to approximate arbitrarily finely the Fisher-Rao distance between\nmultivariate normal distributions. 
Second, we introduce a class of distances\nbased on diffeomorphic embeddings of the normal manifold into a submanifold of\nthe higher-dimensional symmetric positive-definite cone corresponding to the\nmanifold of centered normal distributions. We show that the projective Hilbert\ndistance on the cone yields a metric on the embedded normal submanifold and we\npullback that cone distance with its associated straight line Hilbert cone\ngeodesics to obtain a distance and smooth paths between normal distributions.\nCompared to the Fisher-Rao distance approximation, the pullback Hilbert cone\ndistance is computationally light since it requires to compute only the extreme\nminimal and maximal eigenvalues of matrices. Finally, we show how to use those\ndistances in clustering tasks.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2307.10644v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2208.06620v2","updated":"2023-07-20T07:09:45Z","published":"2022-08-13T10:36:04Z","title":"Opinion Market Model: Stemming Far-Right Opinion Spread using Positive\n Interventions","summary":" Online extremism has severe societal consequences, including normalizing hate\nspeech, user radicalization, and increased social divisions. Various mitigation\nstrategies have been explored to address these consequences. One such strategy\nuses positive interventions: controlled signals that add attention to the\nopinion ecosystem to boost certain opinions. To evaluate the effectiveness of\npositive interventions, we introduce the Opinion Market Model (OMM), a two-tier\nonline opinion ecosystem model that considers both inter-opinion interactions\nand the role of positive interventions. The size of the opinion attention\nmarket is modeled in the first tier using the multivariate discrete-time Hawkes\nprocess; in the second tier, opinions cooperate and compete for market share,\ngiven limited attention using the market share attraction model. We demonstrate\nthe convergence of our proposed estimation scheme on a synthetic dataset. Next,\nwe test OMM on two learning tasks, applying to two real-world datasets to\npredict attention market shares and uncover latent relationships between online\nitems. The first dataset comprises Facebook and Twitter discussions containing\nmoderate and far-right opinions about bushfires and climate change. The second\ndataset captures popular VEVO artists' YouTube and Twitter attention volumes.\nOMM outperforms the state-of-the-art predictive models on both datasets and\ncaptures latent cooperation-competition relations. We uncover (1) self- and\ncross-reinforcement between far-right and moderate opinions on the bushfires\nand (2) pairwise artist relations that correlate with real-world interactions\nsuch as collaborations and long-lasting feuds. Lastly, we use OMM as a testbed\nfor positive interventions and show how media coverage modulates the spread of\nfar-right opinions.\n","authors":["Pio Calderon","Rohit Ram","Marian-Andrei Rizoiu"],"pdf_url":"https://arxiv.org/pdf/2208.06620v2.pdf","comment":"accepted in the 18th AAAI International Conference on Web and Social\n Media (ICWSM'24)"},{"id":"http://arxiv.org/abs/2305.08396v3","updated":"2023-07-20T07:06:03Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Convolutional Neural Networks (CNNs) have made significant strides in medical\nimage analysis in recent years. 
However, the local nature of the convolution\noperator may pose a limitation for capturing global and long-range interactions\nin CNNs. Recently, Transformers have gained popularity in the computer vision\ncommunity and also medical image segmentation due to their ability to process\nglobal features effectively. The scalability issues of self-attention mechanism\nand lack of the CNN-like inductive bias may have limited their adoption.\nTherefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages\nof both Convolution and Self-attention Mechanisms, have gained importance. In\nthis work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with\nnominal computational burden. The inclusion of multi-axis self-attention,\nwithin each decoder stage, significantly enhances the discriminating capacity\nbetween the object and background regions, and thereby helps in improving the\nsegmentation efficiency. In the Hybrid Decoder block, the fusion process\ncommences by integrating the upsampled lower level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. The proposed decoder block\nis repeated multiple times to progressively segment the nuclei regions.\nExperimental results on MoNuSeg18 and MoNuSAC20 dataset demonstrates the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10635v1","updated":"2023-07-20T07:01:57Z","published":"2023-07-20T07:01:57Z","title":"SciBench: Evaluating College-Level Scientific Problem-Solving Abilities\n of Large Language Models","summary":" Recent advances in large language models (LLMs) have demonstrated notable\nprogress on many mathematical benchmarks. However, most of these benchmarks\nonly feature problems grounded in junior and senior high school subjects,\ncontain only multiple-choice questions, and are confined to a limited scope of\nelementary arithmetic operations. To address these issues, this paper\nintroduces an expansive benchmark suite SciBench that aims to systematically\nexamine the reasoning capabilities required for complex scientific problem\nsolving. SciBench contains two carefully curated datasets: an open set\nfeaturing a range of collegiate-level scientific problems drawn from\nmathematics, chemistry, and physics textbooks, and a closed set comprising\nproblems from undergraduate-level exams in computer science and mathematics.\nBased on the two datasets, we conduct an in-depth benchmark study of two\nrepresentative LLMs with various prompting strategies. The results reveal that\ncurrent LLMs fall short of delivering satisfactory performance, with an overall\nscore of merely 35.80%. Furthermore, through a detailed user study, we\ncategorize the errors made by LLMs into ten problem-solving abilities. Our\nanalysis indicates that no single prompting strategy significantly outperforms\nothers and some strategies that demonstrate improvements in certain\nproblem-solving skills result in declines in other skills. 
We envision that\nSciBench will catalyze further developments in the reasoning abilities of LLMs,\nthereby ultimately contributing to scientific research and discovery.\n","authors":["Xiaoxuan Wang","Ziniu Hu","Pan Lu","Yanqiao Zhu","Jieyu Zhang","Satyen Subramaniam","Arjun R. Loomba","Shichang Zhang","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10635v1.pdf","comment":"Work in progress, 18 pages"},{"id":"http://arxiv.org/abs/2307.10634v1","updated":"2023-07-20T06:59:02Z","published":"2023-07-20T06:59:02Z","title":"Generative Language Models on Nucleotide Sequences of Human Genes","summary":" Language models, primarily transformer-based ones, obtained colossal success\nin NLP. To be more precise, studies like BERT in NLU and works such as GPT-3\nfor NLG are very crucial. DNA sequences are very close to natural language in\nterms of structure, so if the DNA-related bioinformatics domain is concerned,\ndiscriminative models, like DNABert, exist. Yet, the generative side of the\ncoin is mainly unexplored to the best of our knowledge. Consequently, we\nfocused on developing an autoregressive generative language model like GPT-3\nfor DNA sequences. Because working with whole DNA sequences is challenging\nwithout substantial computational resources, we decided to carry out our study\non a smaller scale, focusing on nucleotide sequences of human genes, unique\nparts in DNA with specific functionalities, instead of the whole DNA. This\ndecision did not change the problem structure a lot due to the fact that both\nDNA and genes can be seen as 1D sequences consisting of four different\nnucleotides without losing much information and making too much simplification.\nFirst of all, we systematically examined an almost entirely unexplored problem\nand observed that RNNs performed the best while simple techniques like N-grams\nwere also promising. Another beneficial point was learning how to work with\ngenerative models on languages we do not understand, unlike natural language.\nWe also observed how essential it is to evaluate on real-life tasks beyond\nclassical metrics such as perplexity. Furthermore, we examined whether the\ndata-hungry nature of these models can be mitigated by selecting a language\nwith a minimal vocabulary size, namely four, owing to the four different types\nof nucleotides. The reason for reviewing this was that choosing such a language\nmight make the problem easier. However, what we observed in this study was that\nit did not provide that much of a change in the amount of data needed.\n","authors":["Musa Nuri Ihtiyar","Arzucan Ozgur"],"pdf_url":"https://arxiv.org/pdf/2307.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10633v1","updated":"2023-07-20T06:58:55Z","published":"2023-07-20T06:58:55Z","title":"Multi-Method Self-Training: Improving Code Generation With Text, And\n Vice Versa","summary":" Large Language Models have many methods for solving the same problem. This\nintroduces novel strengths (different methods may work well for different\nproblems) and weaknesses (it may be difficult for users to know which method to\nuse). In this paper, we introduce Multi-Method Self-Training (MMST), where one\nmethod is trained on the filtered outputs of another, allowing us to augment\nthe strengths and ameliorate the weaknesses of each method. 
Using a 176B\nparameter model trained on both language and code, we show that MMST can 1)\nimprove the less performant method (up to 30%) making the model easier to use,\n2) improve the more performant method (up to 32.2%) making the model more\nperformant, and 3) improve the performance of related but distinct tasks (up to\n10.3%) by improving the ability of the model to generate rationales. We then\nconduct ablation analyses to explore why MMST works. We show that MMST\ngenerates more data than traditional self-training, but the improvement in\nperformance is driven by the use of multiple methods. We also analyze\nprompt-engineering and anti-correlated performance between methods as means of\nmaking MMST more effective. We hope the evidence from our paper motivates\nmachine learning researchers to explore ways in which advances in language\nmodels allow for new forms of training.\n","authors":["Shriyash K. Upadhyay","Etan J. Ginsberg"],"pdf_url":"https://arxiv.org/pdf/2307.10633v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2211.14085v3","updated":"2023-07-20T06:42:56Z","published":"2022-11-25T13:14:33Z","title":"Positive unlabeled learning with tensor networks","summary":" Positive unlabeled learning is a binary classification problem with positive\nand unlabeled data. It is common in domains where negative labels are costly or\nimpossible to obtain, e.g., medicine and personalized advertising. Most\napproaches to positive unlabeled learning apply to specific data types (e.g.,\nimages, categorical data) and cannot generate new positive and negative\nsamples. This work introduces a feature-space distance-based tensor network\napproach to the positive unlabeled learning problem. The presented method is\nnot domain specific and significantly improves the state-of-the-art results on\nthe MNIST image and 15 categorical/mixed datasets. The trained tensor network\nmodel is also a generative model and enables the generation of new positive and\nnegative instances.\n","authors":["Bojan Žunkovič"],"pdf_url":"https://arxiv.org/pdf/2211.14085v3.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10617v1","updated":"2023-07-20T06:35:43Z","published":"2023-07-20T06:35:43Z","title":"Detecting deceptive reviews using text classification","summary":" In recent years, online reviews have played a vital role in promoting all\nkinds of products and services. Businesses may embed fake reviews in order to\nattract customers to purchase their products. They may even highlight the\nbenefits of their own product or criticize the competition's product.\nMarketers, advertisers, and other online business users have an incentive to\ncreate fake positive reviews for products which they want to promote or give\nfake negative reviews for products which they really don't like. Nowadays,\nwriting deceptive reviews has become a common practice for promoting one's own\nbusiness or degrading a competitor's reputation. Thus, identifying deceptive\nreviews is an intense and ongoing research area. This research paper proposes\na machine learning approach to identify deceptive reviews. The paper\ninvestigates the performance of several experiments conducted on the Deceptive\nOpinion Spam Corpus dataset of restaurant reviews. We developed an n-gram model\nwith max features to identify deceptive content, with a particular focus on\nfake reviews. 
Further, we conduct a benchmark study to investigate the performance\nof two different feature extraction techniques and apply five machine learning\nclassification techniques. The experimental results show that the\npassive-aggressive classifier outperforms the other algorithms, reaching the\nhighest accuracy not only in text classification but also in detecting fake\nreviews. We also study data augmentation and implement different deep learning\ntechniques.\n","authors":["Anusuya Baby"],"pdf_url":"https://arxiv.org/pdf/2307.10617v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2307.09018v2","updated":"2023-07-20T06:35:34Z","published":"2023-07-18T07:12:46Z","title":"Multimodal LLMs for health grounded in individual-specific data","summary":" Foundation large language models (LLMs) have shown an impressive ability to\nsolve tasks across a wide range of fields including health. To effectively\nsolve personalized health tasks, LLMs need the ability to ingest a diversity of\ndata modalities that are relevant to an individual's health status. In this\npaper, we take a step towards creating multimodal LLMs for health that are\ngrounded in individual-specific data by developing a framework (HeLM: Health\nLarge Language Model for Multimodal Understanding) that enables LLMs to use\nhigh-dimensional clinical modalities to estimate underlying disease risk. HeLM\nencodes complex data modalities by learning an encoder that maps them into the\nLLM's token embedding space and for simple modalities like tabular data by\nserializing the data into text. Using data from the UK Biobank, we show that\nHeLM can effectively use demographic and clinical features in addition to\nhigh-dimensional time-series data to estimate disease risk. For example, HeLM\nachieves an AUROC of 0.75 for asthma prediction when combining tabular and\nspirogram data modalities compared with 0.49 when only using tabular data.\nOverall, we find that HeLM outperforms or performs at parity with classical\nmachine learning approaches across a selection of eight binary traits.\nFurthermore, we investigate the downstream uses of this model such as its\ngeneralizability to out-of-distribution traits and its ability to power\nconversations around individual health and wellness.\n","authors":["Anastasiya Belyaeva","Justin Cosentino","Farhad Hormozdiari","Krish Eswaran","Shravya Shetty","Greg Corrado","Andrew Carroll","Cory Y. McLean","Nicholas A. Furlotte"],"pdf_url":"https://arxiv.org/pdf/2307.09018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10616v1","updated":"2023-07-20T06:32:14Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. 
Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v1.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2305.18088v4","updated":"2023-07-20T06:29:28Z","published":"2023-05-25T05:34:39Z","title":"Drug Repurposing Targeting COVID-19 3CL Protease using Molecular Docking\n and Machine Learning Regression Approach","summary":" The COVID-19 pandemic has created a global health crisis, driving the need\nfor the rapid identification of potential therapeutics. To meet this challenge,\ndrug repurposing is the only solution with saving cost, time, and labor. In\nthis study, we used the Zinc database to screen the world-approved including\nFDA-approved 5903 drugs for repurposing as potential COVID-19 treatments\ntargeting the main protease 3CL of SARS-CoV-2. We performed molecular docking\nand checked the efficacy of drug molecules. To enhance the efficiency of drug\nrepurposing approach, we modeled the binding affinities using several machine\nlearning regression approaches for QSAR modeling such as decision tree, extra\ntrees, MLP, KNN, XGBoost, and gradient boosting. The computational results\ndemonstrated that Decision Tree Regression (DTR) model has improved statistical\nmeasures of R2 and RMSE. These simulated results helped to identify drugs with\nhigh binding affinity. From the docking and other statistical analysis, we\nshortlisted six promising drugs with their respective Zinc IDs (ZINC3873365,\nZINC85432544, ZINC203757351, ZINC85536956, ZINC8214470 and ZINC261494640)\nwithin the range of -15 kcal/mol to -13 kcal/mol. In the study, the repurposed\ndrugs are novel except ZINC203757351 antiviral compound that has already\nidentified against COVID-19 in other studies. Further, we analyzed the\nphysiochemical and pharmacokinetic properties of these top-ranked selected\ndrugs with respect to their best binding interaction for specific target\nprotease 3CLpro. Our study has provided an efficient framework for drug\nrepurposing against COVID-19. This highlights the potential of combining\nmolecular docking with machine learning regression approaches to accelerate the\nidentification of potential therapeutic candidates.\n","authors":["Imra Aqeel","Abdul Majid"],"pdf_url":"https://arxiv.org/pdf/2305.18088v4.pdf","comment":"27 Pages"},{"id":"http://arxiv.org/abs/2102.03403v2","updated":"2023-07-20T05:58:30Z","published":"2021-02-05T19:59:05Z","title":"Robust Principal Component Analysis: A Median of Means Approach","summary":" Principal Component Analysis (PCA) is a fundamental tool for data\nvisualization, denoising, and dimensionality reduction. 
It is widely popular in\nStatistics, Machine Learning, Computer Vision, and related fields. However, PCA\nis well-known to fall prey to outliers and often fails to detect the true\nunderlying low-dimensional structure within the dataset. Following the Median\nof Means (MoM) philosophy, recent supervised learning methods have shown great\nsuccess in dealing with outlying observations without much compromise to their\nlarge sample theoretical properties. This paper proposes a PCA procedure based\non the MoM principle. Called the \\textbf{M}edian of \\textbf{M}eans\n\\textbf{P}rincipal \\textbf{C}omponent \\textbf{A}nalysis (MoMPCA), the proposed\nmethod is not only computationally appealing but also achieves optimal\nconvergence rates under minimal assumptions. In particular, we explore the\nnon-asymptotic error bounds of the obtained solution via the aid of the\nRademacher complexities while granting absolutely no assumption on the outlying\nobservations. The derived concentration results are not dependent on the\ndimension because the analysis is conducted in a separable Hilbert space, and\nthe results only depend on the fourth moment of the underlying distribution in\nthe corresponding norm. The proposal's efficacy is also thoroughly showcased\nthrough simulations and real data applications.\n","authors":["Debolina Paul","Saptarshi Chakraborty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2102.03403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10224v4","updated":"2023-07-20T05:42:46Z","published":"2022-08-14T02:41:05Z","title":"Friendly Noise against Adversarial Noise: A Powerful Defense against\n Data Poisoning Attacks","summary":" A powerful category of (invisible) data poisoning attacks modify a subset of\ntraining examples by small adversarial perturbations to change the prediction\nof certain test-time data. Existing defense mechanisms are not desirable to\ndeploy in practice, as they often either drastically harm the generalization\nperformance, or are attack-specific, and prohibitively slow to apply. Here, we\npropose a simple but highly effective approach that unlike existing methods\nbreaks various types of invisible poisoning attacks with the slightest drop in\nthe generalization performance. We make the key observation that attacks\nintroduce local sharp regions of high training loss, which when minimized,\nresults in learning the adversarial perturbations and makes the attack\nsuccessful. To break poisoning attacks, our key idea is to alleviate the sharp\nloss regions introduced by poisons. To do so, our approach comprises two\ncomponents: an optimized friendly noise that is generated to maximally perturb\nexamples without degrading the performance, and a randomly varying noise\ncomponent. The combination of both components builds a very light-weight but\nextremely effective defense against the most powerful triggerless targeted and\nhidden-trigger backdoor poisoning attacks, including Gradient Matching,\nBulls-eye Polytope, and Sleeper Agent. We show that our friendly noise is\ntransferable to other architectures, and adaptive attacks cannot break our\ndefense due to its random noise component. 
Our code is available at:\nhttps://github.com/tianyu139/friendly-noise\n","authors":["Tian Yu Liu","Yu Yang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2208.10224v4.pdf","comment":"Code available at: https://github.com/tianyu139/friendly-noise"},{"id":"http://arxiv.org/abs/2210.08363v3","updated":"2023-07-20T05:41:18Z","published":"2022-10-15T19:32:20Z","title":"Data-Efficient Augmentation for Training Neural Networks","summary":" Data augmentation is essential to achieve state-of-the-art performance in\nmany deep learning applications. However, the most effective augmentation\ntechniques become computationally prohibitive for even medium-sized datasets.\nTo address this, we propose a rigorous technique to select subsets of data\npoints that when augmented, closely capture the training dynamics of full data\naugmentation. We first show that data augmentation, modeled as additive\nperturbations, improves learning and generalization by relatively enlarging and\nperturbing the smaller singular values of the network Jacobian, while\npreserving its prominent directions. This prevents overfitting and enhances\nlearning the harder to learn information. Then, we propose a framework to\niteratively extract small subsets of training data that when augmented, closely\ncapture the alignment of the fully augmented Jacobian with labels/residuals. We\nprove that stochastic gradient descent applied to the augmented subsets found\nby our approach has similar training dynamics to that of fully augmented data.\nOur experiments demonstrate that our method achieves 6.3x speedup on CIFAR10\nand 2.2x speedup on SVHN, and outperforms the baselines by up to 10% across\nvarious subset sizes. Similarly, on TinyImageNet and ImageNet, our method beats\nthe baselines by up to 8%, while achieving up to 3.3x speedup across various\nsubset sizes. Finally, training on and augmenting 50% subsets using our method\non a version of CIFAR10 corrupted with label noise even outperforms using the\nfull dataset. Our code is available at:\nhttps://github.com/tianyu139/data-efficient-augmentation\n","authors":["Tian Yu Liu","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2210.08363v3.pdf","comment":"Code available at:\n https://github.com/tianyu139/data-efficient-augmentation"},{"id":"http://arxiv.org/abs/2206.08309v2","updated":"2023-07-20T05:32:00Z","published":"2022-06-16T17:11:41Z","title":"Pythae: Unifying Generative Autoencoders in Python -- A Benchmarking Use\n Case","summary":" In recent years, deep generative models have attracted increasing interest\ndue to their capacity to model complex distributions. Among those models,\nvariational autoencoders have gained popularity as they have proven both to be\ncomputationally efficient and yield impressive results in multiple fields.\nFollowing this breakthrough, extensive research has been done in order to\nimprove the original publication, resulting in a variety of different VAE\nmodels in response to different tasks. In this paper we present Pythae, a\nversatile open-source Python library providing both a unified implementation\nand a dedicated framework allowing straightforward, reproducible and reliable\nuse of generative autoencoder models. We then propose to use this library to\nperform a case study benchmark where we present and compare 19 generative\nautoencoder models representative of some of the main improvements on\ndownstream tasks such as image reconstruction, generation, classification,\nclustering and interpolation. 
The open-source library can be found at\nhttps://github.com/clementchadebec/benchmark_VAE.\n","authors":["Clément Chadebec","Louis J. Vincent","Stéphanie Allassonnière"],"pdf_url":"https://arxiv.org/pdf/2206.08309v2.pdf","comment":"Accepted to NeurIPS 2022"},{"id":"http://arxiv.org/abs/2210.16299v3","updated":"2023-07-20T05:27:03Z","published":"2022-10-28T17:52:18Z","title":"Nonuniqueness and Convergence to Equivalent Solutions in Observer-based\n Inverse Reinforcement Learning","summary":" A key challenge in solving the deterministic inverse reinforcement learning\n(IRL) problem online and in real-time is the existence of multiple solutions.\nNonuniqueness necessitates the study of the notion of equivalent solutions,\ni.e., solutions that result in a different cost functional but same feedback\nmatrix, and convergence to such solutions. While offline algorithms that result\nin convergence to equivalent solutions have been developed in the literature,\nonline, real-time techniques that address nonuniqueness are not available. In\nthis paper, a regularized history stack observer that converges to\napproximately equivalent solutions of the IRL problem is developed. Novel\ndata-richness conditions are developed to facilitate the analysis and\nsimulation results are provided to demonstrate the effectiveness of the\ndeveloped technique.\n","authors":["Jared Town","Zachary Morrison","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2210.16299v3.pdf","comment":"16 pages, 7 figures, submitted to American Controls Conference 2023"},{"id":"http://arxiv.org/abs/2307.10596v1","updated":"2023-07-20T05:23:49Z","published":"2023-07-20T05:23:49Z","title":"Ensemble Learning based Anomaly Detection for IoT Cybersecurity via\n Bayesian Hyperparameters Sensitivity Analysis","summary":" The Internet of Things (IoT) integrates more than billions of intelligent\ndevices over the globe with the capability of communicating with other\nconnected devices with little to no human intervention. IoT enables data\naggregation and analysis on a large scale to improve life quality in many\ndomains. In particular, data collected by IoT contain a tremendous amount of\ninformation for anomaly detection. The heterogeneous nature of IoT is both a\nchallenge and an opportunity for cybersecurity. Traditional approaches in\ncybersecurity monitoring often require different kinds of data pre-processing\nand handling for various data types, which might be problematic for datasets\nthat contain heterogeneous features. However, heterogeneous types of network\ndevices can often capture a more diverse set of signals than a single type of\ndevice readings, which is particularly useful for anomaly detection. In this\npaper, we present a comprehensive study on using ensemble machine learning\nmethods for enhancing IoT cybersecurity via anomaly detection. Rather than\nusing one single machine learning model, ensemble learning combines the\npredictive power from multiple models, enhancing their predictive accuracy in\nheterogeneous datasets rather than using one single machine learning model. We\npropose a unified framework with ensemble learning that utilises Bayesian\nhyperparameter optimisation to adapt to a network environment that contains\nmultiple IoT sensor readings. 
Experimentally, we illustrate their high\npredictive power when compared to traditional methods.\n","authors":["Tin Lai","Farnaz Farid","Abubakar Bello","Fariza Sabrina"],"pdf_url":"https://arxiv.org/pdf/2307.10596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10588v1","updated":"2023-07-20T05:03:25Z","published":"2023-07-20T05:03:25Z","title":"Forecasting Battery Electric Vehicle Charging Behavior: A Deep Learning\n Approach Equipped with Micro-Clustering and SMOTE Techniques","summary":" Energy systems, climate change, and public health are among the primary\nreasons for moving toward electrification in transportation. Transportation\nelectrification is being promoted worldwide to reduce emissions. As a result,\nmany automakers will soon start making only battery electric vehicles (BEVs).\nBEV adoption rates are rising in California, mainly due to climate change and\nair pollution concerns. While great for climate and pollution goals, improperly\nmanaged BEV charging can lead to insufficient charging infrastructure and power\noutages. This study develops a novel Micro Clustering Deep Neural Network\n(MCDNN), an artificial neural network algorithm that is highly effective at\nlearning BEVs trip and charging data to forecast BEV charging events,\ninformation that is essential for electricity load aggregators and utility\nmanagers to provide charging stations and electricity capacity effectively. The\nMCDNN is configured using a robust dataset of trips and charges that occurred\nin California between 2015 and 2020 from 132 BEVs, spanning 5 BEV models for a\ntotal of 1570167 vehicle miles traveled. The numerical findings revealed that\nthe proposed MCDNN is more effective than benchmark approaches in this field,\nsuch as support vector machine, k nearest neighbors, decision tree, and other\nneural network-based models in predicting the charging events.\n","authors":["Hanif Tayarani","Trisha V. Ramadoss","Vaishnavi Karanam","Gil Tal","Christopher Nitta"],"pdf_url":"https://arxiv.org/pdf/2307.10588v1.pdf","comment":"18 pages,8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10586v1","updated":"2023-07-20T05:00:13Z","published":"2023-07-20T05:00:13Z","title":"A Holistic Assessment of the Reliability of Machine Learning Systems","summary":" As machine learning (ML) systems increasingly permeate high-stakes settings\nsuch as healthcare, transportation, military, and national security, concerns\nregarding their reliability have emerged. Despite notable progress, the\nperformance of these systems can significantly diminish due to adversarial\nattacks or environmental changes, leading to overconfident predictions,\nfailures to detect input faults, and an inability to generalize in unexpected\nscenarios. This paper proposes a holistic assessment methodology for the\nreliability of ML systems. Our framework evaluates five key properties:\nin-distribution accuracy, distribution-shift robustness, adversarial\nrobustness, calibration, and out-of-distribution detection. A reliability score\nis also introduced and used to assess the overall system reliability. To\nprovide insights into the performance of different algorithmic approaches, we\nidentify and categorize state-of-the-art techniques, then evaluate a selection\non real-world tasks using our proposed reliability metrics and reliability\nscore. 
Our analysis of over 500 models reveals that designing for one metric\ndoes not necessarily constrain others but certain algorithmic techniques can\nimprove reliability across multiple metrics simultaneously. This study\ncontributes to a more comprehensive understanding of ML reliability and\nprovides a roadmap for future research and development.\n","authors":["Anthony Corso","David Karamadian","Romeo Valentin","Mary Cooper","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10580v1","updated":"2023-07-20T04:46:34Z","published":"2023-07-20T04:46:34Z","title":"Intelligent model for offshore China sea fog forecasting","summary":" Accurate and timely prediction of sea fog is very important for effectively\nmanaging maritime and coastal economic activities. Given the intricate nature\nand inherent variability of sea fog, traditional numerical and statistical\nforecasting methods are often proven inadequate. This study aims to develop an\nadvanced sea fog forecasting method embedded in a numerical weather prediction\nmodel using the Yangtze River Estuary (YRE) coastal area as a case study. Prior\nto training our machine learning model, we employ a time-lagged correlation\nanalysis technique to identify key predictors and decipher the underlying\nmechanisms driving sea fog occurrence. In addition, we implement ensemble\nlearning and a focal loss function to address the issue of imbalanced data,\nthereby enhancing the predictive ability of our model. To verify the accuracy\nof our method, we evaluate its performance using a comprehensive dataset\nspanning one year, which encompasses both weather station observations and\nhistorical forecasts. Remarkably, our machine learning-based approach surpasses\nthe predictive performance of two conventional methods, the weather research\nand forecasting nonhydrostatic mesoscale model (WRF-NMM) and the algorithm\ndeveloped by the National Oceanic and Atmospheric Administration (NOAA)\nForecast Systems Laboratory (FSL). Specifically, in regard to predicting sea\nfog with a visibility of less than or equal to 1 km with a lead time of 60\nhours, our methodology achieves superior results by increasing the probability\nof detection (POD) while simultaneously reducing the false alarm ratio (FAR).\n","authors":["Yanfei Xiang","Qinghong Zhang","Mingqing Wang","Ruixue Xia","Yang Kong","Xiaomeng Huang"],"pdf_url":"https://arxiv.org/pdf/2307.10580v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10579v1","updated":"2023-07-20T04:45:59Z","published":"2023-07-20T04:45:59Z","title":"SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning","summary":" SecureBoost is a tree-boosting algorithm leveraging homomorphic encryption to\nprotect data privacy in vertical federated learning setting. It is widely used\nin fields such as finance and healthcare due to its interpretability,\neffectiveness, and privacy-preserving capability. However, SecureBoost suffers\nfrom high computational complexity and risk of label leakage. To harness the\nfull potential of SecureBoost, hyperparameters of SecureBoost should be\ncarefully chosen to strike an optimal balance between utility, efficiency, and\nprivacy. Existing methods either set hyperparameters empirically or\nheuristically, which are far from optimal. 
To fill this gap, we propose a\nConstrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto\noptimal solutions that each solution is a set of hyperparameters achieving\noptimal tradeoff between utility loss, training cost, and privacy leakage. We\ndesign measurements of the three objectives. In particular, the privacy leakage\nis measured using our proposed instance clustering attack. Experimental results\ndemonstrate that the CMOSB yields not only hyperparameters superior to the\nbaseline but also optimal sets of hyperparameters that can support the flexible\nrequirements of FL participants.\n","authors":["Ziyao Ren","Yan Kang","Lixin Fan","Linghua Yang","Tao Fan","Yongxin Tong","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10575v1","updated":"2023-07-20T04:35:50Z","published":"2023-07-20T04:35:50Z","title":"Boosting Federated Learning Convergence with Prototype Regularization","summary":" As a distributed machine learning technique, federated learning (FL) requires\nclients to collaboratively train a shared model with an edge server without\nleaking their local data. However, the heterogeneous data distribution among\nclients often leads to a decrease in model performance. To tackle this issue,\nthis paper introduces a prototype-based regularization strategy to address the\nheterogeneity in the data distribution. Specifically, the regularization\nprocess involves the server aggregating local prototypes from distributed\nclients to generate a global prototype, which is then sent back to the\nindividual clients to guide their local training. The experimental results on\nMNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3%\nand 8.9% in average test accuracy, respectively, compared to the most popular\nbaseline FedAvg. Furthermore, our approach has a fast convergence rate in\nheterogeneous settings.\n","authors":["Yu Qiao","Huy Q. Le","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2307.10575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10569v1","updated":"2023-07-20T04:14:09Z","published":"2023-07-20T04:14:09Z","title":"Deceptive Alignment Monitoring","summary":" As the capabilities of large machine learning models continue to grow, and as\nthe autonomy afforded to such models continues to expand, the spectre of a new\nadversary looms: the models themselves. The threat that a model might behave in\na seemingly reasonable manner, while secretly and subtly modifying its behavior\nfor ulterior reasons is often referred to as deceptive alignment in the AI\nSafety & Alignment communities. Consequently, we call this new direction\nDeceptive Alignment Monitoring. In this work, we identify emerging directions\nin diverse machine learning subfields that we believe will become increasingly\nimportant and intertwined in the near future for deceptive alignment\nmonitoring, and we argue that advances in these fields present both long-term\nchallenges and new research opportunities. 
We conclude by advocating for\ngreater involvement by the adversarial machine learning community in these\nemerging directions.\n","authors":["Andres Carranza","Dhruv Pai","Rylan Schaeffer","Arnuv Tandon","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2307.10569v1.pdf","comment":"Accepted as BlueSky Oral to 2023 ICML AdvML Workshop"},{"id":"http://arxiv.org/abs/2307.10563v1","updated":"2023-07-20T04:00:37Z","published":"2023-07-20T04:00:37Z","title":"FACADE: A Framework for Adversarial Circuit Anomaly Detection and\n Evaluation","summary":" We present FACADE, a novel probabilistic and geometric framework designed for\nunsupervised mechanistic anomaly detection in deep neural networks. Its primary\ngoal is advancing the understanding and mitigation of adversarial attacks.\nFACADE aims to generate probabilistic distributions over circuits, which\nprovide critical insights to their contribution to changes in the manifold\nproperties of pseudo-classes, or high-dimensional modes in activation space,\nyielding a powerful tool for uncovering and combating adversarial attacks. Our\napproach seeks to improve model robustness, enhance scalable model oversight,\nand demonstrates promising applications in real-world deployment settings.\n","authors":["Dhruv Pai","Andres Carranza","Rylan Schaeffer","Arnuv Tandon","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2307.10563v1.pdf","comment":"Accepted as BlueSky Poster at 2023 ICML AdvML Workshop"},{"id":"http://arxiv.org/abs/2307.10562v1","updated":"2023-07-20T03:56:04Z","published":"2023-07-20T03:56:04Z","title":"Shared Adversarial Unlearning: Backdoor Mitigation by Unlearning Shared\n Adversarial Examples","summary":" Backdoor attacks are serious security threats to machine learning models\nwhere an adversary can inject poisoned samples into the training set, causing a\nbackdoored model which predicts poisoned samples with particular triggers to\nparticular target classes, while behaving normally on benign samples. In this\npaper, we explore the task of purifying a backdoored model using a small clean\ndataset. By establishing the connection between backdoor risk and adversarial\nrisk, we derive a novel upper bound for backdoor risk, which mainly captures\nthe risk on the shared adversarial examples (SAEs) between the backdoored model\nand the purified model. This upper bound further suggests a novel bi-level\noptimization problem for mitigating backdoor using adversarial training\ntechniques. To solve it, we propose Shared Adversarial Unlearning (SAU).\nSpecifically, SAU first generates SAEs, and then, unlearns the generated SAEs\nsuch that they are either correctly classified by the purified model and/or\ndifferently classified by the two models, such that the backdoor effect in the\nbackdoored model will be mitigated in the purified model. Experiments on\nvarious benchmark datasets and network architectures show that our proposed\nmethod achieves state-of-the-art performance for backdoor defense.\n","authors":["Shaokui Wei","Mingda Zhang","Hongyuan Zha","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10560v1","updated":"2023-07-20T03:55:53Z","published":"2023-07-20T03:55:53Z","title":"Post-variational quantum neural networks","summary":" Quantum computing has the potential to provide substantial computational\nadvantages over current state-of-the-art classical supercomputers. However,\ncurrent hardware is not advanced enough to execute fault-tolerant quantum\nalgorithms. 
An alternative of using hybrid quantum-classical computing with\nvariational algorithms can exhibit barren plateau issues, causing slow\nconvergence of gradient-based optimization techniques. In this paper, we\ndiscuss \"post-variational strategies\", which shift tunable parameters from the\nquantum computer to the classical computer, opting for ensemble strategies when\noptimizing quantum models. We discuss various strategies and design principles\nfor constructing individual quantum circuits, where the resulting ensembles can\nbe optimized with convex programming. Further, we discuss architectural designs\nof post-variational quantum neural networks and analyze the propagation of\nestimation errors throughout such neural networks. Lastly, we show that our\nalgorithm can be applied to real-world applications such as image\nclassification on handwritten digits, producing a 96% classification accuracy.\n","authors":["Po-Wei Huang","Patrick Rebentrost"],"pdf_url":"https://arxiv.org/pdf/2307.10560v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10559v1","updated":"2023-07-20T03:54:47Z","published":"2023-07-20T03:54:47Z","title":"Air Traffic Controller Workload Level Prediction using Conformalized\n Dynamical Graph Learning","summary":" Air traffic control (ATC) is a safety-critical service system that demands\nconstant attention from ground air traffic controllers (ATCos) to maintain\ndaily aviation operations. The workload of the ATCos can have negative effects\non operational safety and airspace usage. To avoid overloading and ensure an\nacceptable workload level for the ATCos, it is important to predict the ATCos'\nworkload accurately for mitigation actions. In this paper, we first perform a\nreview of research on ATCo workload, mostly from the air traffic perspective.\nThen, we briefly introduce the setup of the human-in-the-loop (HITL)\nsimulations with retired ATCos, where the air traffic data and workload labels\nare obtained. The simulations are conducted under three Phoenix approach\nscenarios while the human ATCos are requested to self-evaluate their workload\nratings (i.e., low-1 to high-7). Preliminary data analysis is conducted. Next,\nwe propose a graph-based deep-learning framework with conformal prediction to\nidentify the ATCo workload levels. The number of aircraft under the\ncontroller's control varies both spatially and temporally, resulting in\ndynamically evolving graphs. The experiment results suggest that (a) besides\nthe traffic density feature, the traffic conflict feature contributes to the\nworkload prediction capabilities (i.e., minimum horizontal/vertical separation\ndistance); (b) directly learning from the spatiotemporal graph layout of\nairspace with graph neural network can achieve higher prediction accuracy,\ncompare to hand-crafted traffic complexity features; (c) conformal prediction\nis a valuable tool to further boost model prediction accuracy, resulting a\nrange of predicted workload labels. The code used is available at\n\\href{https://github.com/ymlasu/para-atm-collection/blob/master/air-traffic-prediction/ATC-Workload-Prediction/}{$\\mathsf{Link}$}.\n","authors":["Yutian Pang","Jueming Hu","Christopher S. Lieber","Nancy J. 
Cooke","Yongming Liu"],"pdf_url":"https://arxiv.org/pdf/2307.10559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10550v1","updated":"2023-07-20T03:28:06Z","published":"2023-07-20T03:28:06Z","title":"SC VALL-E: Style-Controllable Zero-Shot Text to Speech Synthesizer","summary":" Expressive speech synthesis models are trained by adding corpora with diverse\nspeakers, various emotions, and different speaking styles to the dataset, in\norder to control various characteristics of speech and generate the desired\nvoice. In this paper, we propose a style control (SC) VALL-E model based on the\nneural codec language model (called VALL-E), which follows the structure of the\ngenerative pretrained transformer 3 (GPT-3). The proposed SC VALL-E takes input\nfrom text sentences and prompt audio and is designed to generate controllable\nspeech by not simply mimicking the characteristics of the prompt audio but by\ncontrolling the attributes to produce diverse voices. We identify tokens in the\nstyle embedding matrix of the newly designed style network that represent\nattributes such as emotion, speaking rate, pitch, and voice intensity, and\ndesign a model that can control these attributes. To evaluate the performance\nof SC VALL-E, we conduct comparative experiments with three representative\nexpressive speech synthesis models: global style token (GST) Tacotron2,\nvariational autoencoder (VAE) Tacotron2, and original VALL-E. We measure word\nerror rate (WER), F0 voiced error (FVE), and F0 gross pitch error (F0GPE) as\nevaluation metrics to assess the accuracy of generated sentences. For comparing\nthe quality of synthesized speech, we measure comparative mean option score\n(CMOS) and similarity mean option score (SMOS). To evaluate the style control\nability of the generated speech, we observe the changes in F0 and\nmel-spectrogram by modifying the trained tokens. When using prompt audio that\nis not present in the training data, SC VALL-E generates a variety of\nexpressive sounds and demonstrates competitive performance compared to the\nexisting models. Our implementation, pretrained models, and audio samples are\nlocated on GitHub.\n","authors":["Daegyeom Kim","Seongho Hong","Yong-Hoon Choi"],"pdf_url":"https://arxiv.org/pdf/2307.10550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08122v2","updated":"2023-07-20T03:07:28Z","published":"2023-07-16T18:31:25Z","title":"Tangent Transformers for Composition, Privacy and Removal","summary":" We introduce Tangent Attention Fine-Tuning (TAFT), a method for fine-tuning\nlinearized transformers obtained by computing a First-order Taylor Expansion\naround a pre-trained initialization. We show that the Jacobian-Vector Product\nresulting from linearization can be computed efficiently in a single forward\npass, reducing training and inference cost to the same order of magnitude as\nits original non-linear counterpart, while using the same number of parameters.\nFurthermore, we show that, when applied to various downstream visual\nclassification tasks, the resulting Tangent Transformer fine-tuned with TAFT\ncan perform comparably with fine-tuning the original non-linear network. 
Since\nTangent Transformers are linear with respect to the new set of weights, and the\nresulting fine-tuning loss is convex, we show that TAFT enjoys several\nadvantages compared to non-linear fine-tuning when it comes to model\ncomposition, parallel training, machine unlearning, and differential privacy.\n","authors":["Tian Yu Liu","Aditya Golatkar","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2307.08122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03718v4","updated":"2023-07-20T03:06:50Z","published":"2023-06-06T14:28:57Z","title":"Emotion-Conditioned Melody Harmonization with Hierarchical Variational\n Autoencoder","summary":" Existing melody harmonization models have made great progress in improving\nthe quality of generated harmonies, but most of them ignored the emotions\nbeneath the music. Meanwhile, the variability of harmonies generated by\nprevious methods is insufficient. To solve these problems, we propose a novel\nLSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the\ninfluence of emotional conditions on melody harmonization, while improving the\nquality of generated harmonies and capturing the abundant variability of chord\nprogressions. Specifically, LHVAE incorporates latent variables and emotional\nconditions at different levels (piece- and bar-level) to model the global and\nlocal music properties. Additionally, we introduce an attention-based melody\ncontext vector at each step to better learn the correspondence between melodies\nand harmonies. Objective experimental results show that our proposed model\noutperforms other LSTM-based models. Through subjective evaluation, we conclude\nthat only altering the types of chords hardly changes the overall emotion of\nthe music. The qualitative analysis demonstrates the ability of our model to\ngenerate variable harmonies.\n","authors":["Shulei Ji","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.03718v4.pdf","comment":"Accepted by IEEE SMC 2023"},{"id":"http://arxiv.org/abs/2212.12658v2","updated":"2023-07-20T03:00:05Z","published":"2022-12-24T05:25:09Z","title":"Improving Uncertainty Quantification of Variance Networks by\n Tree-Structured Learning","summary":" To improve the uncertainty quantification of variance networks, we propose a\nnovel tree-structured local neural network model that partitions the feature\nspace into multiple regions based on uncertainty heterogeneity. A tree is built\nupon giving the training data, whose leaf nodes represent different regions\nwhere region-specific neural networks are trained to predict both the mean and\nthe variance for quantifying uncertainty. The proposed Uncertainty-Splitting\nNeural Regression Tree (USNRT) employs novel splitting criteria. At each node,\na neural network is trained on the full data first, and a statistical test for\nthe residuals is conducted to find the best split, corresponding to the two\nsub-regions with the most significant uncertainty heterogeneity between them.\nUSNRT is computationally friendly because very few leaf nodes are sufficient\nand pruning is unnecessary. Furthermore, an ensemble version can be easily\nconstructed to estimate the total uncertainty including the aleatory and\nepistemic. On extensive UCI datasets, USNRT or its ensemble shows superior\nperformance compared to some recent popular methods for quantifying uncertainty\nwith variances. 
Through comprehensive visualization and analysis, we uncover\nhow USNRT works and show its merits, revealing that uncertainty heterogeneity\ndoes exist in many datasets and can be learned by USNRT.\n","authors":["Wenxuan Ma","Xing Yan","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.12658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09767v2","updated":"2023-07-20T02:51:15Z","published":"2023-03-17T04:18:03Z","title":"It Is All About Data: A Survey on the Effects of Data on Adversarial\n Robustness","summary":" Adversarial examples are inputs to machine learning models that an attacker\nhas intentionally designed to confuse the model into making a mistake. Such\nexamples pose a serious threat to the applicability of machine-learning-based\nsystems, especially in life- and safety-critical domains. To address this\nproblem, the area of adversarial robustness investigates mechanisms behind\nadversarial attacks and defenses against these attacks. This survey reviews a\nparticular subset of this literature that focuses on investigating properties\nof training data in the context of model robustness under evasion attacks. It\nfirst summarizes the main properties of data leading to adversarial\nvulnerability. It then discusses guidelines and techniques for improving\nadversarial robustness by enhancing the data representation and learning\nprocedures, as well as techniques for estimating robustness guarantees given\nparticular data. Finally, it discusses gaps of knowledge and promising future\nresearch directions in this area.\n","authors":["Peiyu Xiong","Michael Tegegn","Jaskeerat Singh Sarin","Shubhraneel Pal","Julia Rubin"],"pdf_url":"https://arxiv.org/pdf/2303.09767v2.pdf","comment":"51 pages, 25 figures, under review"},{"id":"http://arxiv.org/abs/2304.10159v2","updated":"2023-07-20T02:49:49Z","published":"2023-04-20T08:32:58Z","title":"Deep-Q Learning with Hybrid Quantum Neural Network on Solving Maze\n Problems","summary":" Quantum computing holds great potential for advancing the limitations of\nmachine learning algorithms to handle higher data dimensions and reduce overall\ntraining parameters in deep neural network (DNN) models. This study uses a\nparameterized quantum circuit (PQC) on a gate-based quantum computer to\ninvestigate the potential for quantum advantage in a model-free reinforcement\nlearning problem. Through a comprehensive investigation and evaluation of the\ncurrent model and capabilities of quantum computers, we designed and trained a\nnovel hybrid Quantum neural network based on the latest Qiskit and PyTorch\nframework. We compared its performance with a full-classical DNN with and\nwithout an integrated PQC. Our research provides insights into the potential of\ndeep quantum learning to solve a maze problem and, potentially, other\nreinforcement learning problems. We conclude that various reinforcement\nlearning problems can be effective with reasonable training epochs. 
Moreover, a\ncomparative discussion of the various quantum reinforcement learning model on\nmaze problems is discussed to evaluate our research's overall potential and\nadvantages.\n","authors":["Hao-Yuan Chen","Yen-Jui Chang","Ching-Ray Chang"],"pdf_url":"https://arxiv.org/pdf/2304.10159v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10541v1","updated":"2023-07-20T02:42:23Z","published":"2023-07-20T02:42:23Z","title":"Differentially Flat Learning-based Model Predictive Control Using a\n Stability, State, and Input Constraining Safety Filter","summary":" Learning-based optimal control algorithms control unknown systems using past\ntrajectory data and a learned model of the system dynamics. These controllers\nuse either a linear approximation of the learned dynamics, trading performance\nfor faster computation, or nonlinear optimization methods, which typically\nperform better but can limit real-time applicability. In this work, we present\na novel nonlinear controller that exploits differential flatness to achieve\nsimilar performance to state-of-the-art learning-based controllers but with\nsignificantly less computational effort. Differential flatness is a property of\ndynamical systems whereby nonlinear systems can be exactly linearized through a\nnonlinear input mapping. Here, the nonlinear transformation is learned as a\nGaussian process and is used in a safety filter that guarantees, with high\nprobability, stability as well as input and flat state constraint satisfaction.\nThis safety filter is then used to refine inputs from a flat model predictive\ncontroller to perform constrained nonlinear learning-based optimal control\nthrough two successive convex optimizations. We compare our method to\nstate-of-the-art learning-based control strategies and achieve similar\nperformance, but with significantly better computational efficiency, while also\nrespecting flat state and input constraints, and guaranteeing stability.\n","authors":["Adam W. Hall","Melissa Greeff","Angela P. Schoellig"],"pdf_url":"https://arxiv.org/pdf/2307.10541v1.pdf","comment":"6 pages, 5 figures, Published in IEEE Control Systems Letters"},{"id":"http://arxiv.org/abs/2307.10529v1","updated":"2023-07-20T02:07:20Z","published":"2023-07-20T02:07:20Z","title":"Fast Unsupervised Deep Outlier Model Selection with Hypernetworks","summary":" Outlier detection (OD) finds many applications with a rich literature of\nnumerous techniques. Deep neural network based OD (DOD) has seen a recent surge\nof attention thanks to the many advances in deep learning. In this paper, we\nconsider a critical-yet-understudied challenge with unsupervised DOD, that is,\neffective hyperparameter (HP) tuning/model selection. While several prior work\nreport the sensitivity of OD models to HPs, it becomes ever so critical for the\nmodern DOD models that exhibit a long list of HPs. We introduce HYPER for\ntuning DOD models, tackling two fundamental challenges: (1) validation without\nsupervision (due to lack of labeled anomalies), and (2) efficient search of the\nHP/model space (due to exponential growth in the number of HPs). A key idea is\nto design and train a novel hypernetwork (HN) that maps HPs onto optimal\nweights of the main DOD model. In turn, HYPER capitalizes on a single HN that\ncan dynamically generate weights for many DOD models (corresponding to varying\nHPs), which offers significant speed-up. 
In addition, it employs meta-learning\non historical OD tasks with labels to train a proxy validation function,\nlikewise trained with our proposed HN efficiently. Extensive experiments on 35\nOD tasks show that HYPER achieves high performance against 8 baselines with\nsignificant efficiency gains.\n","authors":["Xueying Ding","Yue Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2307.10529v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.10524v1","updated":"2023-07-20T01:56:10Z","published":"2023-07-20T01:56:10Z","title":"Beyond Black-Box Advice: Learning-Augmented Algorithms for MDPs with\n Q-Value Predictions","summary":" We study the tradeoff between consistency and robustness in the context of a\nsingle-trajectory time-varying Markov Decision Process (MDP) with untrusted\nmachine-learned advice. Our work departs from the typical approach of treating\nadvice as coming from black-box sources by instead considering a setting where\nadditional information about how the advice is generated is available. We prove\na first-of-its-kind consistency and robustness tradeoff given Q-value advice\nunder a general MDP model that includes both continuous and discrete\nstate/action spaces. Our results highlight that utilizing Q-value advice\nenables dynamic pursuit of the better of machine-learned advice and a robust\nbaseline, thus result in near-optimal performance guarantees, which provably\nimproves what can be obtained solely with black-box advice.\n","authors":["Tongxin Li","Yiheng Lin","Shaolei Ren","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2307.10524v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2302.10980v3","updated":"2023-07-20T01:34:16Z","published":"2023-02-21T20:26:39Z","title":"MultiRobustBench: Benchmarking Robustness Against Multiple Attacks","summary":" The bulk of existing research in defending against adversarial examples\nfocuses on defending against a single (typically bounded Lp-norm) attack, but\nfor a practical setting, machine learning (ML) models should be robust to a\nwide variety of attacks. In this paper, we present the first unified framework\nfor considering multiple attacks against ML models. Our framework is able to\nmodel different levels of learner's knowledge about the test-time adversary,\nallowing us to model robustness against unforeseen attacks and robustness\nagainst unions of attacks. Using our framework, we present the first\nleaderboard, MultiRobustBench, for benchmarking multiattack evaluation which\ncaptures performance across attack types and attack strengths. We evaluate the\nperformance of 16 defended models for robustness against a set of 9 different\nattack types, including Lp-based threat models, spatial transformations, and\ncolor changes, at 20 different attack strengths (180 attacks total).\nAdditionally, we analyze the state of current defenses against multiple\nattacks. 
Our analysis shows that while existing defenses have made progress in\nterms of average robustness across the set of attacks used, robustness against\nthe worst-case attack is still a big open problem as all existing models\nperform worse than random guessing.\n","authors":["Sihui Dai","Saeed Mahloujifar","Chong Xiang","Vikash Sehwag","Pin-Yu Chen","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2302.10980v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2305.11408v2","updated":"2023-07-20T00:58:30Z","published":"2023-05-19T03:31:42Z","title":"AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide\n for Simultaneous Speech Translation","summary":" Attention is the core mechanism of today's most used architectures for\nnatural language processing and has been analyzed from many perspectives,\nincluding its effectiveness for machine translation-related tasks. Among these\nstudies, attention resulted to be a useful source of information to get\ninsights about word alignment also when the input text is substituted with\naudio segments, as in the case of the speech translation (ST) task. In this\npaper, we propose AlignAtt, a novel policy for simultaneous ST (SimulST) that\nexploits the attention information to generate source-target alignments that\nguide the model during inference. Through experiments on the 8 language pairs\nof MuST-C v1.0, we show that AlignAtt outperforms previous state-of-the-art\nSimulST policies applied to offline-trained models with gains in terms of BLEU\nof 2 points and latency reductions ranging from 0.5s to 0.8s across the 8\nlanguages.\n","authors":["Sara Papi","Marco Turchi","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2305.11408v2.pdf","comment":"Accepted at Interspeech 2023"},{"id":"http://arxiv.org/abs/2307.04603v4","updated":"2023-07-20T00:49:13Z","published":"2023-07-07T09:01:42Z","title":"Solvent: A Framework for Protein Folding","summary":" Consistency and reliability are crucial for conducting AI research. Many\nfamous research fields, such as object detection, have been compared and\nvalidated with solid benchmark frameworks. After AlphaFold2, the protein\nfolding task has entered a new phase, and many methods are proposed based on\nthe component of AlphaFold2. The importance of a unified research framework in\nprotein folding contains implementations and benchmarks to consistently and\nfairly compare various approaches. To achieve this, we present Solvent, an\nprotein folding framework that supports significant components of\nstate-of-the-art models in the manner of off-the-shelf interface Solvent\ncontains different models implemented in a unified codebase and supports\ntraining and evaluation for defined models on the same dataset. We benchmark\nwell-known algorithms and their components and provide experiments that give\nhelpful insights into the protein structure modeling field. We hope that\nSolvent will increase the reliability and consistency of proposed models and\ngives efficiency in both speed and costs, resulting in acceleration on protein\nfolding modeling research. 
The code is available at\nhttps://github.com/kakaobrain/solvent, and the project will continue to be\ndeveloped.\n","authors":["Jaemyung Lee","Kyeongtak Han","Jaehoon Kim","Hasun Yu","Youhan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.04603v4.pdf","comment":"preprint, 8pages"},{"id":"http://arxiv.org/abs/2307.09702v2","updated":"2023-07-20T00:40:41Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10507v1","updated":"2023-07-20T00:07:29Z","published":"2023-07-20T00:07:29Z","title":"FedSoup: Improving Generalization and Personalization in Federated\n Learning via Selective Model Interpolation","summary":" Cross-silo federated learning (FL) enables the development of machine\nlearning models on datasets distributed across data centers such as hospitals\nand clinical research laboratories. However, recent research has found that\ncurrent FL algorithms face a trade-off between local and global performance\nwhen confronted with distribution shifts. Specifically, personalized FL methods\nhave a tendency to overfit to local data, leading to a sharp valley in the\nlocal model and inhibiting its ability to generalize to out-of-distribution\ndata. In this paper, we propose a novel federated model soup method (i.e.,\nselective interpolation of model parameters) to optimize the trade-off between\nlocal and global performance. Specifically, during the federated training\nphase, each client maintains its own global model pool by monitoring the\nperformance of the interpolated model between the local and global models. This\nallows us to alleviate overfitting and seek flat minima, which can\nsignificantly improve the model's generalization performance. We evaluate our\nmethod on retinal and pathological image classification tasks, and our proposed\nmethod achieves significant improvements for out-of-distribution\ngeneralization. Our code is available at https://github.com/ubc-tea/FedSoup.\n","authors":["Minghui Chen","Meirui Jiang","Qi Dou","Zehua Wang","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2307.10507v1.pdf","comment":"Accepted by MICCAI2023"},{"id":"http://arxiv.org/abs/2307.10504v1","updated":"2023-07-20T00:02:24Z","published":"2023-07-20T00:02:24Z","title":"Identifying Interpretable Subspaces in Image Representations","summary":" We propose Automatic Feature Explanation using Contrasting Concepts (FALCON),\nan interpretability framework to explain features of image representations. For\na target feature, FALCON captions its highly activating cropped images using a\nlarge captioning dataset (like LAION-400m) and a pre-trained vision-language\nmodel like CLIP. Each word among the captions is scored and ranked leading to a\nsmall number of shared, human-understandable concepts that closely describe the\ntarget feature. FALCON also applies contrastive interpretation using lowly\nactivating (counterfactual) images, to eliminate spurious concepts. 
Although\nmany existing approaches interpret features independently, we observe in\nstate-of-the-art self-supervised and supervised models, that less than 20% of\nthe representation space can be explained by individual features. We show that\nfeatures in larger spaces become more interpretable when studied in groups and\ncan be explained with high-order scoring concepts through FALCON. We discuss\nhow extracted concepts can be used to explain and debug failures in downstream\ntasks. Finally, we present a technique to transfer concepts from one\n(explainable) representation space to another unseen representation space by\nlearning a simple linear transformation.\n","authors":["Neha Kalibhat","Shweta Bhardwaj","Bayan Bruss","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.10504v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition\n in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11018v1","updated":"2023-07-20T16:45:22Z","published":"2023-07-20T16:45:22Z","title":"Amortized Variational Inference: When and Why?","summary":" Amortized variational inference (A-VI) is a method for approximating the\nintractable posterior distributions that arise in probabilistic models. The\ndefining feature of A-VI is that it learns a global inference function that\nmaps each observation to its local latent variable's approximate posterior.\nThis stands in contrast to the more classical factorized (or mean-field)\nvariational inference (F-VI), which directly learns the parameters of the\napproximating distribution for each latent variable. In deep generative models,\nA-VI is used as a computational trick to speed up inference for local latent\nvariables. In this paper, we study A-VI as a general alternative to F-VI for\napproximate posterior inference. A-VI cannot produce an approximation with a\nlower Kullback-Leibler divergence than F-VI's optimal solution, because the\namortized family is a subset of the factorized family. 
Thus a central\ntheoretical problem is to characterize when A-VI still attains F-VI's optimal\nsolution. We derive conditions on both the model and the inference function\nunder which A-VI can theoretically achieve F-VI's optimum. We show that for a\nbroad class of hierarchical models, including deep generative models, it is\npossible to close the gap between A-VI and F-VI. Further, for an even broader\nclass of models, we establish when and how to expand the domain of the\ninference function to make amortization a feasible strategy. Finally, we prove\nthat for certain models -- including hidden Markov models and Gaussian\nprocesses -- A-VI cannot match F-VI's solution, no matter how expressive the\ninference function is. We also study A-VI empirically. On several examples, we\ncorroborate our theoretical results and investigate the performance of A-VI\nwhen varying the complexity of the inference function. When the gap between\nA-VI and F-VI can be closed, we find that the required complexity of the\nfunction need not scale with the number of observations, and that A-VI often\nconverges faster than F-VI.\n","authors":["Charles C. Margossian","David M. Blei"],"pdf_url":"https://arxiv.org/pdf/2307.11018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18451v3","updated":"2023-07-20T23:59:38Z","published":"2023-05-29T04:02:10Z","title":"Shift-Robust Molecular Relational Learning with Causal Substructure","summary":" Recently, molecular relational learning, whose goal is to predict the\ninteraction behavior between molecular pairs, got a surge of interest in\nmolecular sciences due to its wide range of applications. In this work, we\npropose CMRL that is robust to the distributional shift in molecular relational\nlearning by detecting the core substructure that is causally related to\nchemical reactions. To do so, we first assume a causal relationship based on\nthe domain knowledge of molecular sciences and construct a structural causal\nmodel (SCM) that reveals the relationship between variables. Based on the SCM,\nwe introduce a novel conditional intervention framework whose intervention is\nconditioned on the paired molecule. With the conditional intervention\nframework, our model successfully learns from the causal substructure and\nalleviates the confounding effect of shortcut substructures that are spuriously\ncorrelated to chemical reactions. Extensive experiments on various tasks with\nreal-world and synthetic datasets demonstrate the superiority of CMRL over\nstate-of-the-art baseline models. Our code is available at\nhttps://github.com/Namkyeong/CMRL.\n","authors":["Namkyeong Lee","Kanghoon Yoon","Gyoung S. Na","Sein Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2305.18451v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2307.08167v2","updated":"2023-07-20T23:08:11Z","published":"2023-07-16T22:35:52Z","title":"Computing the gradients with respect to all parameters of a quantum\n neural network using a single circuit","summary":" When computing the gradients of a quantum neural network using the\nparameter-shift rule, the cost function needs to be calculated twice for the\ngradient with respect to a single adjustable parameter of the network. When the\ntotal number of parameters is high, the quantum circuit for the computation has\nto be adjusted and run for many times. Here we propose an approach to compute\nall the gradients using a single circuit only, with a much reduced circuit\ndepth and less classical registers. 
We also demonstrate experimentally, on both\nreal quantum hardware and simulator, that our approach has the advantages that\nthe circuit takes a significantly shorter time to compile than the conventional\napproach, resulting in a speedup on the total runtime.\n","authors":["Guang Ping He"],"pdf_url":"https://arxiv.org/pdf/2307.08167v2.pdf","comment":"Added a suggestion on improving real quantum computers"},{"id":"http://arxiv.org/abs/2307.11249v1","updated":"2023-07-20T21:49:38Z","published":"2023-07-20T21:49:38Z","title":"On the Fisher-Rao Gradient of the Evidence Lower Bound","summary":" This article studies the Fisher-Rao gradient, also referred to as the natural\ngradient, of the evidence lower bound, the ELBO, which plays a crucial role\nwithin the theory of the Variational Autonecoder, the Helmholtz Machine and the\nFree Energy Principle. The natural gradient of the ELBO is related to the\nnatural gradient of the Kullback-Leibler divergence from a target distribution,\nthe prime objective function of learning. Based on invariance properties of\ngradients within information geometry, conditions on the underlying model are\nprovided that ensure the equivalence of minimising the prime objective function\nand the maximisation of the ELBO.\n","authors":["Nihat Ay","Jesse van Oostrum"],"pdf_url":"https://arxiv.org/pdf/2307.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11242v1","updated":"2023-07-20T21:25:25Z","published":"2023-07-20T21:25:25Z","title":"On-Sensor Data Filtering using Neuromorphic Computing for High Energy\n Physics Experiments","summary":" This work describes the investigation of neuromorphic computing-based spiking\nneural network (SNN) models used to filter data from sensor electronics in high\nenergy physics experiments conducted at the High Luminosity Large Hadron\nCollider. We present our approach for developing a compact neuromorphic model\nthat filters out the sensor data based on the particle's transverse momentum\nwith the goal of reducing the amount of data being sent to the downstream\nelectronics. The incoming charge waveforms are converted to streams of\nbinary-valued events, which are then processed by the SNN. We present our\ninsights on the various system design choices - from data encoding to optimal\nhyperparameters of the training algorithm - for an accurate and compact SNN\noptimized for hardware deployment. Our results show that an SNN trained with an\nevolutionary algorithm and an optimized set of hyperparameters obtains a signal\nefficiency of about 91% with nearly half as many parameters as a deep neural\nnetwork.\n","authors":["Shruti R. Kulkarni","Aaron Young","Prasanna Date","Narasinga Rao Miniskar","Jeffrey S. Vetter","Farah Fahim","Benjamin Parpillon","Jennet Dickinson","Nhan Tran","Jieun Yoo","Corrinne Mills","Morris Swartz","Petar Maksimovic","Catherine D. Schuman","Alice Bean"],"pdf_url":"https://arxiv.org/pdf/2307.11242v1.pdf","comment":"Manuscript accepted at ICONS'23"},{"id":"http://arxiv.org/abs/2307.11239v1","updated":"2023-07-20T21:22:02Z","published":"2023-07-20T21:22:02Z","title":"Edgewise outliers of network indexed signals","summary":" We consider models for network indexed multivariate data involving a\ndependence between variables as well as across graph nodes.\n In the framework of these models, we focus on outliers detection and\nintroduce the concept of edgewise outliers. 
For this purpose, we first derive\nthe distribution of some sums of squares, in particular squared Mahalanobis\ndistances that can be used to fix detection rules and thresholds for outlier\ndetection. We then propose a robust version of the deterministic MCD algorithm\nthat we call edgewise MCD. An application on simulated data shows the interest\nof taking the dependence structure into account. We also illustrate the utility\nof the proposed method with a real data set.\n","authors":["Christopher Rieser","Anne Ruiz-Gazen","Christine Thomas-Agnan"],"pdf_url":"https://arxiv.org/pdf/2307.11239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11234v1","updated":"2023-07-20T21:10:54Z","published":"2023-07-20T21:10:54Z","title":"QDC: Quantum Diffusion Convolution Kernels on Graphs","summary":" Graph convolutional neural networks (GCNs) operate by aggregating messages\nover local neighborhoods given the prediction task under interest. Many GCNs\ncan be understood as a form of generalized diffusion of input features on the\ngraph, and significant work has been dedicated to improving predictive accuracy\nby altering the ways of message passing. In this work, we propose a new\nconvolution kernel that effectively rewires the graph according to the\noccupation correlations of the vertices by trading on the generalized diffusion\nparadigm for the propagation of a quantum particle over the graph. We term this\nnew convolution kernel the Quantum Diffusion Convolution (QDC) operator. In\naddition, we introduce a multiscale variant that combines messages from the QDC\noperator and the traditional combinatorial Laplacian. To understand our method,\nwe explore the spectral dependence of homophily and the importance of quantum\ndynamics in the construction of a bandpass filter. Through these studies, as\nwell as experiments on a range of datasets, we observe that QDC improves\npredictive performance on the widely used benchmark datasets when compared to\nsimilar methods.\n","authors":["Thomas Markovich"],"pdf_url":"https://arxiv.org/pdf/2307.11234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13807v2","updated":"2023-07-20T20:57:08Z","published":"2023-01-31T17:50:52Z","title":"Identifying the Hazard Boundary of ML-enabled Autonomous Systems Using\n Cooperative Co-Evolutionary Search","summary":" In Machine Learning (ML)-enabled autonomous systems (MLASs), it is essential\nto identify the hazard boundary of ML Components (MLCs) in the MLAS under\nanalysis. Given that such boundary captures the conditions in terms of MLC\nbehavior and system context that can lead to hazards, it can then be used to,\nfor example, build a safety monitor that can take any predefined fallback\nmechanisms at runtime when reaching the hazard boundary. However, determining\nsuch hazard boundary for an ML component is challenging. This is due to the\nproblem space combining system contexts (i.e., scenarios) and MLC behaviors\n(i.e., inputs and outputs) being far too large for exhaustive exploration and\neven to handle using conventional metaheuristics, such as genetic algorithms.\nAdditionally, the high computational cost of simulations required to determine\nany MLAS safety violations makes the problem even more challenging.\nFurthermore, it is unrealistic to consider a region in the problem space\ndeterministically safe or unsafe due to the uncontrollable parameters in\nsimulations and the non-linear behaviors of ML models (e.g., deep neural\nnetworks) in the MLAS under analysis. 
To address the challenges, we propose\nMLCSHE (ML Component Safety Hazard Envelope), a novel method based on a\nCooperative Co-Evolutionary Algorithm (CCEA), which aims to tackle a\nhigh-dimensional problem by decomposing it into two lower-dimensional search\nsubproblems. Moreover, we take a probabilistic view of safe and unsafe regions\nand define a novel fitness function to measure the distance from the\nprobabilistic hazard boundary and thus drive the search effectively. We\nevaluate the effectiveness and efficiency of MLCSHE on a complex Autonomous\nVehicle (AV) case study. Our evaluation results show that MLCSHE is\nsignificantly more effective and efficient compared to a standard genetic\nalgorithm and random search.\n","authors":["Sepehr Sharifi","Donghwan Shin","Lionel C. Briand","Nathan Aschbacher"],"pdf_url":"https://arxiv.org/pdf/2301.13807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11228v1","updated":"2023-07-20T20:46:39Z","published":"2023-07-20T20:46:39Z","title":"From Adaptive Query Release to Machine Unlearning","summary":" We formalize the problem of machine unlearning as design of efficient\nunlearning algorithms corresponding to learning algorithms which perform a\nselection of adaptive queries from structured query classes. We give efficient\nunlearning algorithms for linear and prefix-sum query classes. As applications,\nwe show that unlearning in many problems, in particular, stochastic convex\noptimization (SCO), can be reduced to the above, yielding improved guarantees\nfor the problem. In particular, for smooth Lipschitz losses and any $\\rho>0$,\nour results yield an unlearning algorithm with excess population risk of\n$\\tilde O\\big(\\frac{1}{\\sqrt{n}}+\\frac{\\sqrt{d}}{n\\rho}\\big)$ with unlearning\nquery (gradient) complexity $\\tilde O(\\rho \\cdot \\text{Retraining\nComplexity})$, where $d$ is the model dimensionality and $n$ is the initial\nnumber of samples. For non-smooth Lipschitz losses, we give an unlearning\nalgorithm with excess population risk $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}}+\\big(\\frac{\\sqrt{d}}{n\\rho}\\big)^{1/2}\\big)$ with the\nsame unlearning query (gradient) complexity. Furthermore, in the special case\nof Generalized Linear Models (GLMs), such as those in linear and logistic\nregression, we get dimension-independent rates of $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}} +\\frac{1}{(n\\rho)^{2/3}}\\big)$ and $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}} +\\frac{1}{(n\\rho)^{1/3}}\\big)$ for smooth Lipschitz\nand non-smooth Lipschitz losses respectively. Finally, we give generalizations\nof the above from one unlearning request to \\textit{dynamic} streams consisting\nof insertions and deletions.\n","authors":["Enayat Ullah","Raman Arora"],"pdf_url":"https://arxiv.org/pdf/2307.11228v1.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.11214v1","updated":"2023-07-20T19:56:30Z","published":"2023-07-20T19:56:30Z","title":"FairMobi-Net: A Fairness-aware Deep Learning Model for Urban Mobility\n Flow Generation","summary":" Generating realistic human flows across regions is essential for our\nunderstanding of urban structures and population activity patterns, enabling\nimportant applications in the fields of urban planning and management. However,\na notable shortcoming of most existing mobility generation methodologies is\nneglect of prediction fairness, which can result in underestimation of mobility\nflows across regions with vulnerable population groups, potentially resulting\nin inequitable resource distribution and infrastructure development. To\novercome this limitation, our study presents a novel, fairness-aware deep\nlearning model, FairMobi-Net, for inter-region human flow prediction. The\nFairMobi-Net model uniquely incorporates fairness loss into the loss function\nand employs a hybrid approach, merging binary classification and numerical\nregression techniques for human flow prediction. We validate the FairMobi-Net\nmodel using comprehensive human mobility datasets from four U.S. cities,\npredicting human flow at the census-tract level. Our findings reveal that the\nFairMobi-Net model outperforms state-of-the-art models (such as the DeepGravity\nmodel) in producing more accurate and equitable human flow predictions across a\nvariety of region pairs, regardless of regional income differences. The model\nmaintains a high degree of accuracy consistently across diverse regions,\naddressing the previous fairness concern. Further analysis of feature\nimportance elucidates the impact of physical distances and road network\nstructures on human flows across regions. With fairness as its touchstone, the\nmodel and results provide researchers and practitioners across the fields of\nurban sciences, transportation engineering, and computing with an effective\ntool for accurate generation of human mobility flows across regions.\n","authors":["Zhewei Liu","Lipai Huang","Chao Fan","Ali Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2307.11214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11211v1","updated":"2023-07-20T19:53:09Z","published":"2023-07-20T19:53:09Z","title":"The Effect of Epidemiological Cohort Creation on the Machine Learning\n Prediction of Homelessness and Police Interaction Outcomes Using\n Administrative Health Care Data","summary":" Background: Mental illness can lead to adverse outcomes such as homelessness\nand police interaction and understanding of the events leading up to these\nadverse outcomes is important. Predictive models may help identify individuals\nat risk of such adverse outcomes. Using a fixed observation window cohort with\nlogistic regression (LR) or machine learning (ML) models can result in lower\nperformance when compared with adaptive and parcellated windows. 
Method: An\nadministrative healthcare dataset was used, comprising of 240,219 individuals\nin Calgary, Alberta, Canada who were diagnosed with addiction or mental health\n(AMH) between April 1, 2013, and March 31, 2018. The cohort was followed for 2\nyears to identify factors associated with homelessness and police interactions.\nTo understand the benefit of flexible windows to predictive models, an\nalternative cohort was created. Then LR and ML models, including random forests\n(RF), and extreme gradient boosting (XGBoost) were compared in the two cohorts.\nResults: Among 237,602 individuals, 0.8% (1,800) experienced first\nhomelessness, while 0.32% (759) reported initial police interaction among\n237,141 individuals. Male sex (AORs: H=1.51, P=2.52), substance disorder (AORs:\nH=3.70, P=2.83), psychiatrist visits (AORs: H=1.44, P=1.49), and drug abuse\n(AORs: H=2.67, P=1.83) were associated with initial homelessness (H) and police\ninteraction (P). XGBoost showed superior performance using the flexible method\n(sensitivity =91%, AUC =90% for initial homelessness, and sensitivity =90%,\nAUC=89% for initial police interaction)\n Conclusion: This study identified key features associated with initial\nhomelessness and police interaction and demonstrated that flexible windows can\nimprove predictive modeling.\n","authors":["Faezehsadat Shahidi","M. Ethan MacDonald","Dallas Seitz","Geoffrey Messier"],"pdf_url":"https://arxiv.org/pdf/2307.11211v1.pdf","comment":"to be published in Frontiers in Digital Health, Health Informatics"},{"id":"http://arxiv.org/abs/2307.11209v1","updated":"2023-07-20T19:52:14Z","published":"2023-07-20T19:52:14Z","title":"Clinical Trial Active Learning","summary":" This paper presents a novel approach to active learning that takes into\naccount the non-independent and identically distributed (non-i.i.d.) structure\nof a clinical trial setting. There exists two types of clinical trials:\nretrospective and prospective. Retrospective clinical trials analyze data after\ntreatment has been performed; prospective clinical trials collect data as\ntreatment is ongoing. Typically, active learning approaches assume the dataset\nis i.i.d. when selecting training samples; however, in the case of clinical\ntrials, treatment results in a dependency between the data collected at the\ncurrent and past visits. Thus, we propose prospective active learning to\novercome the limitations present in traditional active learning methods and\napply it to disease detection in optical coherence tomography (OCT) images,\nwhere we condition on the time an image was collected to enforce the i.i.d.\nassumption. We compare our proposed method to the traditional active learning\nparadigm, which we refer to as retrospective in nature. We demonstrate that\nprospective active learning outperforms retrospective active learning in two\ndifferent types of test settings.\n","authors":["Zoe Fowler","Kiran Kokilepersaud","Mohit Prabhushankar","Ghassan AlRegib"],"pdf_url":"https://arxiv.org/pdf/2307.11209v1.pdf","comment":"Accepted at 14th ACM International Conference on Bioinformatics,\n Computational Biology and Health Informatics (ACM-BCB)"},{"id":"http://arxiv.org/abs/2307.06324v4","updated":"2023-07-20T19:51:06Z","published":"2023-07-12T17:41:07Z","title":"Provably Faster Gradient Descent via Long Steps","summary":" This work establishes provably faster convergence rates for gradient descent\nin smooth convex optimization via a computer-assisted analysis technique. 
Our\ntheory allows nonconstant stepsize policies with frequent long steps\npotentially violating descent by analyzing the overall effect of many\niterations at once rather than the typical one-iteration inductions used in\nmost first-order method analyses. We show that long steps, which may increase\nthe objective value in the short term, lead to provably faster convergence in\nthe long term. A conjecture towards proving a faster $O(1/T\\log T)$ rate for\ngradient descent is also motivated along with simple numerical validation.\n","authors":["Benjamin Grimmer"],"pdf_url":"https://arxiv.org/pdf/2307.06324v4.pdf","comment":"Apologies for the several updates done shortly after first posting\n this work: In these, I have added more references to excellent relevant works\n I missed in my initial literature review, esp the Master's thesis of Jason\n Altschuler"},{"id":"http://arxiv.org/abs/2210.03297v2","updated":"2023-07-20T19:28:22Z","published":"2022-10-07T03:10:34Z","title":"Preprocessors Matter! Realistic Decision-Based Attacks on Machine\n Learning Systems","summary":" Decision-based attacks construct adversarial examples against a machine\nlearning (ML) model by making only hard-label queries. These attacks have\nmainly been applied directly to standalone neural networks. However, in\npractice, ML models are just one component of a larger learning system. We find\nthat by adding a single preprocessor in front of a classifier, state-of-the-art\nquery-based attacks are up to 7$\\times$ less effective at attacking a\nprediction pipeline than at attacking the model alone. We explain this\ndiscrepancy by the fact that most preprocessors introduce some notion of\ninvariance to the input space. Hence, attacks that are unaware of this\ninvariance inevitably waste a large number of queries to re-discover or\novercome it. We, therefore, develop techniques to (i) reverse-engineer the\npreprocessor and then (ii) use this extracted information to attack the\nend-to-end system. Our preprocessors extraction method requires only a few\nhundred queries, and our preprocessor-aware attacks recover the same efficacy\nas when attacking the model alone. The code can be found at\nhttps://github.com/google-research/preprocessor-aware-black-box-attack.\n","authors":["Chawin Sitawarin","Florian Tramèr","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.03297v2.pdf","comment":"ICML 2023. Code can be found at\n https://github.com/google-research/preprocessor-aware-black-box-attack"},{"id":"http://arxiv.org/abs/2307.11197v1","updated":"2023-07-20T19:20:35Z","published":"2023-07-20T19:20:35Z","title":"Heuristic Hyperparameter Choice for Image Anomaly Detection","summary":" Anomaly detection (AD) in images is a fundamental computer vision problem by\ndeep learning neural network to identify images deviating significantly from\nnormality. The deep features extracted from pretrained models have been proved\nto be essential for AD based on multivariate Gaussian distribution analysis.\nHowever, since models are usually pretrained on a large dataset for\nclassification tasks such as ImageNet, they might produce lots of redundant\nfeatures for AD, which increases computational cost and degrades the\nperformance. We aim to do the dimension reduction of Negated Principal\nComponent Analysis (NPCA) for these features. So we proposed some heuristic to\nchoose hyperparameter of NPCA algorithm for getting as fewer components of\nfeatures as possible while ensuring a good performance.\n","authors":["Zeyu Jiang","João P. C. 
Bertoldo","Etienne Decencière"],"pdf_url":"https://arxiv.org/pdf/2307.11197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03933v2","updated":"2023-07-20T19:12:45Z","published":"2023-06-06T18:01:03Z","title":"High-dimensional and Permutation Invariant Anomaly Detection","summary":" Methods for anomaly detection of new physics processes are often limited to\nlow-dimensional spaces due to the difficulty of learning high-dimensional\nprobability densities. Particularly at the constituent level, incorporating\ndesirable properties such as permutation invariance and variable-length inputs\nbecomes difficult within popular density estimation methods. In this work, we\nintroduce a permutation-invariant density estimator for particle physics data\nbased on diffusion models, specifically designed to handle variable-length\ninputs. We demonstrate the efficacy of our methodology by utilizing the learned\ndensity as a permutation-invariant anomaly detection score, effectively\nidentifying jets with low likelihood under the background-only hypothesis. To\nvalidate our density estimation method, we investigate the ratio of learned\ndensities and compare to those obtained by a supervised classification\nalgorithm.\n","authors":["Vinicius Mikuni","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2306.03933v2.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2212.12606v2","updated":"2023-07-20T18:58:11Z","published":"2022-12-23T22:44:25Z","title":"A Convergence Rate for Manifold Neural Networks","summary":" High-dimensional data arises in numerous applications, and the rapidly\ndeveloping field of geometric deep learning seeks to develop neural network\narchitectures to analyze such data in non-Euclidean domains, such as graphs and\nmanifolds. Recent work by Z. Wang, L. Ruiz, and A. Ribeiro has introduced a\nmethod for constructing manifold neural networks using the spectral\ndecomposition of the Laplace Beltrami operator. Moreover, in this work, the\nauthors provide a numerical scheme for implementing such neural networks when\nthe manifold is unknown and one only has access to finitely many sample points.\nThe authors show that this scheme, which relies upon building a data-driven\ngraph, converges to the continuum limit as the number of sample points tends to\ninfinity. Here, we build upon this result by establishing a rate of convergence\nthat depends on the intrinsic dimension of the manifold but is independent of\nthe ambient dimension. We also discuss how the rate of convergence depends on\nthe depth of the network and the number of filters used in each layer.\n","authors":["Joyce Chew","Deanna Needell","Michael Perlmutter"],"pdf_url":"https://arxiv.org/pdf/2212.12606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11589v2","updated":"2023-07-20T18:48:37Z","published":"2022-10-20T21:01:14Z","title":"Monotonic Risk Relationships under Distribution Shifts for Regularized\n Risk Minimization","summary":" Machine learning systems are often applied to data that is drawn from a\ndifferent distribution than the training distribution. Recent work has shown\nthat for a variety of classification and signal reconstruction problems, the\nout-of-distribution performance is strongly linearly correlated with the\nin-distribution performance. If this relationship or more generally a monotonic\none holds, it has important consequences. For example, it allows to optimize\nperformance on one distribution as a proxy for performance on the other. 
In\nthis paper, we study conditions under which a monotonic relationship between\nthe performances of a model on two distributions is expected. We prove an exact\nasymptotic linear relation for squared error and a monotonic relation for\nmisclassification error for ridge-regularized general linear models under\ncovariate shift, as well as an approximate linear relation for linear inverse\nproblems.\n","authors":["Daniel LeJeune","Jiayu Liu","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2210.11589v2.pdf","comment":"34 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.09782v2","updated":"2023-07-20T18:47:20Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11166v1","updated":"2023-07-20T18:01:48Z","published":"2023-07-20T18:01:48Z","title":"Exploring reinforcement learning techniques for discrete and continuous\n control tasks in the MuJoCo environment","summary":" We leverage the fast physics simulator, MuJoCo to run tasks in a continuous\ncontrol environment and reveal details like the observation space, action\nspace, rewards, etc. for each task. We benchmark value-based methods for\ncontinuous control by comparing Q-learning and SARSA through a discretization\napproach, and using them as baselines, progressively moving into one of the\nstate-of-the-art deep policy gradient method DDPG. Over a large number of\nepisodes, Qlearning outscored SARSA, but DDPG outperformed both in a small\nnumber of episodes. Lastly, we also fine-tuned the model hyper-parameters\nexpecting to squeeze more performance but using lesser time and resources. We\nanticipated that the new design for DDPG would vastly improve performance, yet\nafter only a few episodes, we were able to achieve decent average rewards. 
We\nexpect to improve the performance provided adequate time and computational\nresources.\n","authors":["Vaddadi Sai Rahul","Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11166v1.pdf","comment":"Released @ Dec 2021. For associated project files, see\n https://github.com/chakrabortyde/mujoco-control-tasks"}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.11025v1","updated":"2023-07-20T16:53:41Z","published":"2023-07-20T16:53:41Z","title":"Investigating VTubing as a Reconstruction of Streamer Self-Presentation:\n Identity, Performance, and Gender","summary":" VTubers, or Virtual YouTubers, are live streamers who create streaming\ncontent using animated 2D or 3D virtual avatars. In recent years, there has\nbeen a significant increase in the number of VTuber creators and viewers across\nthe globe. This practise has drawn research attention into topics such as\nviewers' engagement behaviors and perceptions, however, as animated avatars\noffer more identity and performance flexibility than traditional live streaming\nwhere one uses their own body, little research has focused on how this\nflexibility influences how creators present themselves. This research thus\nseeks to fill this gap by presenting results from a qualitative study of 16\nChinese-speaking VTubers' streaming practices. The data revealed that the\nvirtual avatars that were used while live streaming afforded creators\nopportunities to present themselves using inflated presentations and resulted\nin inclusive interactions with viewers. The results also unveiled the inflated,\nand often sexualized, gender expressions of VTubers while they were situated in\nmisogynistic environments. The socio-technical facets of VTubing were found to\npotentially reduce sexual harassment and sexism, whilst also raising\nself-objectification concerns.\n","authors":["Qian Wan","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.11025v1.pdf","comment":"Under review at ACM CSCW after a Major Revision"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. 
Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.10642v1","updated":"2023-07-20T07:12:56Z","published":"2023-07-20T07:12:56Z","title":"RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching\n Detection","summary":" The widespread use of face retouching filters on short-video platforms has\nraised concerns about the authenticity of digital appearances and the impact of\ndeceptive advertising. To address these issues, there is a pressing need to\ndevelop advanced face retouching techniques. However, the lack of large-scale\nand fine-grained face retouching datasets has been a major obstacle to progress\nin this field. In this paper, we introduce RetouchingFFHQ, a large-scale and\nfine-grained face retouching dataset that contains over half a million\nconditionally-retouched images. RetouchingFFHQ stands out from previous\ndatasets due to its large scale, high quality, fine-grainedness, and\ncustomization. By including four typical types of face retouching operations\nand different retouching levels, we extend the binary face retouching detection\ninto a fine-grained, multi-retouching type, and multi-retouching level\nestimation problem. Additionally, we propose a Multi-granularity Attention\nModule (MAM) as a plugin for CNN backbones for enhanced cross-scale\nrepresentation learning. Extensive experiments using different baselines as\nwell as our proposed method on RetouchingFFHQ show decent performance on face\nretouching detection. With the proposed new dataset, we believe there is great\npotential for future work to tackle the challenging problem of real-world\nfine-grained face retouching detection.\n","authors":["Qichao Ying","Jiaxin Liu","Sheng Li","Haisheng Xu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10642v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.03718v4","updated":"2023-07-20T03:06:50Z","published":"2023-06-06T14:28:57Z","title":"Emotion-Conditioned Melody Harmonization with Hierarchical Variational\n Autoencoder","summary":" Existing melody harmonization models have made great progress in improving\nthe quality of generated harmonies, but most of them ignored the emotions\nbeneath the music. Meanwhile, the variability of harmonies generated by\nprevious methods is insufficient. To solve these problems, we propose a novel\nLSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the\ninfluence of emotional conditions on melody harmonization, while improving the\nquality of generated harmonies and capturing the abundant variability of chord\nprogressions. Specifically, LHVAE incorporates latent variables and emotional\nconditions at different levels (piece- and bar-level) to model the global and\nlocal music properties. Additionally, we introduce an attention-based melody\ncontext vector at each step to better learn the correspondence between melodies\nand harmonies. Objective experimental results show that our proposed model\noutperforms other LSTM-based models. 
Through subjective evaluation, we conclude\nthat only altering the types of chords hardly changes the overall emotion of\nthe music. The qualitative analysis demonstrates the ability of our model to\ngenerate variable harmonies.\n","authors":["Shulei Ji","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.03718v4.pdf","comment":"Accepted by IEEE SMC 2023"}]},"2023-07-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.11729v1","updated":"2023-07-21T17:40:47Z","published":"2023-07-21T17:40:47Z","title":"OUTFOX: LLM-generated Essay Detection through In-context Learning with\n Adversarially Generated Examples","summary":" Large Language Models (LLMs) have achieved human-level fluency in text\ngeneration, making it difficult to distinguish between human-written and\nLLM-generated texts. This poses a growing risk of misuse of LLMs and demands\nthe development of detectors to identify LLM-generated texts. However, existing\ndetectors degrade detection accuracy by simply paraphrasing LLM-generated\ntexts. Furthermore, the effectiveness of these detectors in real-life\nsituations, such as when students use LLMs for writing homework assignments\n(e.g., essays) and quickly learn how to evade these detectors, has not been\nexplored. In this paper, we propose OUTFOX, a novel framework that improves the\nrobustness of LLM-generated-text detectors by allowing both the detector and\nthe attacker to consider each other's output and apply this to the domain of\nstudent essays. In our framework, the attacker uses the detector's prediction\nlabels as examples for in-context learning and adversarially generates essays\nthat are harder to detect. While the detector uses the adversarially generated\nessays as examples for in-context learning to learn to detect essays from a\nstrong attacker. Our experiments show that our proposed detector learned\nin-context from the attacker improves the detection performance on the attacked\ndataset by up to +41.3 point F1-score. While our proposed attacker can\ndrastically degrade the performance of the detector by up to -57.0 point\nF1-score compared to the paraphrasing method.\n","authors":["Ryuto Koike","Masahiro Kaneko","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2307.11729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v2","updated":"2023-07-21T16:51:15Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. 
We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06576v3","updated":"2023-07-21T16:06:32Z","published":"2023-07-13T06:25:22Z","title":"Going Beyond Local: Global Graph-Enhanced Personalized News\n Recommendations","summary":" Precisely recommending candidate news articles to users has always been a\ncore challenge for personalized news recommendation systems. Most recent works\nprimarily focus on using advanced natural language processing techniques to\nextract semantic information from rich textual data, employing content-based\nmethods derived from local historical news. However, this approach lacks a\nglobal perspective, failing to account for users' hidden motivations and\nbehaviors beyond semantic information. To address this challenge, we propose a\nnovel model called GLORY (Global-LOcal news Recommendation sYstem), which\ncombines global representations learned from other users with local\nrepresentations to enhance personalized recommendation systems. We accomplish\nthis by constructing a Global-aware Historical News Encoder, which includes a\nglobal news graph and employs gated graph neural networks to enrich news\nrepresentations, thereby fusing historical news representations by a historical\nnews aggregator. Similarly, we extend this approach to a Global Candidate News\nEncoder, utilizing a global entity graph and a candidate news aggregator to\nenhance candidate news representation. Evaluation results on two public news\ndatasets demonstrate that our method outperforms existing approaches.\nFurthermore, our model offers more diverse recommendations.\n","authors":["Boming Yang","Dairui Liu","Toyotaro Suzumura","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2307.06576v3.pdf","comment":"10 pages, Recsys 2023"},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. 
We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11636v1","updated":"2023-07-21T14:58:44Z","published":"2023-07-21T14:58:44Z","title":"OxfordTVG-HIC: Can Machine Make Humorous Captions from Images?","summary":" This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale\ndataset for humour generation and understanding. Humour is an abstract,\nsubjective, and context-dependent cognitive construct involving several\ncognitive factors, making it a challenging task to generate and interpret.\nHence, humour generation and understanding can serve as a new task for\nevaluating the ability of deep-learning methods to process abstract and\nsubjective information. Due to the scarcity of data, humour-related generation\ntasks such as captioning remain under-explored. To address this gap,\nOxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to\ntrain a generalizable humour captioning model. Contrary to existing captioning\ndatasets, OxfordTVG-HIC features a wide range of emotional and semantic\ndiversity resulting in out-of-context examples that are particularly conducive\nto generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive\ncontent. We also show how OxfordTVG-HIC can be leveraged for evaluating the\nhumour of a generated text. Through explainability analysis of the trained\nmodels, we identify the visual and linguistic cues influential for evoking\nhumour prediction (and generation). We observe qualitatively that these cues\nare aligned with the benign violation theory of humour in cognitive psychology.\n","authors":["Runjia Li","Shuyang Sun","Mohamed Elhoseiny","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.11636v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2212.09648v4","updated":"2023-07-21T14:44:45Z","published":"2022-12-19T17:28:22Z","title":"NusaCrowd: Open Source Initiative for Indonesian NLP Resources","summary":" We present NusaCrowd, a collaborative initiative to collect and unify\nexisting resources for Indonesian languages, including opening access to\npreviously non-public resources. Through this initiative, we have brought\ntogether 137 datasets and 118 standardized data loaders. The quality of the\ndatasets has been assessed manually and automatically, and their value is\ndemonstrated through multiple experiments. NusaCrowd's data collection enables\nthe creation of the first zero-shot benchmarks for natural language\nunderstanding and generation in Indonesian and the local languages of\nIndonesia. Furthermore, NusaCrowd brings the creation of the first multilingual\nautomatic speech recognition benchmark in Indonesian and the local languages of\nIndonesia. Our work strives to advance natural language processing (NLP)\nresearch for languages that are under-represented despite being widely spoken.\n","authors":["Samuel Cahyawijaya","Holy Lovenia","Alham Fikri Aji","Genta Indra Winata","Bryan Wilie","Rahmad Mahendra","Christian Wibisono","Ade Romadhony","Karissa Vincentio","Fajri Koto","Jennifer Santoso","David Moeljadi","Cahya Wirawan","Frederikus Hudi","Ivan Halim Parmonangan","Ika Alfina","Muhammad Satrio Wicaksono","Ilham Firdausi Putra","Samsul Rahmadani","Yulianti Oenang","Ali Akbar Septiandri","James Jaya","Kaustubh D. 
Dhole","Arie Ardiyanti Suryani","Rifki Afina Putri","Dan Su","Keith Stevens","Made Nindyatama Nityasya","Muhammad Farid Adilazuarda","Ryan Ignatius","Ryandito Diandaru","Tiezheng Yu","Vito Ghifari","Wenliang Dai","Yan Xu","Dyah Damapuspita","Cuk Tho","Ichwanul Muslim Karo Karo","Tirana Noor Fatyanosa","Ziwei Ji","Pascale Fung","Graham Neubig","Timothy Baldwin","Sebastian Ruder","Herry Sujaini","Sakriani Sakti","Ayu Purwarianti"],"pdf_url":"https://arxiv.org/pdf/2212.09648v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11610v1","updated":"2023-07-21T14:25:39Z","published":"2023-07-21T14:25:39Z","title":"CausE: Towards Causal Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) focuses on representing the entities and\nrelations of a knowledge graph (KG) into the continuous vector spaces, which\ncan be employed to predict the missing triples to achieve knowledge graph\ncompletion (KGC). However, KGE models often only briefly learn structural\ncorrelations of triple data and embeddings would be misled by the trivial\npatterns and noisy links in real-world KGs. To address this issue, we build the\nnew paradigm of KGE in the context of causality and embedding disentanglement.\nWe further propose a Causality-enhanced knowledge graph Embedding (CausE)\nframework. CausE employs causal intervention to estimate the causal effect of\nthe confounder embeddings and design new training objectives to make stable\npredictions. Experimental results demonstrate that CausE could outperform the\nbaseline models and achieve state-of-the-art KGC performance. We release our\ncode in https://github.com/zjukg/CausE.\n","authors":["Yichi Zhang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.00841v3","updated":"2023-07-21T14:03:40Z","published":"2021-07-02T05:29:39Z","title":"ClueReader: Heterogeneous Graph Attention Network for Multi-hop Machine\n Reading Comprehension","summary":" Multi-hop machine reading comprehension is a challenging task in natural\nlanguage processing as it requires more reasoning ability across multiple\ndocuments. Spectral models based on graph convolutional networks have shown\ngood inferring abilities and lead to competitive results. However, the analysis\nand reasoning of some are inconsistent with those of humans. Inspired by the\nconcept of grandmother cells in cognitive neuroscience, we propose a\nheterogeneous graph attention network model named ClueReader to imitate the\ngrandmother cell concept. The model is designed to assemble the semantic\nfeatures in multi-level representations and automatically concentrate or\nalleviate information for reasoning through the attention mechanism. The name\nClueReader is a metaphor for the pattern of the model: it regards the subjects\nof queries as the starting points of clues, takes the reasoning entities as\nbridge points, considers the latent candidate entities as grandmother cells,\nand the clues end up in candidate entities. The proposed model enables the\nvisualization of the reasoning graph, making it possible to analyze the\nimportance of edges connecting entities and the selectivity in the mention and\ncandidate nodes, which is easier to comprehend empirically. 
Evaluations on the\nopen-domain multi-hop reading dataset WikiHop and drug-drug interaction dataset\nMedHop proved the validity of ClueReader and showed the feasibility of its\napplication of the model in the molecular biology domain.\n","authors":["Peng Gao","Feng Gao","Peng Wang","Jian-Cheng Ni","Fei Wang","Hamido Fujita"],"pdf_url":"https://arxiv.org/pdf/2107.00841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11584v1","updated":"2023-07-21T13:48:11Z","published":"2023-07-21T13:48:11Z","title":"A Change of Heart: Improving Speech Emotion Recognition through\n Speech-to-Text Modality Conversion","summary":" Speech Emotion Recognition (SER) is a challenging task. In this paper, we\nintroduce a modality conversion concept aimed at enhancing emotion recognition\nperformance on the MELD dataset. We assess our approach through two\nexperiments: first, a method named Modality-Conversion that employs automatic\nspeech recognition (ASR) systems, followed by a text classifier; second, we\nassume perfect ASR output and investigate the impact of modality conversion on\nSER, this method is called Modality-Conversion++. Our findings indicate that\nthe first method yields substantial results, while the second method\noutperforms state-of-the-art (SOTA) speech-based approaches in terms of SER\nweighted-F1 (WF1) score on the MELD dataset. This research highlights the\npotential of modality conversion for tasks that can be conducted in alternative\nmodalities.\n","authors":["Zeinab Sadat Taghavi","Ali Satvaty","Hossein Sameti"],"pdf_url":"https://arxiv.org/pdf/2307.11584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11558v1","updated":"2023-07-21T13:06:02Z","published":"2023-07-21T13:06:02Z","title":"Advancing Visual Grounding with Scene Knowledge: Benchmark and Method","summary":" Visual grounding (VG) aims to establish fine-grained alignment between vision\nand language. Ideally, it can be a testbed for vision-and-language models to\nevaluate their understanding of the images and texts and their reasoning\nabilities over their joint space. However, most existing VG datasets are\nconstructed using simple description texts, which do not require sufficient\nreasoning over the images and texts. This has been demonstrated in a recent\nstudy~\\cite{luo2022goes}, where a simple LSTM-based text encoder without\npretraining can achieve state-of-the-art performance on mainstream VG datasets.\nTherefore, in this paper, we propose a novel benchmark of \\underline{S}cene\n\\underline{K}nowledge-guided \\underline{V}isual \\underline{G}rounding (SK-VG),\nwhere the image content and referring expressions are not sufficient to ground\nthe target objects, forcing the models to have a reasoning ability on the\nlong-form scene knowledge. To perform this task, we propose two approaches to\naccept the triple-type input, where the former embeds knowledge into the image\nfeatures before the image-query interaction; the latter leverages linguistic\nstructure to assist in computing the image-text matching. We conduct extensive\nexperiments to analyze the above methods and show that the proposed approaches\nachieve promising results but still leave room for improvement, including\nperformance and interpretability. The dataset and code are available at\n\\url{https://github.com/zhjohnchan/SK-VG}.\n","authors":["Zhihong Chen","Ruifei Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11558v1.pdf","comment":"Computer Vision and Natural Language Processing. 
21 pages, 14\n figures. CVPR-2023"},{"id":"http://arxiv.org/abs/2307.11545v1","updated":"2023-07-21T12:46:15Z","published":"2023-07-21T12:46:15Z","title":"Bridging Vision and Language Encoders: Parameter-Efficient Tuning for\n Referring Image Segmentation","summary":" Parameter Efficient Tuning (PET) has gained attention for reducing the number\nof parameters while maintaining performance and providing better hardware\nresource savings, but few studies investigate dense prediction tasks and\ninteraction between modalities. In this paper, we do an investigation of\nefficient tuning problems on referring image segmentation. We propose a novel\nadapter called Bridger to facilitate cross-modal information exchange and\ninject task-specific information into the pre-trained model. We also design a\nlightweight decoder for image segmentation. Our approach achieves comparable or\nsuperior performance with only 1.61\\% to 3.38\\% backbone parameter updates,\nevaluated on challenging benchmarks. The code is available at\n\\url{https://github.com/kkakkkka/ETRIS}.\n","authors":["Zunnan Xu","Zhihong Chen","Yong Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11545v1.pdf","comment":"Computer Vision and Natural Language Processing. 14 pages, 8 figures.\n ICCV-2023"},{"id":"http://arxiv.org/abs/2307.11516v1","updated":"2023-07-21T11:54:53Z","published":"2023-07-21T11:54:53Z","title":"IndigoVX: Where Human Intelligence Meets AI for Optimal Decision Making","summary":" This paper defines a new approach for augmenting human intelligence with AI\nfor optimal goal solving. Our proposed AI, Indigo, is an acronym for Informed\nNumerical Decision-making through Iterative Goal-Oriented optimization. When\ncombined with a human collaborator, we term the joint system IndigoVX, for\nVirtual eXpert. The system is conceptually simple. We envisage this method\nbeing applied to games or business strategies, with the human providing\nstrategic context and the AI offering optimal, data-driven moves. Indigo\noperates through an iterative feedback loop, harnessing the human expert's\ncontextual knowledge and the AI's data-driven insights to craft and refine\nstrategies towards a well-defined goal. Using a quantified three-score schema,\nthis hybridization allows the combined team to evaluate strategies and refine\ntheir plan, while adapting to challenges and changes in real-time.\n","authors":["Kais Dukes"],"pdf_url":"https://arxiv.org/pdf/2307.11516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.04900v2","updated":"2023-07-21T11:04:29Z","published":"2021-05-11T09:41:25Z","title":"Forecasting consumer confidence through semantic network analysis of\n online news","summary":" This research studies the impact of online news on social and economic\nconsumer perceptions through semantic network analysis. Using over 1.8 million\nonline articles on Italian media covering four years, we calculate the semantic\nimportance of specific economic-related keywords to see if words appearing in\nthe articles could anticipate consumers' judgments about the economic situation\nand the Consumer Confidence Index. We use an innovative approach to analyze big\ntextual data, combining methods and tools of text mining and social network\nanalysis. Results show a strong predictive power for the judgments about the\ncurrent households and national situation. 
Our indicator offers a complementary\napproach to estimating consumer confidence, lessening the limitations of\ntraditional survey-based methods.\n","authors":["A. Fronzetti Colladon","F. Grippa","B. Guardabascio","G. Costante","F. Ravazzolo"],"pdf_url":"https://arxiv.org/pdf/2105.04900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12851v2","updated":"2023-07-21T10:22:53Z","published":"2023-05-22T09:20:58Z","title":"Enhancing Coherence of Extractive Summarization with Multitask Learning","summary":" This study proposes a multitask learning architecture for extractive\nsummarization with coherence boosting. The architecture contains an extractive\nsummarizer and coherent discriminator module. The coherent discriminator is\ntrained online on the sentence vectors of the augmented textual input, thus\nimproving its general ability of judging whether the input sentences are\ncoherent. Meanwhile, we maximize the coherent scores from the coherent\ndiscriminator by updating the parameters of the summarizer. To make the\nextractive sentences trainable in a differentiable manner, we introduce two\nstrategies, including pre-trained converting model (model-based) and converting\nmatrix (MAT-based) that merge sentence representations. Experiments show that\nour proposed method significantly improves the proportion of consecutive\nsentences in the extracted summaries based on their positions in the original\narticle (i.e., automatic sentence-level coherence metric), while the goodness\nin terms of other automatic metrics (i.e., Rouge scores and BertScores) are\npreserved. Human evaluation also evidences the improvement of coherence and\nconsistency of the extracted summaries given by our method.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2305.12851v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.11457v1","updated":"2023-07-21T09:39:50Z","published":"2023-07-21T09:39:50Z","title":"Incorporating Human Translator Style into English-Turkish Literary\n Machine Translation","summary":" Although machine translation systems are mostly designed to serve in the\ngeneral domain, there is a growing tendency to adapt these systems to other\ndomains like literary translation. In this paper, we focus on English-Turkish\nliterary translation and develop machine translation models that take into\naccount the stylistic features of translators. We fine-tune a pre-trained\nmachine translation model by the manually-aligned works of a particular\ntranslator. We make a detailed analysis of the effects of manual and automatic\nalignments, data augmentation methods, and corpus size on the translations. We\npropose an approach based on stylistic features to evaluate the style of a\ntranslator in the output translations. 
We show that the human translator style\ncan be highly recreated in the target machine translations by adapting the\nmodels to the style of the translator.\n","authors":["Zeynep Yirmibeşoğlu","Olgun Dursun","Harun Dallı","Mehmet Şahin","Ena Hodzik","Sabri Gürses","Tunga Güngör"],"pdf_url":"https://arxiv.org/pdf/2307.11457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11450v1","updated":"2023-07-21T09:30:46Z","published":"2023-07-21T09:30:46Z","title":"Topic Identification For Spontaneous Speech: Enriching Audio Features\n With Embedded Linguistic Information","summary":" Traditional topic identification solutions from audio rely on an automatic\nspeech recognition system (ASR) to produce transcripts used as input to a\ntext-based model. These approaches work well in high-resource scenarios, where\nthere are sufficient data to train both components of the pipeline. However, in\nlow-resource situations, the ASR system, even if available, produces\nlow-quality transcripts, leading to a bad text-based classifier. Moreover,\nspontaneous speech containing hesitations can further degrade the performance\nof the ASR model. In this paper, we investigate alternatives to the standard\ntext-only solutions by comparing audio-only and hybrid techniques of jointly\nutilising text and audio features. The models evaluated on spontaneous Finnish\nspeech demonstrate that purely audio-based solutions are a viable option when\nASR components are not available, while the hybrid multi-modal solutions\nachieve the best results.\n","authors":["Dejan Porjazovski","Tamás Grósz","Mikko Kurimo"],"pdf_url":"https://arxiv.org/pdf/2307.11450v1.pdf","comment":"Accepted to EUSIPCO 2023"},{"id":"http://arxiv.org/abs/2306.14096v3","updated":"2023-07-21T08:57:38Z","published":"2023-06-25T02:24:30Z","title":"Chinese Fine-Grained Financial Sentiment Analysis with Large Language\n Models","summary":" Entity-level fine-grained sentiment analysis in the financial domain is a\ncrucial subtask of sentiment analysis and currently faces numerous challenges.\nThe primary challenge stems from the lack of high-quality and large-scale\nannotated corpora specifically designed for financial text sentiment analysis,\nwhich in turn limits the availability of data necessary for developing\neffective text processing techniques. Recent advancements in large language\nmodels (LLMs) have yielded remarkable performance in natural language\nprocessing tasks, primarily centered around language pattern matching. In this\npaper, we propose a novel and extensive Chinese fine-grained financial\nsentiment analysis dataset, FinChina SA, for enterprise early warning. We\nthoroughly evaluate and experiment with well-known existing open-source LLMs\nusing our dataset. We firmly believe that our dataset will serve as a valuable\nresource to advance the exploration of real-world financial sentiment analysis\ntasks, which should be the focus of future research. 
Our dataset and all code\nto replicate the experimental results will be released.\n","authors":["Yinyu Lan","Yanru Wu","Wang Xu","Weiqiang Feng","Youhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14096v3.pdf","comment":"FinLLM Symposium at IJCAI 2023"},{"id":"http://arxiv.org/abs/2306.02250v2","updated":"2023-07-21T07:46:03Z","published":"2023-06-04T03:46:45Z","title":"Large Language Model Augmented Narrative Driven Recommendations","summary":" Narrative-driven recommendation (NDR) presents an information access problem\nwhere users solicit recommendations with verbose descriptions of their\npreferences and context, for example, travelers soliciting recommendations for\npoints of interest while describing their likes/dislikes and travel\ncircumstances. These requests are increasingly important with the rise of\nnatural language-based conversational interfaces for search and recommendation\nsystems. However, NDR lacks abundant training data for models, and current\nplatforms commonly do not support these requests. Fortunately, classical\nuser-item interaction datasets contain rich textual data, e.g., reviews, which\noften describe user preferences and context - this may be used to bootstrap\ntraining for NDR models. In this work, we explore using large language models\n(LLMs) for data augmentation to train NDR models. We use LLMs for authoring\nsynthetic narrative queries from user-item interactions with few-shot prompting\nand train retrieval models for NDR on synthetic queries and user-item\ninteraction data. Our experiments demonstrate that this is an effective\nstrategy for training small-parameter retrieval models that outperform other\nretrieval and LLM baselines for narrative-driven recommendation.\n","authors":["Sheshera Mysore","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2306.02250v2.pdf","comment":"RecSys 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. 
Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11394v1","updated":"2023-07-21T07:22:18Z","published":"2023-07-21T07:22:18Z","title":"MeetEval: A Toolkit for Computation of Word Error Rates for Meeting\n Transcription Systems","summary":" MeetEval is an open-source toolkit to evaluate all kinds of meeting\ntranscription systems. It provides a unified interface for the computation of\ncommonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER\nalong other WER definitions. We extend the cpWER computation by a temporal\nconstraint to ensure that only words are identified as correct when the\ntemporal alignment is plausible. This leads to a better quality of the matching\nof the hypothesis string to the reference string that more closely resembles\nthe actual transcription quality, and a system is penalized if it provides poor\ntime annotations. Since word-level timing information is often not available,\nwe present a way to approximate exact word-level timings from segment-level\ntimings (e.g., a sentence) and show that the approximation leads to a similar\nWER as a matching with exact word-level annotations. At the same time, the time\nconstraint leads to a speedup of the matching algorithm, which outweighs the\nadditional overhead caused by processing the time stamps.\n","authors":["Thilo von Neumann","Christoph Boeddeker","Marc Delcroix","Reinhold Haeb-Umbach"],"pdf_url":"https://arxiv.org/pdf/2307.11394v1.pdf","comment":"Accepted for presentation at the Chime7 workshop 2023"},{"id":"http://arxiv.org/abs/2306.17519v2","updated":"2023-07-21T06:57:49Z","published":"2023-06-30T10:12:30Z","title":"GPT-FinRE: In-context Learning for Financial Relation Extraction using\n Large Language Models","summary":" Relation extraction (RE) is a crucial task in natural language processing\n(NLP) that aims to identify and classify relationships between entities\nmentioned in text. In the financial domain, relation extraction plays a vital\nrole in extracting valuable information from financial documents, such as news\narticles, earnings reports, and company filings. This paper describes our\nsolution to relation extraction on one such dataset REFinD. The dataset was\nreleased along with shared task as a part of the Fourth Workshop on Knowledge\nDiscovery from Unstructured Data in Financial Services, co-located with SIGIR\n2023. In this paper, we employed OpenAI models under the framework of\nin-context learning (ICL). We utilized two retrieval strategies to find top K\nrelevant in-context learning demonstrations / examples from training data for a\ngiven test example. The first retrieval mechanism, we employed, is a\nlearning-free dense retriever and the other system is a learning-based\nretriever. We were able to achieve 3rd rank overall. 
Our best F1-score is\n0.718.\n","authors":["Pawan Kumar Rajpoot","Ankur Parikh"],"pdf_url":"https://arxiv.org/pdf/2306.17519v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.02105 by other authors"},{"id":"http://arxiv.org/abs/2307.11380v1","updated":"2023-07-21T06:38:37Z","published":"2023-07-21T06:38:37Z","title":"Is ChatGPT Involved in Texts? Measure the Polish Ratio to Detect\n ChatGPT-Generated Text","summary":" The remarkable capabilities of large-scale language models, such as ChatGPT,\nin text generation have incited awe and spurred researchers to devise detectors\nto mitigate potential risks, including misinformation, phishing, and academic\ndishonesty. Despite this, most previous studies, including HC3, have been\npredominantly geared towards creating detectors that differentiate between\npurely ChatGPT-generated texts and human-authored texts. This approach,\nhowever, fails to work on discerning texts generated through human-machine\ncollaboration, such as ChatGPT-polished texts. Addressing this gap, we\nintroduce a novel dataset termed HPPT (ChatGPT-polished academic abstracts),\nfacilitating the construction of more robust detectors. It diverges from extant\ncorpora by comprising pairs of human-written and ChatGPT-polished abstracts\ninstead of purely ChatGPT-generated texts. Additionally, we propose the \"Polish\nRatio\" method, an innovative measure of ChatGPT's involvement in text\ngeneration based on editing distance. It provides a mechanism to measure the\ndegree of human originality in the resulting text. Our experimental results\nshow our proposed model has better robustness on the HPPT dataset and two\nexisting datasets (HC3 and CDB). Furthermore, the \"Polish Ratio\" we proposed\noffers a more comprehensive explanation by quantifying the degree of ChatGPT\ninvolvement, which indicates that a Polish Ratio value greater than 0.2\nsignifies ChatGPT involvement and a value exceeding 0.6 implies that ChatGPT\ngenerates most of the text.\n","authors":["Lingyi Yang","Feng Jiang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2307.11380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11346v1","updated":"2023-07-21T04:43:00Z","published":"2023-07-21T04:43:00Z","title":"CohortGPT: An Enhanced GPT for Participant Recruitment in Clinical Study","summary":" Participant recruitment based on unstructured medical texts such as clinical\nnotes and radiology reports has been a challenging yet important task for the\ncohort establishment in clinical research. Recently, Large Language Models\n(LLMs) such as ChatGPT have achieved tremendous success in various downstream\ntasks thanks to their promising performance in language understanding,\ninference, and generation. It is then natural to test their feasibility in\nsolving the cohort recruitment task, which involves the classification of a\ngiven paragraph of medical text into disease label(s). However, when applied to\nknowledge-intensive problem settings such as medical text classification, where\nthe LLMs are expected to understand the decision made by human experts and\naccurately identify the implied disease labels, the LLMs show a mediocre\nperformance. A possible explanation is that, by only using the medical text,\nthe LLMs neglect to use the rich context of additional information that\nlanguages afford. To this end, we propose to use a knowledge graph as auxiliary\ninformation to guide the LLMs in making predictions. 
Moreover, to further boost\nthe LLMs adapt to the problem setting, we apply a chain-of-thought (CoT) sample\nselection strategy enhanced by reinforcement learning, which selects a set of\nCoT samples given each individual medical report. Experimental results and\nvarious ablation studies show that our few-shot learning method achieves\nsatisfactory performance compared with fine-tuning strategies and gains superb\nadvantages when the available data is limited. The code and sample dataset of\nthe proposed CohortGPT model is available at:\nhttps://anonymous.4open.science/r/CohortGPT-4872/\n","authors":["Zihan Guan","Zihao Wu","Zhengliang Liu","Dufan Wu","Hui Ren","Quanzheng Li","Xiang Li","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11346v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.11344v1","updated":"2023-07-21T04:22:43Z","published":"2023-07-21T04:22:43Z","title":"DEFTri: A Few-Shot Label Fused Contextual Representation Learning For\n Product Defect Triage in e-Commerce","summary":" Defect Triage is a time-sensitive and critical process in a large-scale agile\nsoftware development lifecycle for e-commerce. Inefficiencies arising from\nhuman and process dependencies in this domain have motivated research in\nautomated approaches using machine learning to accurately assign defects to\nqualified teams. This work proposes a novel framework for automated defect\ntriage (DEFTri) using fine-tuned state-of-the-art pre-trained BERT on labels\nfused text embeddings to improve contextual representations from\nhuman-generated product defects. For our multi-label text classification defect\ntriage task, we also introduce a Walmart proprietary dataset of product defects\nusing weak supervision and adversarial learning, in a few-shot setting.\n","authors":["Ipsita Mohanty"],"pdf_url":"https://arxiv.org/pdf/2307.11344v1.pdf","comment":"In Proceedings of the Fifth Workshop on e-Commerce and NLP ECNLP 5\n 2022 Pages 1-7"},{"id":"http://arxiv.org/abs/2307.11316v1","updated":"2023-07-21T02:51:41Z","published":"2023-07-21T02:51:41Z","title":"Making Pre-trained Language Models both Task-solvers and\n Self-calibrators","summary":" Pre-trained language models (PLMs) serve as backbones for various real-world\nsystems. For high-stake applications, it's equally essential to have reasonable\nconfidence estimations in predictions. While the vanilla confidence scores of\nPLMs can already be effectively utilized, PLMs consistently become\noverconfident in their wrong predictions, which is not desirable in practice.\nPrevious work shows that introducing an extra calibration task can mitigate\nthis issue. The basic idea involves acquiring additional data to train models\nin predicting the confidence of their initial predictions. However, it only\ndemonstrates the feasibility of this kind of method, assuming that there are\nabundant extra available samples for the introduced calibration task. In this\nwork, we consider the practical scenario that we need to effectively utilize\ntraining samples to make PLMs both task-solvers and self-calibrators. Three\nchallenges are presented, including limited training samples, data imbalance,\nand distribution shifts. We first conduct pilot experiments to quantify various\ndecisive factors in the calibration task. 
Based on the empirical analysis\nresults, we propose a training algorithm LM-TOAST to tackle the challenges.\nExperimental results show that LM-TOAST can effectively utilize the training\ndata to make PLMs have reasonable confidence estimations while maintaining the\noriginal task performance. Further, we consider three downstream applications,\nnamely selective classification, adversarial defense, and model cascading, to\nshow the practical usefulness of LM-TOAST. The code will be made public at\n\\url{https://github.com/Yangyi-Chen/LM-TOAST}.\n","authors":["Yangyi Chen","Xingyao Wang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2307.11316v1.pdf","comment":"Accepted to Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2307.11315v1","updated":"2023-07-21T02:47:18Z","published":"2023-07-21T02:47:18Z","title":"Generating Image-Specific Text Improves Fine-grained Image\n Classification","summary":" Recent vision-language models outperform vision-only models on many image\nclassification tasks. However, because of the absence of paired text/image\ndescriptions, it remains difficult to fine-tune these models for fine-grained\nimage classification. In this work, we propose a method, GIST, for generating\nimage-specific fine-grained text descriptions from image-only datasets, and\nshow that these text descriptions can be used to improve classification. Key\nparts of our method include 1. prompting a pretrained large language model with\ndomain-specific prompts to generate diverse fine-grained text descriptions for\neach class and 2. using a pretrained vision-language model to match each image\nto label-preserving text descriptions that capture relevant visual features in\nthe image. We demonstrate the utility of GIST by fine-tuning vision-language\nmodels on the image-and-generated-text pairs to learn an aligned\nvision-language representation space for improved classification. We evaluate\nour learned representation space in full-shot and few-shot scenarios across\nfour diverse fine-grained classification datasets, each from a different\ndomain. Our method achieves an average improvement of $4.1\\%$ in accuracy over\nCLIP linear probes and an average of $1.1\\%$ improvement in accuracy over the\nprevious state-of-the-art image-text classification method on the full-shot\ndatasets. Our method achieves similar improvements across few-shot regimes.\nCode is available at https://github.com/emu1729/GIST.\n","authors":["Emily Mu","Kathleen M. Lewis","Adrian V. Dalca","John Guttag"],"pdf_url":"https://arxiv.org/pdf/2307.11315v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2307.10291v2","updated":"2023-07-21T02:34:58Z","published":"2023-07-18T14:30:36Z","title":"Mutual Reinforcement Effects in Japanese Sentence Classification and\n Named Entity Recognition Tasks","summary":" Information extraction(IE) is a crucial subfield within natural language\nprocessing. However, for the traditionally segmented approach to sentence\nclassification and Named Entity Recognition, the intricate interactions between\nthese individual subtasks remain largely uninvestigated. In this study, we\npropose an integrative analysis, converging sentence classification with Named\nEntity Recognition, with the objective to unveil and comprehend the mutual\nreinforcement effect within these two information extraction subtasks. 
To\nachieve this, we introduce a Sentence Classification and Named Entity\nRecognition Multi-task (SCNM) approach that combines Sentence Classification\n(SC) and Named Entity Recognition (NER). We develop a Sentence-to-Label\nGeneration (SLG) framework for SCNM and construct a Wikipedia dataset\ncontaining both SC and NER. Using a format converter, we unify input formats\nand employ a generative model to generate SC-labels, NER-labels, and associated\ntext segments. We propose a Constraint Mechanism (CM) to improve generated\nformat accuracy. Our results show SC accuracy increased by 1.13 points and NER\nby 1.06 points in SCNM compared to standalone tasks, with CM raising format\naccuracy from 63.61 to 100. The findings indicate mutual reinforcement effects\nbetween SC and NER, and integration enhances both tasks' performance. We\nadditionally implemented the SLG framework on single SC task. It yielded\nsuperior accuracies compared to the baseline on two distinct Japanese SC\ndatasets. Notably, in the experiment of few-shot learning, SLG framework shows\nmuch better performance than fine-tune method. These empirical findings\ncontribute additional evidence to affirm the efficacy of the SLG framework.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2307.10291v2.pdf","comment":"25 pages, 12 figures, 19 tables. arXiv admin note: substantial text\n overlap with arXiv:2306.15978"},{"id":"http://arxiv.org/abs/2307.10432v2","updated":"2023-07-21T02:22:14Z","published":"2023-07-19T19:40:34Z","title":"PharmacyGPT: The AI Pharmacist","summary":" In this study, we introduce PharmacyGPT, a novel framework to assess the\ncapabilities of large language models (LLMs) such as ChatGPT and GPT-4 in\nemulating the role of clinical pharmacists. Our methodology encompasses the\nutilization of LLMs to generate comprehensible patient clusters, formulate\nmedication plans, and forecast patient outcomes. We conduct our investigation\nusing real data acquired from the intensive care unit (ICU) at the University\nof North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable\ninsights into the potential applications and limitations of LLMs in the field\nof clinical pharmacy, with implications for both patient care and the\ndevelopment of future AI-driven healthcare solutions. By evaluating the\nperformance of PharmacyGPT, we aim to contribute to the ongoing discourse\nsurrounding the integration of artificial intelligence in healthcare settings,\nultimately promoting the responsible and efficacious use of such technologies.\n","authors":["Zhengliang Liu","Zihao Wu","Mengxuan Hu","Bokai Zhao","Lin Zhao","Tianyi Zhang","Haixing Dai","Xianyan Chen","Ye Shen","Sheng Li","Brian Murray","Tianming Liu","Andrea Sikora"],"pdf_url":"https://arxiv.org/pdf/2307.10432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13971v2","updated":"2023-07-21T01:58:13Z","published":"2023-06-24T13:57:32Z","title":"Towards Robust Aspect-based Sentiment Analysis through\n Non-counterfactual Augmentations","summary":" While state-of-the-art NLP models have demonstrated excellent performance for\naspect based sentiment analysis (ABSA), substantial evidence has been presented\non their lack of robustness. This is especially manifested as significant\ndegradation in performance when faced with out-of-distribution data. 
Recent\nsolutions that rely on counterfactually augmented datasets show promising\nresults, but they are inherently limited because of the lack of access to\nexplicit causal structure. In this paper, we present an alternative approach\nthat relies on non-counterfactual data augmentation. Our proposal instead\nrelies on using noisy, cost-efficient data augmentations that preserve\nsemantics associated with the target aspect. Our approach then relies on\nmodelling invariances between different versions of the data to improve\nrobustness. A comprehensive suite of experiments shows that our proposal\nsignificantly improves upon strong pre-trained baselines on both standard and\nrobustness-specific datasets. Our approach further establishes a new\nstate-of-the-art on the ABSA robustness benchmark and transfers well across\ndomains.\n","authors":["Xinyu Liu","Yan Ding","Kaikai An","Chunyang Xiao","Pranava Madhyastha","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2306.13971v2.pdf","comment":"10pages,1 figure,10 tables"},{"id":"http://arxiv.org/abs/2307.11278v1","updated":"2023-07-21T00:34:38Z","published":"2023-07-21T00:34:38Z","title":"Generator-Retriever-Generator: A Novel Approach to Open-domain Question\n Answering","summary":" Open-domain question answering (QA) tasks usually require the retrieval of\nrelevant information from a large corpus to generate accurate answers. We\npropose a novel approach called Generator-Retriever-Generator (GRG) that\ncombines document retrieval techniques with a large language model (LLM), by\nfirst prompting the model to generate contextual documents based on a given\nquestion. In parallel, a dual-encoder network retrieves documents that are\nrelevant to the question from an external corpus. The generated and retrieved\ndocuments are then passed to the second LLM, which generates the final answer.\nBy combining document retrieval and LLM generation, our approach addresses the\nchallenges of open-domain QA, such as generating informative and contextually\nrelevant answers. GRG outperforms the state-of-the-art generate-then-read and\nretrieve-then-read pipelines (GENREAD and RFiD) improving their performance at\nleast by +5.2, +4.2, and +1.6 on TriviaQA, NQ, and WebQ datasets, respectively.\nWe provide code, datasets, and checkpoints\n\\footnote{\\url{https://github.com/abdoelsayed2016/GRG}}\n","authors":["Abdelrahman Abdallah","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2307.11278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08584v3","updated":"2023-07-21T22:08:45Z","published":"2022-11-15T23:57:34Z","title":"Toward expanding the scope of radiology report summarization to multiple\n anatomies and modalities","summary":" Radiology report summarization (RRS) is a growing area of research. Given the\nFindings section of a radiology report, the goal is to generate a summary\n(called an Impression section) that highlights the key observations and\nconclusions of the radiology study. However, RRS currently faces essential\nlimitations.First, many prior studies conduct experiments on private datasets,\npreventing reproduction of results and fair comparisons across different\nsystems and solutions. Second, most prior approaches are evaluated solely on\nchest X-rays. To address these limitations, we propose a dataset (MIMIC-RRS)\ninvolving three new modalities and seven new anatomies based on the MIMIC-III\nand MIMIC-CXR datasets. 
We then conduct extensive experiments to evaluate the\nperformance of models both within and across modality-anatomy pairs in\nMIMIC-RRS. In addition, we evaluate their clinical efficacy via RadGraph, a\nfactual correctness metric.\n","authors":["Zhihong Chen","Maya Varma","Xiang Wan","Curtis Langlotz","Jean-Benoit Delbrouck"],"pdf_url":"https://arxiv.org/pdf/2211.08584v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11922v1","updated":"2023-07-21T22:02:50Z","published":"2023-07-21T22:02:50Z","title":"Selective Perception: Optimizing State Descriptions with Reinforcement\n Learning for Language Model Actors","summary":" Large language models (LLMs) are being applied as actors for sequential\ndecision making tasks in domains such as robotics and games, utilizing their\ngeneral world knowledge and planning abilities. However, previous work does\nlittle to explore what environment state information is provided to LLM actors\nvia language. Exhaustively describing high-dimensional states can impair\nperformance and raise inference costs for LLM actors. Previous LLM actors avoid\nthe issue by relying on hand-engineered, task-specific protocols to determine\nwhich features to communicate about a state and which to leave out. In this\nwork, we propose Brief Language INputs for DEcision-making Responses (BLINDER),\na method for automatically selecting concise state descriptions by learning a\nvalue function for task-conditioned state descriptions. We evaluate BLINDER on\nthe challenging video game NetHack and a robotic manipulation task. Our method\nimproves task success rate, reduces input size and compute costs, and\ngeneralizes between LLM actors.\n","authors":["Kolby Nottingham","Yasaman Razeghi","Kyungmin Kim","JB Lanier","Pierre Baldi","Roy Fox","Sameer Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03341v3","updated":"2023-07-21T19:11:58Z","published":"2023-06-06T01:26:53Z","title":"Inference-Time Intervention: Eliciting Truthful Answers from a Language\n Model","summary":" We introduce Inference-Time Intervention (ITI), a technique designed to\nenhance the truthfulness of large language models (LLMs). ITI operates by\nshifting model activations during inference, following a set of directions\nacross a limited number of attention heads. This intervention significantly\nimproves the performance of LLaMA models on the TruthfulQA benchmark. On an\ninstruction-finetuned LLaMA called Alpaca, ITI improves its truthfulness from\n32.5% to 65.1%. We identify a tradeoff between truthfulness and helpfulness and\ndemonstrate how to balance it by tuning the intervention strength. ITI is\nminimally invasive and computationally inexpensive. Moreover, the technique is\ndata efficient: while approaches like RLHF require extensive annotations, ITI\nlocates truthful directions using only few hundred examples. 
Our findings\nsuggest that LLMs may have an internal representation of the likelihood of\nsomething being true, even as they produce falsehoods on the surface.\n","authors":["Kenneth Li","Oam Patel","Fernanda Viégas","Hanspeter Pfister","Martin Wattenberg"],"pdf_url":"https://arxiv.org/pdf/2306.03341v3.pdf","comment":"code: https://github.com/likenneth/honest_llama"},{"id":"http://arxiv.org/abs/2307.11865v1","updated":"2023-07-21T19:09:37Z","published":"2023-07-21T19:09:37Z","title":"CARTIER: Cartographic lAnguage Reasoning Targeted at Instruction\n Execution for Robots","summary":" This work explores the capacity of large language models (LLMs) to address\nproblems at the intersection of spatial planning and natural language\ninterfaces for navigation.Our focus is on following relatively complex\ninstructions that are more akin to natural conversation than traditional\nexplicit procedural directives seen in robotics. Unlike most prior work, where\nnavigation directives are provided as imperative commands (e.g., go to the\nfridge), we examine implicit directives within conversational interactions. We\nleverage the 3D simulator AI2Thor to create complex and repeatable scenarios at\nscale, and augment it by adding complex language queries for 40 object types.\nWe demonstrate that a robot can better parse descriptive language queries than\nexisting methods by using an LLM to interpret the user interaction in the\ncontext of a list of the objects in the scene.\n","authors":["Nikhil Kakodkar","Dmitriy Rivkin","Bobak H. Baghi","Francois Hogan","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2307.11865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11864v1","updated":"2023-07-21T19:09:24Z","published":"2023-07-21T19:09:24Z","title":"The Looming Threat of Fake and LLM-generated LinkedIn Profiles:\n Challenges and Opportunities for Detection and Prevention","summary":" In this paper, we present a novel method for detecting fake and Large\nLanguage Model (LLM)-generated profiles in the LinkedIn Online Social Network\nimmediately upon registration and before establishing connections. Early fake\nprofile identification is crucial to maintaining the platform's integrity since\nit prevents imposters from acquiring the private and sensitive information of\nlegitimate users and from gaining an opportunity to increase their credibility\nfor future phishing and scamming activities. This work uses textual information\nprovided in LinkedIn profiles and introduces the Section and Subsection Tag\nEmbedding (SSTE) method to enhance the discriminative characteristics of these\ndata for distinguishing between legitimate profiles and those created by\nimposters manually or by using an LLM. Additionally, the dearth of a large\npublicly available LinkedIn dataset motivated us to collect 3600 LinkedIn\nprofiles for our research. We will release our dataset publicly for research\npurposes. This is, to the best of our knowledge, the first large publicly\navailable LinkedIn dataset for fake LinkedIn account detection. Within our\nparadigm, we assess static and contextualized word embeddings, including GloVe,\nFlair, BERT, and RoBERTa. We show that the suggested method can distinguish\nbetween legitimate and fake profiles with an accuracy of about 95% across all\nword embeddings. 
In addition, we show that SSTE has a promising accuracy for\nidentifying LLM-generated profiles, despite the fact that no LLM-generated\nprofiles were employed during the training phase, and can achieve an accuracy\nof approximately 90% when only 20 LLM-generated profiles are added to the\ntraining set. It is a significant finding since the proliferation of several\nLLMs in the near future makes it extremely challenging to design a single\nsystem that can identify profiles created with various LLMs.\n","authors":["Navid Ayoobi","Sadat Shahriar","Arjun Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2307.11864v1.pdf","comment":"33rd ACM Conference on Hypertext and Social Media (HT '23)"},{"id":"http://arxiv.org/abs/2307.11848v1","updated":"2023-07-21T18:35:24Z","published":"2023-07-21T18:35:24Z","title":"MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through\n Multi-Answer Open-Domain Question Answering","summary":" Check-worthy claim detection aims at providing plausible misinformation to\ndownstream fact-checking systems or human experts to check. This is a crucial\nstep toward accelerating the fact-checking process. Many efforts have been put\ninto how to identify check-worthy claims from a small scale of pre-collected\nclaims, but how to efficiently detect check-worthy claims directly from a\nlarge-scale information source, such as Twitter, remains underexplored. To fill\nthis gap, we introduce MythQA, a new multi-answer open-domain question\nanswering(QA) task that involves contradictory stance mining for query-based\nlarge-scale check-worthy claim detection. The idea behind this is that\ncontradictory claims are a strong indicator of misinformation that merits\nscrutiny by the appropriate authorities. To study this task, we construct\nTweetMythQA, an evaluation dataset containing 522 factoid multi-answer\nquestions based on controversial topics. Each question is annotated with\nmultiple answers. Moreover, we collect relevant tweets for each distinct\nanswer, then classify them into three categories: \"Supporting\", \"Refuting\", and\n\"Neutral\". In total, we annotated 5.3K tweets. Contradictory evidence is\ncollected for all answers in the dataset. Finally, we present a baseline system\nfor MythQA and evaluate existing NLP models for each system component using the\nTweetMythQA dataset. We provide initial benchmarks and identify key challenges\nfor future models to improve upon. Code and data are available at:\nhttps://github.com/TonyBY/Myth-QA\n","authors":["Yang Bai","Anthony Colas","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11848v1.pdf","comment":"Accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2307.11845v1","updated":"2023-07-21T18:29:04Z","published":"2023-07-21T18:29:04Z","title":"Multimodal Document Analytics for Banking Process Automation","summary":" In response to growing FinTech competition and the need for improved\noperational efficiency, this research focuses on understanding the potential of\nadvanced document analytics, particularly using multimodal models, in banking\nprocesses. We perform a comprehensive analysis of the diverse banking document\nlandscape, highlighting the opportunities for efficiency gains through\nautomation and advanced analytics techniques in the customer business. Building\non the rapidly evolving field of natural language processing (NLP), we\nillustrate the potential of models such as LayoutXLM, a cross-lingual,\nmultimodal, pre-trained model, for analyzing diverse documents in the banking\nsector. 
This model performs a text token classification on German company\nregister extracts with an overall F1 score performance of around 80\\%. Our\nempirical evidence confirms the critical role of layout information in\nimproving model performance and further underscores the benefits of integrating\nimage information. Interestingly, our study shows that over 75% F1 score can be\nachieved with only 30% of the training data, demonstrating the efficiency of\nLayoutXLM. Through addressing state-of-the-art document analysis frameworks,\nour study aims to enhance process efficiency and demonstrate the real-world\napplicability and benefits of multimodal models within banking.\n","authors":["Christopher Gerling","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2307.11845v1.pdf","comment":"A Preprint"},{"id":"http://arxiv.org/abs/2307.11795v1","updated":"2023-07-21T08:39:15Z","published":"2023-07-21T08:39:15Z","title":"Prompting Large Language Models with Speech Recognition Abilities","summary":" Large language models have proven themselves highly flexible, able to solve a\nwide range of generative tasks, such as abstractive summarization and\nopen-ended question answering. In this paper we extend the capabilities of LLMs\nby directly attaching a small audio encoder allowing it to perform speech\nrecognition. By directly prepending a sequence of audial embeddings to the text\ntoken embeddings, the LLM can be converted to an automatic speech recognition\n(ASR) system, and be used in the exact same manner as its textual counterpart.\nExperiments on Multilingual LibriSpeech (MLS) show that incorporating a\nconformer encoder into the open sourced LLaMA-7B allows it to outperform\nmonolingual baselines by 18% and perform multilingual speech recognition\ndespite LLaMA being trained overwhelmingly on English text. Furthermore, we\nperform ablation studies to investigate whether the LLM can be completely\nfrozen during training to maintain its original capabilities, scaling up the\naudio encoder, and increasing the audio encoder striding to generate fewer\nembeddings. The results from these studies show that multilingual ASR is\npossible even when the LLM is frozen or when strides of almost 1 second are\nused in the audio encoder opening up the possibility for LLMs to operate on\nlong-form audio.\n","authors":["Yassir Fathullah","Chunyang Wu","Egor Lakomkin","Junteng Jia","Yuan Shangguan","Ke Li","Jinxi Guo","Wenhan Xiong","Jay Mahadeokar","Ozlem Kalinli","Christian Fuegen","Mike Seltzer"],"pdf_url":"https://arxiv.org/pdf/2307.11795v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.11748v1","updated":"2023-07-21T17:58:47Z","published":"2023-07-21T17:58:47Z","title":"BandRe: Rethinking Band-Pass Filters for Scale-Wise Object Detection\n Evaluation","summary":" Scale-wise evaluation of object detectors is important for real-world\napplications. However, existing metrics are either coarse or not sufficiently\nreliable. In this paper, we propose novel scale-wise metrics that strike a\nbalance between fineness and reliability, using a filter bank consisting of\ntriangular and trapezoidal band-pass filters. We conduct experiments with two\nmethods on two datasets and show that the proposed metrics can highlight the\ndifferences between the methods and between the datasets. 
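One plausible way to realize the scale-wise filter-bank idea just described is to weight each detection by a band-pass response over its object scale. The sketch below uses triangular filters with made-up band edges, not the paper's configuration.

```python
import numpy as np

# Hedged sketch: a triangular band-pass weight over object scale (e.g., sqrt of box area).
# The band edges are illustrative, not the values used in the paper.

def triangular_weight(scale: float, lo: float, center: float, hi: float) -> float:
    """Weight in [0, 1]; peaks at `center`, falls to 0 at `lo` and `hi`."""
    if scale <= lo or scale >= hi:
        return 0.0
    if scale <= center:
        return (scale - lo) / (center - lo)
    return (hi - scale) / (hi - center)

# Example filter bank over object scales in pixels (illustrative bands).
bands = [(0, 16, 32), (16, 32, 64), (32, 64, 128), (64, 128, 256)]
for box_scale in (12.0, 40.0, 100.0):
    weights = [triangular_weight(box_scale, *b) for b in bands]
    print(box_scale, [round(w, 2) for w in weights])
```

A scale-wise metric would then accumulate per-detection scores weighted by these band responses, one score per band.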
Code is available at\nhttps://github.com/shinya7y/UniverseNet .\n","authors":["Yosuke Shinya"],"pdf_url":"https://arxiv.org/pdf/2307.11748v1.pdf","comment":"Honorable Mention Solution Award in Small Object Detection Challenge\n for Spotting Birds, International Conference on Machine Vision Applications\n (MVA) 2023"},{"id":"http://arxiv.org/abs/2108.02226v2","updated":"2023-07-21T17:27:10Z","published":"2021-08-04T18:08:28Z","title":"Terabyte-scale supervised 3D training and benchmarking dataset of the\n mouse kidney","summary":" The performance of machine learning algorithms, when used for segmenting 3D\nbiomedical images, does not reach the level expected based on results achieved\nwith 2D photos. This may be explained by the comparative lack of high-volume,\nhigh-quality training datasets, which require state-of-the-art imaging\nfacilities, domain experts for annotation and large computational and personal\nresources. The HR-Kidney dataset presented in this work bridges this gap by\nproviding 1.7 TB of artefact-corrected synchrotron radiation-based X-ray\nphase-contrast microtomography images of whole mouse kidneys and validated\nsegmentations of 33 729 glomeruli, which corresponds to a one to two orders of\nmagnitude increase over currently available biomedical datasets. The image sets\nalso contain the underlying raw data, threshold- and morphology-based\nsemi-automatic segmentations of renal vasculature and uriniferous tubules, as\nwell as true 3D manual annotations. We therewith provide a broad basis for the\nscientific community to build upon and expand in the fields of image\nprocessing, data augmentation and machine learning, in particular unsupervised\nand semi-supervised learning investigations, as well as transfer learning and\ngenerative adversarial networks.\n","authors":["Willy Kuo","Diego Rossinelli","Georg Schulz","Roland H. Wenger","Simone Hieber","Bert Müller","Vartan Kurtcuoglu"],"pdf_url":"https://arxiv.org/pdf/2108.02226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10656v2","updated":"2023-07-21T17:15:24Z","published":"2023-03-19T13:41:59Z","title":"More From Less: Self-Supervised Knowledge Distillation for Routine\n Histopathology Data","summary":" Medical imaging technologies are generating increasingly large amounts of\nhigh-quality, information-dense data. Despite the progress, practical use of\nadvanced imaging technologies for research and diagnosis remains limited by\ncost and availability, so information-sparse data such as H&E stains are relied\non in practice. The study of diseased tissue requires methods which can\nleverage these information-dense data to extract more value from routine,\ninformation-sparse data. Using self-supervised deep learning, we demonstrate\nthat it is possible to distil knowledge during training from information-dense\ndata into models which only require information-sparse data for inference. This\nimproves downstream classification accuracy on information-sparse data, making\nit comparable with the fully-supervised baseline. We find substantial effects\non the learned representations, and this training process identifies subtle\nfeatures which otherwise go undetected. 
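As a rough illustration of distilling information-dense data into a model that only needs information-sparse inputs at inference, the following sketch trains a "student" encoder on the routine image to match a "teacher" embedding of the paired dense modality. The toy encoders and the negative-cosine alignment loss are assumptions for illustration, not the published architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hedged sketch of cross-modal self-supervised distillation: the student only ever sees
# the routine (information-sparse) image and is pulled toward the teacher's embedding of
# the paired information-dense image. Only the student is kept for inference.

def make_encoder(in_ch: int, dim: int = 128) -> nn.Module:
    return nn.Sequential(
        nn.Conv2d(in_ch, 32, 3, stride=2, padding=1), nn.ReLU(),
        nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
        nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, dim),
    )

student = make_encoder(in_ch=3)   # routine image (e.g., an H&E stain)
teacher = make_encoder(in_ch=8)   # information-dense modality, used during training only
opt = torch.optim.Adam(list(student.parameters()) + list(teacher.parameters()), lr=1e-3)

sparse_img = torch.randn(4, 3, 64, 64)   # toy paired batch
dense_img = torch.randn(4, 8, 64, 64)

z_s = F.normalize(student(sparse_img), dim=-1)
z_t = F.normalize(teacher(dense_img), dim=-1)
loss = -(z_s * z_t).sum(dim=-1).mean()   # negative cosine similarity between embeddings
loss.backward()
opt.step()
print(float(loss))
```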
This approach enables the design of\nmodels which require only routine images, but contain insights from\nstate-of-the-art data, allowing better use of the available resources.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2303.10656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11706v1","updated":"2023-07-21T17:02:55Z","published":"2023-07-21T17:02:55Z","title":"3D Skeletonization of Complex Grapevines for Robotic Pruning","summary":" Robotic pruning of dormant grapevines is an area of active research in order\nto promote vine balance and grape quality, but so far robotic efforts have\nlargely focused on planar, simplified vines not representative of commercial\nvineyards. This paper aims to advance the robotic perception capabilities\nnecessary for pruning in denser and more complex vine structures by extending\nplant skeletonization techniques. The proposed pipeline generates skeletal\ngrapevine models that have lower reprojection error and higher connectivity\nthan baseline algorithms. We also show how 3D and skeletal information enables\nprediction accuracy of pruning weight for dense vines surpassing prior work,\nwhere pruning weight is an important vine metric influencing pruning site\nselection.\n","authors":["Eric Schneider","Sushanth Jayanth","Abhisesh Silwal","George Kantor"],"pdf_url":"https://arxiv.org/pdf/2307.11706v1.pdf","comment":"6 pages, IROS 2023 Computer Vision for Automation"},{"id":"http://arxiv.org/abs/2307.11702v1","updated":"2023-07-21T16:56:36Z","published":"2023-07-21T16:56:36Z","title":"SACReg: Scene-Agnostic Coordinate Regression for Visual Localization","summary":" Scene coordinates regression (SCR), i.e., predicting 3D coordinates for every\npixel of a given image, has recently shown promising potential. However,\nexisting methods remain mostly scene-specific or limited to small scenes and\nthus hardly scale to realistic datasets. In this paper, we propose a new\nparadigm where a single generic SCR model is trained once to be then deployed\nto new test scenes, regardless of their scale and without further finetuning.\nFor a given query image, it collects inputs from off-the-shelf image retrieval\ntechniques and Structure-from-Motion databases: a list of relevant database\nimages with sparse pointwise 2D-3D annotations. The model is based on the\ntransformer architecture and can take a variable number of images and sparse\n2D-3D annotations as input. It is trained on a few diverse datasets and\nsignificantly outperforms other scene regression approaches on several\nbenchmarks, including scene-specific models, for visual localization. In\nparticular, we set a new state of the art on the Cambridge localization\nbenchmark, even outperforming feature-matching-based approaches.\n","authors":["Jerome Revaud","Yohann Cabon","Romain Brégier","JongMin Lee","Philippe Weinzaepfel"],"pdf_url":"https://arxiv.org/pdf/2307.11702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06828v3","updated":"2023-07-21T16:54:18Z","published":"2022-11-13T06:03:28Z","title":"Enhancing Few-shot Image Classification with Cosine Transformer","summary":" This paper addresses the few-shot image classification problem, where the\nclassification task is performed on unlabeled query samples given a small\namount of labeled support samples only. One major challenge of the few-shot\nlearning problem is the large variety of object visual appearances that\nprevents the support samples to represent that object comprehensively. 
This\nmight result in a significant difference between support and query samples,\ntherefore undermining the performance of few-shot algorithms. In this paper, we\ntackle the problem by proposing Few-shot Cosine Transformer (FS-CT), where the\nrelational map between supports and queries is effectively obtained for the\nfew-shot tasks. The FS-CT consists of two parts, a learnable prototypical\nembedding network to obtain categorical representations from support samples\nwith hard cases, and a transformer encoder to effectively achieve the\nrelational map from two different support and query samples. We introduce\nCosine Attention, a more robust and stable attention module that enhances the\ntransformer module significantly and therefore improves FS-CT performance from\n5% to over 20% in accuracy compared to the default scaled dot-product\nmechanism. Our method performs competitive results in mini-ImageNet, CUB-200,\nand CIFAR-FS on 1-shot learning and 5-shot learning tasks across backbones and\nfew-shot configurations. We also developed a custom few-shot dataset for Yoga\npose recognition to demonstrate the potential of our algorithm for practical\napplication. Our FS-CT with cosine attention is a lightweight, simple few-shot\nalgorithm that can be applied for a wide range of applications, such as\nhealthcare, medical, and security surveillance. The official implementation\ncode of our Few-shot Cosine Transformer is available at\nhttps://github.com/vinuni-vishc/Few-Shot-Cosine-Transformer\n","authors":["Quang-Huy Nguyen","Cuong Q. Nguyen","Dung D. Le","Hieu H. Pham"],"pdf_url":"https://arxiv.org/pdf/2211.06828v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. 
O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11654v1","updated":"2023-07-21T15:42:01Z","published":"2023-07-21T15:42:01Z","title":"FEDD -- Fair, Efficient, and Diverse Diffusion-based Lesion Segmentation\n and Malignancy Classification","summary":" Skin diseases affect millions of people worldwide, across all ethnicities.\nIncreasing diagnosis accessibility requires fair and accurate segmentation and\nclassification of dermatology images. However, the scarcity of annotated\nmedical images, especially for rare diseases and underrepresented skin tones,\nposes a challenge to the development of fair and accurate models. In this\nstudy, we introduce a Fair, Efficient, and Diverse Diffusion-based framework\nfor skin lesion segmentation and malignancy classification. FEDD leverages\nsemantically meaningful feature embeddings learned through a denoising\ndiffusion probabilistic backbone and processes them via linear probes to\nachieve state-of-the-art performance on Diverse Dermatology Images (DDI). We\nachieve an improvement in intersection over union of 0.18, 0.13, 0.06, and 0.07\nwhile using only 5%, 10%, 15%, and 20% labeled samples, respectively.\nAdditionally, FEDD trained on 10% of DDI demonstrates malignancy classification\naccuracy of 81%, 14% higher compared to the state-of-the-art. We showcase high\nefficiency in data-constrained scenarios while providing fair performance for\ndiverse skin tones and rare malignancy conditions. Our newly annotated DDI\nsegmentation masks and training code can be found on\nhttps://github.com/hectorcarrion/fedd.\n","authors":["Héctor Carrión","Narges Norouzi"],"pdf_url":"https://arxiv.org/pdf/2307.11654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11643v1","updated":"2023-07-21T15:22:32Z","published":"2023-07-21T15:22:32Z","title":"Morphological Image Analysis and Feature Extraction for Reasoning with\n AI-based Defect Detection and Classification Models","summary":" As the use of artificial intelligent (AI) models becomes more prevalent in\nindustries such as engineering and manufacturing, it is essential that these\nmodels provide transparent reasoning behind their predictions. This paper\nproposes the AI-Reasoner, which extracts the morphological characteristics of\ndefects (DefChars) from images and utilises decision trees to reason with the\nDefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.\ncharts) and textual explanations to provide insights into outputs made by\nmasked-based defect detection and classification models. It also provides\neffective mitigation strategies to enhance data pre-processing and overall\nmodel performance. The AI-Reasoner was tested on explaining the outputs of an\nIE Mask R-CNN model using a set of 366 images containing defects. The results\ndemonstrated its effectiveness in explaining the IE Mask R-CNN model's\npredictions. 
Overall, the proposed AI-Reasoner provides a solution for\nimproving the performance of AI models in industrial applications that require\ndefect analysis.\n","authors":["Jiajun Zhang","Georgina Cosma","Sarah Bugby","Axel Finke","Jason Watkins"],"pdf_url":"https://arxiv.org/pdf/2307.11643v1.pdf","comment":"8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series\n on computational intelligence (SSCI)"},{"id":"http://arxiv.org/abs/2307.11638v1","updated":"2023-07-21T15:04:21Z","published":"2023-07-21T15:04:21Z","title":"Deep Reinforcement Learning Based System for Intraoperative\n Hyperspectral Video Autofocusing","summary":" Hyperspectral imaging (HSI) captures a greater level of spectral detail than\ntraditional optical imaging, making it a potentially valuable intraoperative\ntool when precise tissue differentiation is essential. Hardware limitations of\ncurrent optical systems used for handheld real-time video HSI result in a\nlimited focal depth, thereby posing usability issues for integration of the\ntechnology into the operating room. This work integrates a focus-tunable liquid\nlens into a video HSI exoscope, and proposes novel video autofocusing methods\nbased on deep reinforcement learning. A first-of-its-kind robotic focal-time\nscan was performed to create a realistic and reproducible testing dataset. We\nbenchmarked our proposed autofocus algorithm against traditional policies, and\nfound our novel approach to perform significantly ($p<0.05$) better than\ntraditional techniques ($0.070\\pm.098$ mean absolute focal error compared to\n$0.146\\pm.148$). In addition, we performed a blinded usability trial by having\ntwo neurosurgeons compare the system with different autofocus policies, and\nfound our novel approach to be the most favourable, making our system a\ndesirable addition for intraoperative HSI.\n","authors":["Charlie Budd","Jianrong Qiu","Oscar MacCormac","Martin Huber","Christopher Mower","Mirek Janatka","Théo Trotouin","Jonathan Shapey","Mads S. Bergholt","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2307.11638v1.pdf","comment":"To be presented at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11636v1","updated":"2023-07-21T14:58:44Z","published":"2023-07-21T14:58:44Z","title":"OxfordTVG-HIC: Can Machine Make Humorous Captions from Images?","summary":" This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale\ndataset for humour generation and understanding. Humour is an abstract,\nsubjective, and context-dependent cognitive construct involving several\ncognitive factors, making it a challenging task to generate and interpret.\nHence, humour generation and understanding can serve as a new task for\nevaluating the ability of deep-learning methods to process abstract and\nsubjective information. Due to the scarcity of data, humour-related generation\ntasks such as captioning remain under-explored. To address this gap,\nOxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to\ntrain a generalizable humour captioning model. Contrary to existing captioning\ndatasets, OxfordTVG-HIC features a wide range of emotional and semantic\ndiversity resulting in out-of-context examples that are particularly conducive\nto generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive\ncontent. We also show how OxfordTVG-HIC can be leveraged for evaluating the\nhumour of a generated text. 
Through explainability analysis of the trained\nmodels, we identify the visual and linguistic cues influential for evoking\nhumour prediction (and generation). We observe qualitatively that these cues\nare aligned with the benign violation theory of humour in cognitive psychology.\n","authors":["Runjia Li","Shuyang Sun","Mohamed Elhoseiny","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.11636v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.15823v2","updated":"2023-07-21T14:55:21Z","published":"2023-03-28T08:51:15Z","title":"Automated wildlife image classification: An active learning tool for\n ecological applications","summary":" Wildlife camera trap images are being used extensively to investigate animal\nabundance, habitat associations, and behavior, which is complicated by the fact\nthat experts must first classify the images manually. Artificial intelligence\nsystems can take over this task but usually need a large number of\nalready-labeled training images to achieve sufficient performance. This\nrequirement necessitates human expert labor and poses a particular challenge\nfor projects with few cameras or short durations. We propose a label-efficient\nlearning strategy that enables researchers with small or medium-sized image\ndatabases to leverage the potential of modern machine learning, thus freeing\ncrucial resources for subsequent analyses.\n Our methodological proposal is two-fold: (1) We improve current strategies of\ncombining object detection and image classification by tuning the\nhyperparameters of both models. (2) We provide an active learning (AL) system\nthat allows training deep learning models very efficiently in terms of required\nhuman-labeled training images. We supply a software package that enables\nresearchers to use these methods directly and thereby ensure the broad\napplicability of the proposed framework in ecological practice.\n We show that our tuning strategy improves predictive performance. We\ndemonstrate how the AL pipeline reduces the amount of pre-labeled data needed\nto achieve a specific predictive performance and that it is especially valuable\nfor improving out-of-sample predictive performance.\n We conclude that the combination of tuning and AL increases predictive\nperformance substantially. Furthermore, we argue that our work can broadly\nimpact the community through the ready-to-use software package provided.\nFinally, the publication of our models tailored to European wildlife data\nenriches existing model bases mostly trained on data from Africa and North\nAmerica.\n","authors":["Ludwig Bothmann","Lisa Wimmer","Omid Charrakh","Tobias Weber","Hendrik Edelhoff","Wibke Peters","Hien Nguyen","Caryl Benjamin","Annette Menzel"],"pdf_url":"https://arxiv.org/pdf/2303.15823v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03056v3","updated":"2023-07-21T14:45:20Z","published":"2023-03-06T11:59:13Z","title":"MOISST: Multimodal Optimization of Implicit Scene for SpatioTemporal\n calibration","summary":" With the recent advances in autonomous driving and the decreasing cost of\nLiDARs, the use of multimodal sensor systems is on the rise. However, in order\nto make use of the information provided by a variety of complimentary sensors,\nit is necessary to accurately calibrate them. We take advantage of recent\nadvances in computer graphics and implicit volumetric scene representation to\ntackle the problem of multi-sensor spatial and temporal calibration. 
Thanks to\na new formulation of the Neural Radiance Field (NeRF) optimization, we are able\nto jointly optimize calibration parameters along with scene representation\nbased on radiometric and geometric measurements. Our method enables accurate\nand robust calibration from data captured in uncontrolled and unstructured\nurban environments, making our solution more scalable than existing calibration\nsolutions. We demonstrate the accuracy and robustness of our method in urban\nscenes typically encountered in autonomous driving scenarios.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2303.03056v3.pdf","comment":"Accepted at IROS2023 Project site: https://qherau.github.io/MOISST/"},{"id":"http://arxiv.org/abs/2307.11618v1","updated":"2023-07-21T14:37:17Z","published":"2023-07-21T14:37:17Z","title":"Divide and Adapt: Active Domain Adaptation via Customized Learning","summary":" Active domain adaptation (ADA) aims to improve the model adaptation\nperformance by incorporating active learning (AL) techniques to label a\nmaximally-informative subset of target samples. Conventional AL methods do not\nconsider the existence of domain shift, and hence, fail to identify the truly\nvaluable samples in the context of domain adaptation. To accommodate active\nlearning and domain adaption, the two naturally different tasks, in a\ncollaborative framework, we advocate that a customized learning strategy for\nthe target data is the key to the success of ADA solutions. We present\nDivide-and-Adapt (DiaNA), a new ADA framework that partitions the target\ninstances into four categories with stratified transferable properties. With a\nnovel data subdivision protocol based on uncertainty and domainness, DiaNA can\naccurately recognize the most gainful samples. While sending the informative\ninstances for annotation, DiaNA employs tailored learning strategies for the\nremaining categories. Furthermore, we propose an informativeness score that\nunifies the data partitioning criteria. This enables the use of a Gaussian\nmixture model (GMM) to automatically sample unlabeled data into the proposed\nfour categories. Thanks to the \"divideand-adapt\" spirit, DiaNA can handle data\nwith large variations of domain gap. In addition, we show that DiaNA can\ngeneralize to different domain adaptation settings, such as unsupervised domain\nadaptation (UDA), semi-supervised domain adaptation (SSDA), source-free domain\nadaptation (SFDA), etc.\n","authors":["Duojun Huang","Jichang Li","Weikai Chen","Junshi Huang","Zhenhua Chai","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11618v1.pdf","comment":"CVPR2023, Highlight paper"},{"id":"http://arxiv.org/abs/2307.11604v1","updated":"2023-07-21T14:14:29Z","published":"2023-07-21T14:14:29Z","title":"Consistency-guided Meta-Learning for Bootstrapping Semi-Supervised\n Medical Image Segmentation","summary":" Medical imaging has witnessed remarkable progress but usually requires a\nlarge amount of high-quality annotated data which is time-consuming and costly\nto obtain. To alleviate this burden, semi-supervised learning has garnered\nattention as a potential solution. In this paper, we present Meta-Learning for\nBootstrapping Medical Image Segmentation (MLB-Seg), a novel method for tackling\nthe challenge of semi-supervised medical image segmentation. 
Specifically, our\napproach first involves training a segmentation model on a small set of clean\nlabeled images to generate initial labels for unlabeled data. To further\noptimize this bootstrapping process, we introduce a per-pixel weight mapping\nsystem that dynamically assigns weights to both the initialized labels and the\nmodel's own predictions. These weights are determined using a meta-process that\nprioritizes pixels with loss gradient directions closer to those of clean data,\nwhich is based on a small set of precisely annotated images. To facilitate the\nmeta-learning process, we additionally introduce a consistency-based Pseudo\nLabel Enhancement (PLE) scheme that improves the quality of the model's own\npredictions by ensembling predictions from various augmented versions of the\nsame input. In order to improve the quality of the weight maps obtained through\nmultiple augmentations of a single input, we introduce a mean teacher into the\nPLE scheme. This method helps to reduce noise in the weight maps and stabilize\nits generation process. Our extensive experimental results on public atrial and\nprostate segmentation datasets demonstrate that our proposed method achieves\nstate-of-the-art results under semi-supervision. Our code is available at\nhttps://github.com/aijinrjinr/MLB-Seg.\n","authors":["Qingyue Wei","Lequan Yu","Xianhang Li","Wei Shao","Cihang Xie","Lei Xing","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11604v1.pdf","comment":"Accepted to MICCAI 2023. Code is publicly available at\n https://github.com/aijinrjinr/MLB-Seg"},{"id":"http://arxiv.org/abs/2307.11603v1","updated":"2023-07-21T14:12:28Z","published":"2023-07-21T14:12:28Z","title":"Cascaded multitask U-Net using topological loss for vessel segmentation\n and centerline extraction","summary":" Vessel segmentation and centerline extraction are two crucial preliminary\ntasks for many computer-aided diagnosis tools dealing with vascular diseases.\nRecently, deep-learning based methods have been widely applied to these tasks.\nHowever, classic deep-learning approaches struggle to capture the complex\ngeometry and specific topology of vascular networks, which is of the utmost\nimportance in most applications. To overcome these limitations, the clDice\nloss, a topological loss that focuses on the vessel centerlines, has been\nrecently proposed. This loss requires computing, with a proposed soft-skeleton\nalgorithm, the skeletons of both the ground truth and the predicted\nsegmentation. However, the soft-skeleton algorithm provides suboptimal results\non 3D images, which makes the clDice hardly suitable on 3D images. In this\npaper, we propose to replace the soft-skeleton algorithm by a U-Net which\ncomputes the vascular skeleton directly from the segmentation. We show that our\nmethod provides more accurate skeletons than the soft-skeleton algorithm. We\nthen build upon this network a cascaded U-Net trained with the clDice loss to\nembed topological constraints during the segmentation. 
The resulting model is\nable to predict both the vessel segmentation and centerlines with a more\naccurate topology.\n","authors":["Pierre Rougé","Nicolas Passat","Odyssée Merveille"],"pdf_url":"https://arxiv.org/pdf/2307.11603v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.18453v2","updated":"2023-07-21T13:26:21Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Medical Image Synthesis","summary":" The demand for artificial intelligence (AI) in healthcare is rapidly\nincreasing. However, significant challenges arise from data scarcity and\nprivacy concerns, particularly in medical imaging. While existing generative\nmodels have achieved success in image synthesis and image-to-image translation\ntasks, there remains a gap in the generation of 3D semantic medical images. To\naddress this gap, we introduce Med-DDPM, a diffusion model specifically\ndesigned for semantic 3D medical image synthesis, effectively tackling data\nscarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation\nof semantic conditioning, enabling precise control during the image generation\nprocess. Our model outperforms Generative Adversarial Networks (GANs) in terms\nof stability and performance, generating diverse and anatomically coherent\nimages with high visual fidelity. Comparative analysis against state-of-the-art\naugmentation techniques demonstrates that Med-DDPM produces comparable results,\nhighlighting its potential as a data augmentation tool for enhancing model\naccuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis\nby delivering high-quality and anatomically coherent images. Furthermore, the\nintegration of semantic conditioning with Med-DDPM holds promise for image\nanonymization in the field of biomedical imaging, showcasing the capabilities\nof the model in addressing challenges related to data scarcity and privacy\nconcerns.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11567v1","updated":"2023-07-21T13:18:43Z","published":"2023-07-21T13:18:43Z","title":"CortexMorph: fast cortical thickness estimation via diffeomorphic\n registration using VoxelMorph","summary":" The thickness of the cortical band is linked to various neurological and\npsychiatric conditions, and is often estimated through surface-based methods\nsuch as Freesurfer in MRI studies. The DiReCT method, which calculates cortical\nthickness using a diffeomorphic deformation of the gray-white matter interface\ntowards the pial surface, offers an alternative to surface-based methods.\nRecent studies using a synthetic cortical thickness phantom have demonstrated\nthat the combination of DiReCT and deep-learning-based segmentation is more\nsensitive to subvoxel cortical thinning than Freesurfer.\n While anatomical segmentation of a T1-weighted image now takes seconds,\nexisting implementations of DiReCT rely on iterative image registration methods\nwhich can take up to an hour per volume. On the other hand, learning-based\ndeformable image registration methods like VoxelMorph have been shown to be\nfaster than classical methods while improving registration accuracy. This paper\nproposes CortexMorph, a new method that employs unsupervised deep learning to\ndirectly regress the deformation field needed for DiReCT. 
By combining\nCortexMorph with a deep-learning-based segmentation model, it is possible to\nestimate region-wise thickness in seconds from a T1-weighted image, while\nmaintaining the ability to detect cortical atrophy. We validate this claim on\nthe OASIS-3 dataset and the synthetic cortical thickness phantom of Rusak et\nal.\n","authors":["Richard McKinley","Christian Rummel"],"pdf_url":"https://arxiv.org/pdf/2307.11567v1.pdf","comment":"Accepted (early acceptance) at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11558v1","updated":"2023-07-21T13:06:02Z","published":"2023-07-21T13:06:02Z","title":"Advancing Visual Grounding with Scene Knowledge: Benchmark and Method","summary":" Visual grounding (VG) aims to establish fine-grained alignment between vision\nand language. Ideally, it can be a testbed for vision-and-language models to\nevaluate their understanding of the images and texts and their reasoning\nabilities over their joint space. However, most existing VG datasets are\nconstructed using simple description texts, which do not require sufficient\nreasoning over the images and texts. This has been demonstrated in a recent\nstudy~\\cite{luo2022goes}, where a simple LSTM-based text encoder without\npretraining can achieve state-of-the-art performance on mainstream VG datasets.\nTherefore, in this paper, we propose a novel benchmark of \\underline{S}cene\n\\underline{K}nowledge-guided \\underline{V}isual \\underline{G}rounding (SK-VG),\nwhere the image content and referring expressions are not sufficient to ground\nthe target objects, forcing the models to have a reasoning ability on the\nlong-form scene knowledge. To perform this task, we propose two approaches to\naccept the triple-type input, where the former embeds knowledge into the image\nfeatures before the image-query interaction; the latter leverages linguistic\nstructure to assist in computing the image-text matching. We conduct extensive\nexperiments to analyze the above methods and show that the proposed approaches\nachieve promising results but still leave room for improvement, including\nperformance and interpretability. The dataset and code are available at\n\\url{https://github.com/zhjohnchan/SK-VG}.\n","authors":["Zhihong Chen","Ruifei Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11558v1.pdf","comment":"Computer Vision and Natural Language Processing. 21 pages, 14\n figures. CVPR-2023"},{"id":"http://arxiv.org/abs/2307.11550v1","updated":"2023-07-21T12:53:54Z","published":"2023-07-21T12:53:54Z","title":"YOLOPose V2: Understanding and Improving Transformer-based 6D Pose\n Estimation","summary":" 6D object pose estimation is a crucial prerequisite for autonomous robot\nmanipulation applications. The state-of-the-art models for pose estimation are\nconvolutional neural network (CNN)-based. Lately, Transformers, an architecture\noriginally proposed for natural language processing, is achieving\nstate-of-the-art results in many computer vision tasks as well. Equipped with\nthe multi-head self-attention mechanism, Transformers enable simple\nsingle-stage end-to-end architectures for learning object detection and 6D\nobject pose estimation jointly. In this work, we propose YOLOPose (short form\nfor You Only Look Once Pose estimation), a Transformer-based multi-object 6D\npose estimation method based on keypoint regression and an improved variant of\nthe YOLOPose model. In contrast to the standard heatmaps for predicting\nkeypoints in an image, we directly regress the keypoints. 
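Direct keypoint regression of this kind can be pictured as a small head mapping each object-query embedding to normalized (x, y) coordinates. The dimensions and head design below are illustrative, not the YOLOPose configuration.

```python
import torch
import torch.nn as nn

# Hedged sketch of regressing keypoints directly from per-object query embeddings,
# rather than predicting heatmaps.

num_keypoints, embed_dim = 8, 256

keypoint_head = nn.Sequential(
    nn.Linear(embed_dim, embed_dim), nn.ReLU(),
    nn.Linear(embed_dim, 2 * num_keypoints), nn.Sigmoid(),  # (x, y) in [0, 1] per keypoint
)

queries = torch.randn(4, 100, embed_dim)        # 4 images, 100 object queries each
keypoints = keypoint_head(queries).view(4, 100, num_keypoints, 2)
print(keypoints.shape)                           # torch.Size([4, 100, 8, 2])
```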
Additionally, we\nemploy a learnable orientation estimation module to predict the orientation\nfrom the keypoints. Along with a separate translation estimation module, our\nmodel is end-to-end differentiable. Our method is suitable for real-time\napplications and achieves results comparable to state-of-the-art methods. We\nanalyze the role of object queries in our architecture and reveal that the\nobject queries specialize in detecting objects in specific image regions.\nFurthermore, we quantify the accuracy trade-off of using datasets of smaller\nsizes to train our model.\n","authors":["Arul Selvam Periyasamy","Arash Amini","Vladimir Tsaturyan","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2307.11550v1.pdf","comment":"Robotics and Autonomous Systems Journal, Elsevier, to appear 2023.\n arXiv admin note: substantial text overlap with arXiv:2205.02536"},{"id":"http://arxiv.org/abs/2307.11545v1","updated":"2023-07-21T12:46:15Z","published":"2023-07-21T12:46:15Z","title":"Bridging Vision and Language Encoders: Parameter-Efficient Tuning for\n Referring Image Segmentation","summary":" Parameter Efficient Tuning (PET) has gained attention for reducing the number\nof parameters while maintaining performance and providing better hardware\nresource savings, but few studies investigate dense prediction tasks and\ninteraction between modalities. In this paper, we do an investigation of\nefficient tuning problems on referring image segmentation. We propose a novel\nadapter called Bridger to facilitate cross-modal information exchange and\ninject task-specific information into the pre-trained model. We also design a\nlightweight decoder for image segmentation. Our approach achieves comparable or\nsuperior performance with only 1.61\\% to 3.38\\% backbone parameter updates,\nevaluated on challenging benchmarks. The code is available at\n\\url{https://github.com/kkakkkka/ETRIS}.\n","authors":["Zunnan Xu","Zhihong Chen","Yong Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11545v1.pdf","comment":"Computer Vision and Natural Language Processing. 14 pages, 8 figures.\n ICCV-2023"},{"id":"http://arxiv.org/abs/2303.11057v3","updated":"2023-07-21T12:43:23Z","published":"2023-03-20T12:14:13Z","title":"Learning Foresightful Dense Visual Affordance for Deformable Object\n Manipulation","summary":" Understanding and manipulating deformable objects (e.g., ropes and fabrics)\nis an essential yet challenging task with broad applications. Difficulties come\nfrom complex states and dynamics, diverse configurations and high-dimensional\naction space of deformable objects. Besides, the manipulation tasks usually\nrequire multiple steps to accomplish, and greedy policies may easily lead to\nlocal optimal states. Existing studies usually tackle this problem using\nreinforcement learning or imitating expert demonstrations, with limitations in\nmodeling complex states or requiring hand-crafted expert policies. In this\npaper, we study deformable object manipulation using dense visual affordance,\nwith generalization towards diverse states, and propose a novel kind of\nforesightful dense affordance, which avoids local optima by estimating states'\nvalues for long-term manipulation. We propose a framework for learning this\nrepresentation, with novel designs such as multi-stage stable learning and\nefficient self-supervised data collection without experts. 
Experiments\ndemonstrate the superiority of our proposed foresightful dense affordance.\nProject page: https://hyperplane-lab.github.io/DeformableAffordance\n","authors":["Ruihai Wu","Chuanruo Ning","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2303.11057v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11543v1","updated":"2023-07-21T12:43:07Z","published":"2023-07-21T12:43:07Z","title":"KVN: Keypoints Voting Network with Differentiable RANSAC for Stereo Pose\n Estimation","summary":" Object pose estimation is a fundamental computer vision task exploited in\nseveral robotics and augmented reality applications. Many established\napproaches rely on predicting 2D-3D keypoint correspondences using RANSAC\n(Random sample consensus) and estimating the object pose using the PnP\n(Perspective-n-Point) algorithm. Being RANSAC non-differentiable,\ncorrespondences cannot be directly learned in an end-to-end fashion. In this\npaper, we address the stereo image-based object pose estimation problem by (i)\nintroducing a differentiable RANSAC layer into a well-known monocular pose\nestimation network; (ii) exploiting an uncertainty-driven multi-view PnP solver\nwhich can fuse information from multiple views. We evaluate our approach on a\nchallenging public stereo object pose estimation dataset, yielding\nstate-of-the-art results against other recent approaches. Furthermore, in our\nablation study, we show that the differentiable RANSAC layer plays a\nsignificant role in the accuracy of the proposed method. We release with this\npaper the open-source implementation of our method.\n","authors":["Ivano Donadi","Alberto Pretto"],"pdf_url":"https://arxiv.org/pdf/2307.11543v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2307.11530v1","updated":"2023-07-21T12:23:39Z","published":"2023-07-21T12:23:39Z","title":"UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle\n Transformation Multi-scale GAN","summary":" Fundus photography is an essential examination for clinical and differential\ndiagnosis of fundus diseases. Recently, Ultra-Wide-angle Fundus (UWF)\ntechniques, UWF Fluorescein Angiography (UWF-FA) and UWF Scanning Laser\nOphthalmoscopy (UWF-SLO) have been gradually put into use. However, Fluorescein\nAngiography (FA) and UWF-FA require injecting sodium fluorescein which may have\ndetrimental influences. To avoid negative impacts, cross-modality medical image\ngeneration algorithms have been proposed. Nevertheless, current methods in\nfundus imaging could not produce high-resolution images and are unable to\ncapture tiny vascular lesion areas. This paper proposes a novel conditional\ngenerative adversarial network (UWAT-GAN) to synthesize UWF-FA from UWF-SLO.\nUsing multi-scale generators and a fusion module patch to better extract global\nand local information, our model can generate high-resolution images. Moreover,\nan attention transmit module is proposed to help the decoder learn effectively.\nBesides, a supervised approach is used to train the network using multiple new\nweighted losses on different scales of data. Experiments on an in-house UWF\nimage dataset demonstrate the superiority of the UWAT-GAN over the\nstate-of-the-art methods. 
The source code is available at:\nhttps://github.com/Tinysqua/UWAT-GAN.\n","authors":["Zhaojie Fang","Zhanghao Chen","Pengxue Wei","Wangting Li","Shaochong Zhang","Ahmed Elazab","Gangyong Jia","Ruiquan Ge","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11530v1.pdf","comment":"26th International Conference on Medical Image Computing and Computer\n Assisted Intervention"},{"id":"http://arxiv.org/abs/2307.11528v1","updated":"2023-07-21T12:18:35Z","published":"2023-07-21T12:18:35Z","title":"Improving Viewpoint Robustness for Visual Recognition via Adversarial\n Training","summary":" Viewpoint invariance remains challenging for visual recognition in the 3D\nworld, as altering the viewing directions can significantly impact predictions\nfor the same object. While substantial efforts have been dedicated to making\nneural networks invariant to 2D image translations and rotations, viewpoint\ninvariance is rarely investigated. Motivated by the success of adversarial\ntraining in enhancing model robustness, we propose Viewpoint-Invariant\nAdversarial Training (VIAT) to improve the viewpoint robustness of image\nclassifiers. Regarding viewpoint transformation as an attack, we formulate VIAT\nas a minimax optimization problem, where the inner maximization characterizes\ndiverse adversarial viewpoints by learning a Gaussian mixture distribution\nbased on the proposed attack method GMVFool. The outer minimization obtains a\nviewpoint-invariant classifier by minimizing the expected loss over the\nworst-case viewpoint distributions that can share the same one for different\nobjects within the same category. Based on GMVFool, we contribute a large-scale\ndataset called ImageNet-V+ to benchmark viewpoint robustness. Experimental\nresults show that VIAT significantly improves the viewpoint robustness of\nvarious image classifiers based on the diversity of adversarial viewpoints\ngenerated by GMVFool. Furthermore, we propose ViewRS, a certified viewpoint\nrobustness method that provides a certified radius and accuracy to demonstrate\nthe effectiveness of VIAT from the theoretical perspective.\n","authors":["Shouwei Ruan","Yinpeng Dong","Hang Su","Jianteng Peng","Ning Chen","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2307.11528v1.pdf","comment":"14 pages, 12 figures. arXiv admin note: substantial text overlap with\n arXiv:2307.10235"},{"id":"http://arxiv.org/abs/2303.11630v2","updated":"2023-07-21T12:15:41Z","published":"2023-03-21T06:54:18Z","title":"BoxSnake: Polygonal Instance Segmentation with Box Supervision","summary":" Box-supervised instance segmentation has gained much attention as it requires\nonly simple box annotations instead of costly mask or polygon annotations.\nHowever, existing box-supervised instance segmentation models mainly focus on\nmask-based frameworks. We propose a new end-to-end training technique, termed\nBoxSnake, to achieve effective polygonal instance segmentation using only box\nannotations for the first time. Our method consists of two loss functions: (1)\na point-based unary loss that constrains the bounding box of predicted polygons\nto achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss\nthat encourages the predicted polygons to fit the object boundaries. 
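The coarse, point-based unary constraint can be pictured as pulling the tight bounding box of the predicted polygon toward the annotated box. The sketch below is only that coarse term; the distance-aware pairwise boundary loss is omitted and the exact formulation in the paper may differ.

```python
import torch

# Hedged sketch of a box-supervised unary term: the bounding box of the predicted
# polygon vertices should match the ground-truth box annotation.

def polygon_box_loss(polygon: torch.Tensor, gt_box: torch.Tensor) -> torch.Tensor:
    """polygon: (N, 2) predicted vertices (x, y); gt_box: (4,) as (x1, y1, x2, y2)."""
    pred_box = torch.cat([polygon.min(dim=0).values, polygon.max(dim=0).values])
    return torch.abs(pred_box - gt_box).mean()      # L1 between the two boxes

polygon = torch.tensor([[12.0, 8.0], [40.0, 10.0], [38.0, 30.0], [15.0, 28.0]],
                       requires_grad=True)
gt_box = torch.tensor([10.0, 8.0, 42.0, 32.0])
loss = polygon_box_loss(polygon, gt_box)
loss.backward()                                      # gradients flow to the polygon vertices
print(float(loss), polygon.grad.shape)
```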
Compared\nwith the mask-based weakly-supervised methods, BoxSnake further reduces the\nperformance gap between the predicted segmentation and the bounding box, and\nshows significant superiority on the Cityscapes dataset.\n","authors":["Rui Yang","Lin Song","Yixiao Ge","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2303.11630v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.06666v2","updated":"2023-07-21T12:15:16Z","published":"2023-07-13T10:19:04Z","title":"Transformer-based end-to-end classification of variable-length\n volumetric data","summary":" The automatic classification of 3D medical data is memory-intensive. Also,\nvariations in the number of slices between samples is common. Na\\\"ive solutions\nsuch as subsampling can solve these problems, but at the cost of potentially\neliminating relevant diagnosis information. Transformers have shown promising\nperformance for sequential data analysis. However, their application for long\nsequences is data, computationally, and memory demanding. In this paper, we\npropose an end-to-end Transformer-based framework that allows to classify\nvolumetric data of variable length in an efficient fashion. Particularly, by\nrandomizing the input volume-wise resolution(#slices) during training, we\nenhance the capacity of the learnable positional embedding assigned to each\nvolume slice. Consequently, the accumulated positional information in each\npositional embedding can be generalized to the neighbouring slices, even for\nhigh-resolution volumes at the test time. By doing so, the model will be more\nrobust to variable volume length and amenable to different computational\nbudgets. We evaluated the proposed approach in retinal OCT volume\nclassification and achieved 21.96% average improvement in balanced accuracy on\na 9-class diagnostic task, compared to state-of-the-art video transformers. Our\nfindings show that varying the volume-wise resolution of the input during\ntraining results in more informative volume representation as compared to\ntraining with fixed number of slices per volume.\n","authors":["Marzieh Oghbaie","Teresa Araujo","Taha Emre","Ursula Schmidt-Erfurth","Hrvoje Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2307.06666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11526v1","updated":"2023-07-21T12:14:33Z","published":"2023-07-21T12:14:33Z","title":"CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have the potential to be a major representation\nof media. Since training a NeRF has never been an easy task, the protection of\nits model copyright should be a priority. In this paper, by analyzing the pros\nand cons of possible copyright protection solutions, we propose to protect the\ncopyright of NeRF models by replacing the original color representation in NeRF\nwith a watermarked color representation. Then, a distortion-resistant rendering\nscheme is designed to guarantee robust message extraction in 2D renderings of\nNeRF. 
Our proposed method can directly protect the copyright of NeRF models\nwhile maintaining high rendering quality and bit accuracy when compared among\noptional solutions.\n","authors":["Ziyuan Luo","Qing Guo","Ka Chun Cheung","Simon See","Renjie Wan"],"pdf_url":"https://arxiv.org/pdf/2307.11526v1.pdf","comment":"11 pages, 6 figures, accepted by iccv 2023 non-camera-ready version"},{"id":"http://arxiv.org/abs/2304.14133v2","updated":"2023-07-21T12:06:17Z","published":"2023-04-27T12:28:29Z","title":"VERITE: A Robust Benchmark for Multimodal Misinformation Detection\n Accounting for Unimodal Bias","summary":" Multimedia content has become ubiquitous on social media platforms, leading\nto the rise of multimodal misinformation (MM) and the urgent need for effective\nstrategies to detect and prevent its spread. In recent years, the challenge of\nmultimodal misinformation detection (MMD) has garnered significant attention by\nresearchers and has mainly involved the creation of annotated, weakly\nannotated, or synthetically generated training datasets, along with the\ndevelopment of various deep learning MMD models. However, the problem of\nunimodal bias in MMD benchmarks -- where biased or unimodal methods outperform\ntheir multimodal counterparts on an inherently multimodal task -- has been\noverlooked. In this study, we systematically investigate and identify the\npresence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS),\nraising concerns about their suitability for reliable evaluation. To address\nthis issue, we introduce the \"VERification of Image-TExtpairs\" (VERITE)\nbenchmark for MMD which incorporates real-world data, excludes \"asymmetric\nmultimodal misinformation\" and utilizes \"modality balancing\". We conduct an\nextensive comparative study with a Transformer-based architecture that shows\nthe ability of VERITE to effectively address unimodal bias, rendering it a\nrobust evaluation framework for MMD. Furthermore, we introduce a new method --\ntermed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating\nrealistic synthetic training data that preserve crossmodal relations between\nlegitimate images and false human-written captions. By leveraging CHASMA in the\ntraining process, we observe consistent and notable improvements in predictive\nperformance on VERITE; with a 9.2% increase in accuracy. We release our code\nat: https://github.com/stevejpapad/image-text-verification\n","authors":["Stefanos-Iordanis Papadopoulos","Christos Koutlis","Symeon Papadopoulos","Panagiotis C. Petrantonakis"],"pdf_url":"https://arxiv.org/pdf/2304.14133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11518v1","updated":"2023-07-21T12:03:39Z","published":"2023-07-21T12:03:39Z","title":"BatMobility: Towards Flying Without Seeing for Autonomous Drones","summary":" Unmanned aerial vehicles (UAVs) rely on optical sensors such as cameras and\nlidar for autonomous operation. However, such optical sensors are error-prone\nin bad lighting, inclement weather conditions including fog and smoke, and\naround textureless or transparent surfaces. In this paper, we ask: is it\npossible to fly UAVs without relying on optical sensors, i.e., can UAVs fly\nwithout seeing? 
We present BatMobility, a lightweight mmWave radar-only\nperception system for UAVs that eliminates the need for optical sensors.\nBatMobility enables two core functionalities for UAVs -- radio flow estimation\n(a novel FMCW radar-based alternative for optical flow based on\nsurface-parallel doppler shift) and radar-based collision avoidance. We build\nBatMobility using commodity sensors and deploy it as a real-time system on a\nsmall off-the-shelf quadcopter running an unmodified flight controller. Our\nevaluation shows that BatMobility achieves comparable or better performance\nthan commercial-grade optical sensors across a wide range of scenarios.\n","authors":["Emerson Sie","Zikun Liu","Deepak Vasisht"],"pdf_url":"https://arxiv.org/pdf/2307.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07308v3","updated":"2023-07-21T11:52:28Z","published":"2023-06-12T13:48:37Z","title":"Self-Supervised Hyperspectral Inpainting with the Optimisation inspired\n Deep Neural Network Prior","summary":" Hyperspectral Image (HSI)s cover hundreds or thousands of narrow spectral\nbands, conveying a wealth of spatial and spectral information. However, due to\nthe instrumental errors and the atmospheric changes, the HSI obtained in\npractice are often contaminated by noise and dead pixels(lines), resulting in\nmissing information that may severely compromise the subsequent applications.\nWe introduce here a novel HSI missing pixel prediction algorithm, called Low\nRank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP\nis able to predict missing pixels and bands even when all spectral bands of the\nimage are missing. The proposed LRS-PnP algorithm is further extended to a\nself-supervised model by combining the LRS-PnP with the Deep Image Prior (DIP),\ncalled LRS-PnP-DIP. In a series of experiments with real data, It is shown that\nthe LRS-PnP-DIP either achieves state-of-the-art inpainting performance\ncompared to other learning-based methods, or outperforms them.\n","authors":["Shuo Li","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2306.07308v3.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2208.05788v2","updated":"2023-07-21T11:50:11Z","published":"2022-08-10T12:29:01Z","title":"Semantic Self-adaptation: Enhancing Generalization with a Single Sample","summary":" The lack of out-of-domain generalization is a critical weakness of deep\nnetworks for semantic segmentation. Previous studies relied on the assumption\nof a static model, i. e., once the training process is complete, model\nparameters remain fixed at test time. In this work, we challenge this premise\nwith a self-adaptive approach for semantic segmentation that adjusts the\ninference process to each input sample. Self-adaptation operates on two levels.\nFirst, it fine-tunes the parameters of convolutional layers to the input image\nusing consistency regularization. Second, in Batch Normalization layers,\nself-adaptation interpolates between the training and the reference\ndistribution derived from a single test sample. Despite both techniques being\nwell known in the literature, their combination sets new state-of-the-art\naccuracy on synthetic-to-real generalization benchmarks. Our empirical study\nsuggests that self-adaptation may complement the established practice of model\nregularization at training time for improving deep network generalization to\nout-of-domain data. 
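The Batch Normalization part of this self-adaptation can be sketched as blending each BN layer's stored training statistics with statistics computed from the single test sample. The tiny network, the blending factor `alpha`, and the omission of the consistency-regularized fine-tuning of convolutional layers are simplifications for illustration.

```python
import torch
import torch.nn as nn

# Hedged sketch: interpolate BatchNorm running statistics between the training
# distribution and the statistics of one test sample, then run inference.

@torch.no_grad()
def blend_bn_stats(model: nn.Sequential, x: torch.Tensor, alpha: float = 0.5):
    """Blend each BN layer's running stats with stats of the single test sample."""
    feats = x
    for m in model:
        if isinstance(m, nn.BatchNorm2d):
            mean = feats.mean(dim=(0, 2, 3))
            var = feats.var(dim=(0, 2, 3), unbiased=False)
            m.running_mean.mul_(1 - alpha).add_(alpha * mean)
            m.running_var.mul_(1 - alpha).add_(alpha * var)
        feats = m(feats)

model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
    nn.Conv2d(16, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU(),
).eval()

sample = torch.randn(1, 3, 64, 64)     # a single test image
blend_bn_stats(model, sample, alpha=0.5)
logits = model(sample)                  # inference now uses the blended statistics
print(logits.shape)
```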
Our code and pre-trained models are available at\nhttps://github.com/visinf/self-adaptive.\n","authors":["Sherwin Bahmani","Oliver Hahn","Eduard Zamfir","Nikita Araslanov","Daniel Cremers","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2208.05788v2.pdf","comment":"Published in TMLR (July 2023); OpenReview:\n https://openreview.net/forum?id=ILNqQhGbLx; Code:\n https://github.com/visinf/self-adaptive; Video: https://youtu.be/s4DG65ic0EA"},{"id":"http://arxiv.org/abs/2307.11514v1","updated":"2023-07-21T11:50:05Z","published":"2023-07-21T11:50:05Z","title":"CORE: Cooperative Reconstruction for Multi-Agent Perception","summary":" This paper presents CORE, a conceptually simple, effective and\ncommunication-efficient model for multi-agent cooperative perception. It\naddresses the task from a novel perspective of cooperative reconstruction,\nbased on two key insights: 1) cooperating agents together provide a more\nholistic observation of the environment, and 2) the holistic observation can\nserve as valuable supervision to explicitly guide the model learning how to\nreconstruct the ideal observation based on collaboration. CORE instantiates the\nidea with three major components: a compressor for each agent to create more\ncompact feature representation for efficient broadcasting, a lightweight\nattentive collaboration component for cross-agent message aggregation, and a\nreconstruction module to reconstruct the observation based on aggregated\nfeature representations. This learning-to-reconstruct idea is task-agnostic,\nand offers clear and reasonable supervision to inspire more effective\ncollaboration, eventually promoting perception tasks. We validate CORE on\nOPV2V, a large-scale multi-agent percetion dataset, in two tasks, i.e., 3D\nobject detection and semantic segmentation. Results demonstrate that the model\nachieves state-of-the-art performance on both tasks, and is more\ncommunication-efficient.\n","authors":["Binglu Wang","Lei Zhang","Zhaozhong Wang","Yongqiang Zhao","Tianfei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11513v1","updated":"2023-07-21T11:49:30Z","published":"2023-07-21T11:49:30Z","title":"Bone mineral density estimation from a plain X-ray image by learning\n decomposition into projections of bone-segmented computed tomography","summary":" Osteoporosis is a prevalent bone disease that causes fractures in fragile\nbones, leading to a decline in daily living activities. Dual-energy X-ray\nabsorptiometry (DXA) and quantitative computed tomography (QCT) are highly\naccurate for diagnosing osteoporosis; however, these modalities require special\nequipment and scan protocols. To frequently monitor bone health, low-cost,\nlow-dose, and ubiquitously available diagnostic methods are highly anticipated.\nIn this study, we aim to perform bone mineral density (BMD) estimation from a\nplain X-ray image for opportunistic screening, which is potentially useful for\nearly diagnosis. Existing methods have used multi-stage approaches consisting\nof extraction of the region of interest and simple regression to estimate BMD,\nwhich require a large amount of training data. Therefore, we propose an\nefficient method that learns decomposition into projections of bone-segmented\nQCT for BMD estimation under limited datasets. 
The proposed method achieved\nhigh accuracy in BMD estimation, where Pearson correlation coefficients of\n0.880 and 0.920 were observed for DXA-measured BMD and QCT-measured BMD\nestimation tasks, respectively, and the root mean square of the coefficient of\nvariation values were 3.27 to 3.79% for four measurements with different poses.\nFurthermore, we conducted extensive validation experiments, including\nmulti-pose, uncalibrated-CT, and compression experiments toward actual\napplication in routine clinical practice.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Mazen Soufi","Masaki Takao","Hugues Talbot","Seiji Okada","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2307.11513v1.pdf","comment":"20 pages and 22 figures"},{"id":"http://arxiv.org/abs/2305.19920v2","updated":"2023-07-21T11:27:30Z","published":"2023-05-31T14:56:18Z","title":"MSKdeX: Musculoskeletal (MSK) decomposition from an X-ray image for\n fine-grained estimation of lean muscle mass and muscle volume","summary":" Musculoskeletal diseases such as sarcopenia and osteoporosis are major\nobstacles to health during aging. Although dual-energy X-ray absorptiometry\n(DXA) and computed tomography (CT) can be used to evaluate musculoskeletal\nconditions, frequent monitoring is difficult due to the cost and accessibility\n(as well as high radiation exposure in the case of CT). We propose a method\n(named MSKdeX) to estimate fine-grained muscle properties from a plain X-ray\nimage, a low-cost, low-radiation, and highly accessible imaging modality,\nthrough musculoskeletal decomposition leveraging fine-grained segmentation in\nCT. We train a multi-channel quantitative image translation model to decompose\nan X-ray image into projections of CT of individual muscles to infer the lean\nmuscle mass and muscle volume. We propose the object-wise intensity-sum loss, a\nsimple yet surprisingly effective metric invariant to muscle deformation and\nprojection direction, utilizing information in CT and X-ray images collected\nfrom the same patient. While our method is basically an unpaired image-to-image\ntranslation, we also exploit the nature of the bone's rigidity, which provides\nthe paired data through 2D-3D rigid registration, adding strong pixel-wise\nsupervision in unpaired training. Through the evaluation using a 539-patient\ndataset, we showed that the proposed method significantly outperformed\nconventional methods. The average Pearson correlation coefficient between the\npredicted and CT-derived ground truth metrics was increased from 0.460 to\n0.863. We believe our method opened up a new musculoskeletal diagnosis method\nand has the potential to be extended to broader applications in multi-channel\nquantitative image translation tasks. Our source code will be released soon.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Masaki Takao","Mazen Soufi","Yuta Hiasa","Hugues Talbot","Seiji Okata","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2305.19920v2.pdf","comment":"MICCAI 2023 early acceptance (12 pages and 6 figures)"},{"id":"http://arxiv.org/abs/2306.00988v2","updated":"2023-07-21T11:27:10Z","published":"2023-06-01T17:59:57Z","title":"Continual Learning for Abdominal Multi-Organ and Tumor Segmentation","summary":" The ability to dynamically extend a model to new data and classes is critical\nfor multiple organ and tumor segmentation. However, due to privacy regulations,\naccessing previous data and annotations can be problematic in the medical\ndomain. 
This poses a significant barrier to preserving the high segmentation\naccuracy of the old classes when learning from new classes because of the\ncatastrophic forgetting problem. In this paper, we first empirically\ndemonstrate that simply using high-quality pseudo labels can fairly mitigate\nthis problem in the setting of organ segmentation. Furthermore, we put forward\nan innovative architecture designed specifically for continuous organ and tumor\nsegmentation, which incurs minimal computational overhead. Our proposed design\ninvolves replacing the conventional output layer with a suite of lightweight,\nclass-specific heads, thereby offering the flexibility to accommodate newly\nemerging classes. These heads enable independent predictions for newly\nintroduced and previously learned classes, effectively minimizing the impact of\nnew classes on old ones during the course of continual learning. We further\npropose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings\ninto the organ-specific heads. These embeddings encapsulate the semantic\ninformation of each class, informed by extensive image-text co-training. The\nproposed method is evaluated on both in-house and public abdominal CT datasets\nunder organ and tumor segmentation tasks. Empirical results suggest that the\nproposed design improves the segmentation performance of a baseline neural\nnetwork on newly-introduced and previously-learned classes along the learning\ntrajectory.\n","authors":["Yixiao Zhang","Xinyi Li","Huimiao Chen","Alan Yuille","Yaoyao Liu","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.00988v2.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2303.05966v2","updated":"2023-07-21T11:21:30Z","published":"2023-03-10T14:55:35Z","title":"Score-Based Generative Models for Medical Image Segmentation using\n Signed Distance Functions","summary":" Medical image segmentation is a crucial task that relies on the ability to\naccurately identify and isolate regions of interest in medical images. Thereby,\ngenerative approaches allow to capture the statistical properties of\nsegmentation masks that are dependent on the respective structures. In this\nwork we propose a conditional score-based generative modeling framework to\nrepresent the signed distance function (SDF) leading to an implicit\ndistribution of segmentation masks. The advantage of leveraging the SDF is a\nmore natural distortion when compared to that of binary masks. By learning the\nscore function of the conditional distribution of SDFs we can accurately sample\nfrom the distribution of segmentation masks, allowing for the evaluation of\nstatistical quantities. Thus, this probabilistic representation allows for the\ngeneration of uncertainty maps represented by the variance, which can aid in\nfurther analysis and enhance the predictive robustness. 
We qualitatively and\nquantitatively illustrate competitive performance of the proposed method on a\npublic nuclei and gland segmentation data set, highlighting its potential\nutility in medical image segmentation applications.\n","authors":["Lea Bogensperger","Dominik Narnhofer","Filip Ilic","Thomas Pock"],"pdf_url":"https://arxiv.org/pdf/2303.05966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11482v1","updated":"2023-07-21T10:36:05Z","published":"2023-07-21T10:36:05Z","title":"Redemption from Range-view for Accurate 3D Object Detection","summary":" Most recent approaches for 3D object detection predominantly rely on\npoint-view or bird's-eye view representations, with limited exploration of\nrange-view-based methods. The range-view representation suffers from scale\nvariation and surface texture deficiency, both of which pose significant\nlimitations for developing corresponding methods. Notably, the surface texture\nloss problem has been largely ignored by all existing methods, despite its\nsignificant impact on the accuracy of range-view-based 3D object detection. In\nthis study, we propose Redemption from Range-view R-CNN (R2 R-CNN), a novel and\naccurate approach that comprehensively explores the range-view representation.\nOur proposed method addresses scale variation through the HD Meta Kernel, which\ncaptures range-view geometry information in multiple scales. Additionally, we\nintroduce Feature Points Redemption (FPR) to recover the lost 3D surface\ntexture information from the range view, and Synchronous-Grid RoI Pooling\n(S-Grid RoI Pooling), a multi-scaled approach with multiple receptive fields\nfor accurate box refinement. Our R2 R-CNN outperforms existing range-view-based\nmethods, achieving state-of-the-art performance on both the KITTI benchmark and\nthe Waymo Open Dataset. Our study highlights the critical importance of\naddressing the surface texture loss problem for accurate 3D object detection in\nrange-view-based methods. Codes will be made publicly available.\n","authors":["Yihan Wang","Qiao Yan"],"pdf_url":"https://arxiv.org/pdf/2307.11482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11477v1","updated":"2023-07-21T10:28:19Z","published":"2023-07-21T10:28:19Z","title":"SA-BEV: Generating Semantic-Aware Bird's-Eye-View Feature for Multi-view\n 3D Object Detection","summary":" Recently, the pure camera-based Bird's-Eye-View (BEV) perception provides a\nfeasible solution for economical autonomous driving. However, the existing\nBEV-based multi-view 3D detectors generally transform all image features into\nBEV features, without considering the problem that the large proportion of\nbackground information may submerge the object information. In this paper, we\npropose Semantic-Aware BEV Pooling (SA-BEVPool), which can filter out\nbackground information according to the semantic segmentation of image features\nand transform image features into semantic-aware BEV features. Accordingly, we\npropose BEV-Paste, an effective data augmentation strategy that closely matches\nwith semantic-aware BEV feature. In addition, we design a Multi-Scale\nCross-Task (MSCT) head, which combines task-specific and cross-task information\nto predict depth distribution and semantic segmentation more accurately,\nfurther improving the quality of semantic-aware BEV feature. Finally, we\nintegrate the above modules into a novel multi-view 3D object detection\nframework, namely SA-BEV. Experiments on nuScenes show that SA-BEV achieves\nstate-of-the-art performance. 
Code has been available at\nhttps://github.com/mengtan00/SA-BEV.git.\n","authors":["Jinqing Zhang","Yanan Zhang","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11471v1","updated":"2023-07-21T10:12:09Z","published":"2023-07-21T10:12:09Z","title":"Robust Visual Question Answering: Datasets, Methods, and Future\n Challenges","summary":" Visual question answering requires a system to provide an accurate natural\nlanguage answer given an image and a natural language question. However, it is\nwidely recognized that previous generic VQA methods often exhibit a tendency to\nmemorize biases present in the training data rather than learning proper\nbehaviors, such as grounding images before predicting answers. Therefore, these\nmethods usually achieve high in-distribution but poor out-of-distribution\nperformance. In recent years, various datasets and debiasing methods have been\nproposed to evaluate and enhance the VQA robustness, respectively. This paper\nprovides the first comprehensive survey focused on this emerging fashion.\nSpecifically, we first provide an overview of the development process of\ndatasets from in-distribution and out-of-distribution perspectives. Then, we\nexamine the evaluation metrics employed by these datasets. Thirdly, we propose\na typology that presents the development process, similarities and differences,\nrobustness comparison, and technical features of existing debiasing methods.\nFurthermore, we analyze and discuss the robustness of representative\nvision-and-language pre-training models on VQA. Finally, through a thorough\nreview of the available literature and experimental analysis, we discuss the\nkey areas for future research from various viewpoints.\n","authors":["Jie Ma","Pinghui Wang","Dechen Kong","Zewei Wang","Jun Liu","Hongbin Pei","Junzhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.11471v1.pdf","comment":"IEEE TPAMI (Under Review)"},{"id":"http://arxiv.org/abs/2307.11470v1","updated":"2023-07-21T10:10:18Z","published":"2023-07-21T10:10:18Z","title":"Physics-Aware Semi-Supervised Underwater Image Enhancement","summary":" Underwater images normally suffer from degradation due to the transmission\nmedium of water bodies. Both traditional prior-based approaches and deep\nlearning-based methods have been used to address this problem. However, the\ninflexible assumption of the former often impairs their effectiveness in\nhandling diverse underwater scenes, while the generalization of the latter to\nunseen images is usually weakened by insufficient data. In this study, we\nleverage both the physics-based underwater Image Formation Model (IFM) and deep\nlearning techniques for Underwater Image Enhancement (UIE). To this end, we\npropose a novel Physics-Aware Dual-Stream Underwater Image Enhancement Network,\ni.e., PA-UIENet, which comprises a Transmission Estimation Steam (T-Stream) and\nan Ambient Light Estimation Stream (A-Stream). This network fulfills the UIE\ntask by explicitly estimating the degradation parameters of the IFM. We also\nadopt an IFM-inspired semi-supervised learning framework, which exploits both\nthe labeled and unlabeled images, to address the issue of insufficient data.\nOur method performs better than, or at least comparably to, eight baselines\nacross five testing sets in the degradation estimation and UIE tasks. 
This\nshould be due to the fact that it not only can model the degradation but also\ncan learn the characteristics of diverse underwater scenes.\n","authors":["Hao Qi","Xinghui Dong"],"pdf_url":"https://arxiv.org/pdf/2307.11470v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.11469v1","updated":"2023-07-21T10:08:58Z","published":"2023-07-21T10:08:58Z","title":"Distribution Shift Matters for Knowledge Distillation with Webly\n Collected Images","summary":" Knowledge distillation aims to learn a lightweight student network from a\npre-trained teacher network. In practice, existing knowledge distillation\nmethods are usually infeasible when the original training data is unavailable\ndue to some privacy issues and data management considerations. Therefore,\ndata-free knowledge distillation approaches proposed to collect training\ninstances from the Internet. However, most of them have ignored the common\ndistribution shift between the instances from original training data and webly\ncollected data, affecting the reliability of the trained student network. To\nsolve this problem, we propose a novel method dubbed ``Knowledge Distillation\nbetween Different Distributions\" (KD$^{3}$), which consists of three\ncomponents. Specifically, we first dynamically select useful training instances\nfrom the webly collected data according to the combined predictions of teacher\nnetwork and student network. Subsequently, we align both the weighted features\nand classifier parameters of the two networks for knowledge memorization.\nMeanwhile, we also build a new contrastive learning block called\nMixDistribution to generate perturbed data with a new distribution for instance\nalignment, so that the student network can further learn a\ndistribution-invariant representation. Intensive experiments on various\nbenchmark datasets demonstrate that our proposed KD$^{3}$ can outperform the\nstate-of-the-art data-free knowledge distillation approaches.\n","authors":["Jialiang Tang","Shuo Chen","Gang Niu","Masashi Sugiyama","Chen Gong"],"pdf_url":"https://arxiv.org/pdf/2307.11469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11466v1","updated":"2023-07-21T10:02:02Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet to segment materials with recovered hyperspectral images from RGB\nimages. 
The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v1.pdf","comment":"7 pages main content"},{"id":"http://arxiv.org/abs/2210.09563v2","updated":"2023-07-21T10:01:25Z","published":"2022-10-18T03:32:18Z","title":"FedForgery: Generalized Face Forgery Detection with Residual Federated\n Learning","summary":" With the continuous development of deep learning in the field of image\ngeneration models, a large number of vivid forged faces have been generated and\nspread on the Internet. These high-authenticity artifacts could grow into a\nthreat to society security. Existing face forgery detection methods directly\nutilize the obtained public shared or centralized data for training but ignore\nthe personal privacy and security issues when personal data couldn't be\ncentralizedly shared in real-world scenarios. Additionally, different\ndistributions caused by diverse artifact types would further bring adverse\ninfluences on the forgery detection task. To solve the mentioned problems, the\npaper proposes a novel generalized residual Federated learning for face Forgery\ndetection (FedForgery). The designed variational autoencoder aims to learn\nrobust discriminative residual feature maps to detect forgery faces (with\ndiverse or even unknown artifact types). Furthermore, the general federated\nlearning strategy is introduced to construct distributed detection model\ntrained collaboratively with multiple local decentralized devices, which could\nfurther boost the representation generalization. Experiments conducted on\npublicly available face forgery detection datasets prove the superior\nperformance of the proposed FedForgery. The designed novel generalized face\nforgery detection protocols and source code would be publicly available.\n","authors":["Decheng Liu","Zhan Dang","Chunlei Peng","Yu Zheng","Shuang Li","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2210.09563v2.pdf","comment":"The code is available at https://github.com/GANG370/FedForgery. The\n paper has been accepted in the IEEE Transactions on Information Forensics &\n Security"},{"id":"http://arxiv.org/abs/2307.10926v2","updated":"2023-07-21T09:47:01Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. 
The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquaux","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.11458v1","updated":"2023-07-21T09:40:42Z","published":"2023-07-21T09:40:42Z","title":"Strip-MLP: Efficient Token Interaction for Vision MLP","summary":" Token interaction operation is one of the core modules in MLP-based models to\nexchange and aggregate information between different spatial locations.\nHowever, the power of token interaction on the spatial dimension is highly\ndependent on the spatial resolution of the feature maps, which limits the\nmodel's expressive ability, especially in deep layers where the feature are\ndown-sampled to a small spatial size. To address this issue, we present a novel\nmethod called \\textbf{Strip-MLP} to enrich the token interaction power in three\nways. Firstly, we introduce a new MLP paradigm called Strip MLP layer that\nallows the token to interact with other tokens in a cross-strip manner,\nenabling the tokens in a row (or column) to contribute to the information\naggregations in adjacent but different strips of rows (or columns). Secondly, a\n\\textbf{C}ascade \\textbf{G}roup \\textbf{S}trip \\textbf{M}ixing \\textbf{M}odule\n(CGSMM) is proposed to overcome the performance degradation caused by small\nspatial feature size. The module allows tokens to interact more effectively in\nthe manners of within-patch and cross-patch, which is independent to the\nfeature spatial size. Finally, based on the Strip MLP layer, we propose a novel\n\\textbf{L}ocal \\textbf{S}trip \\textbf{M}ixing \\textbf{M}odule (LSMM) to boost\nthe token interaction power in the local region. Extensive experiments\ndemonstrate that Strip-MLP significantly improves the performance of MLP-based\nmodels on small datasets and obtains comparable or even better results on\nImageNet. In particular, Strip-MLP models achieve higher average Top-1 accuracy\nthan existing MLP-based models by +2.44\\% on Caltech-101 and +2.16\\% on\nCIFAR-100. 
The source codes will be available\nat~\\href{https://github.com/Med-Process/Strip_MLP{https://github.com/Med-Process/Strip\\_MLP}.\n","authors":["Guiping Cao","Shengda Luo","Wenjian Huang","Xiangyuan Lan","Dongmei Jiang","Yaowei Wang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08092v2","updated":"2023-07-21T09:31:57Z","published":"2023-07-16T16:29:26Z","title":"Gait Data Augmentation using Physics-Based Biomechanical Simulation","summary":" This paper focuses on addressing the problem of data scarcity for gait\nanalysis. Standard augmentation methods may produce gait sequences that are not\nconsistent with the biomechanical constraints of human walking. To address this\nissue, we propose a novel framework for gait data augmentation by using\nOpenSIM, a physics-based simulator, to synthesize biomechanically plausible\nwalking sequences. The proposed approach is validated by augmenting the WBDS\nand CASIA-B datasets and then training gait-based classifiers for 3D gender\ngait classification and 2D gait person identification respectively.\nExperimental results indicate that our augmentation approach can improve the\nperformance of model-based gait classifiers and deliver state-of-the-art\nresults for gait-based person identification with an accuracy of up to 96.11%\non the CASIA-B dataset.\n","authors":["Mritula Chandrasekaran","Jarek Francik","Dimitrios Makris"],"pdf_url":"https://arxiv.org/pdf/2307.08092v2.pdf","comment":"30 pages including references, 5 Figures submitted to ESWA"},{"id":"http://arxiv.org/abs/2307.02953v2","updated":"2023-07-21T09:26:06Z","published":"2023-07-06T12:39:06Z","title":"SegNetr: Rethinking the local-global interactions and skip connections\n in U-shaped networks","summary":" Recently, U-shaped networks have dominated the field of medical image\nsegmentation due to their simple and easily tuned structure. However, existing\nU-shaped segmentation networks: 1) mostly focus on designing complex\nself-attention modules to compensate for the lack of long-term dependence based\non convolution operation, which increases the overall number of parameters and\ncomputational complexity of the network; 2) simply fuse the features of encoder\nand decoder, ignoring the connection between their spatial locations. In this\npaper, we rethink the above problem and build a lightweight medical image\nsegmentation network, called SegNetr. Specifically, we introduce a novel\nSegNetr block that can perform local-global interactions dynamically at any\nstage and with only linear complexity. At the same time, we design a general\ninformation retention skip connection (IRSC) to preserve the spatial location\ninformation of encoder features and achieve accurate fusion with the decoder\nfeatures. We validate the effectiveness of SegNetr on four mainstream medical\nimage segmentation datasets, with 59\\% and 76\\% fewer parameters and GFLOPs\nthan vanilla U-Net, while achieving segmentation performance comparable to\nstate-of-the-art methods. 
Notably, the components proposed in this paper can be\napplied to other U-shaped networks to improve their segmentation performance.\n","authors":["Junlong Cheng","Chengrui Gao","Fengjie Wang","Min Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.02953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04246v2","updated":"2023-07-21T09:15:42Z","published":"2023-02-08T18:26:10Z","title":"Shortcut Detection with Variational Autoencoders","summary":" For real-world applications of machine learning (ML), it is essential that\nmodels make predictions based on well-generalizing features rather than\nspurious correlations in the data. The identification of such spurious\ncorrelations, also known as shortcuts, is a challenging problem and has so far\nbeen scarcely addressed. In this work, we present a novel approach to detect\nshortcuts in image and audio datasets by leveraging variational autoencoders\n(VAEs). The disentanglement of features in the latent space of VAEs allows us\nto discover feature-target correlations in datasets and semi-automatically\nevaluate them for ML shortcuts. We demonstrate the applicability of our method\non several real-world datasets and identify shortcuts that have not been\ndiscovered before.\n","authors":["Nicolas M. Müller","Simon Roschmann","Shahbaz Khan","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2302.04246v2.pdf","comment":"Accepted at the ICML 2023 Workshop on Spurious Correlations,\n Invariance and Stability"},{"id":"http://arxiv.org/abs/2307.04378v3","updated":"2023-07-21T09:13:55Z","published":"2023-07-10T07:24:44Z","title":"Towards Generalizable Diabetic Retinopathy Grading in Unseen Domains","summary":" Diabetic Retinopathy (DR) is a common complication of diabetes and a leading\ncause of blindness worldwide. Early and accurate grading of its severity is\ncrucial for disease management. Although deep learning has shown great\npotential for automated DR grading, its real-world deployment is still\nchallenging due to distribution shifts among source and target domains, known\nas the domain generalization problem. Existing works have mainly attributed the\nperformance degradation to limited domain shifts caused by simple visual\ndiscrepancies, which cannot handle complex real-world scenarios. Instead, we\npresent preliminary evidence suggesting the existence of three-fold\ngeneralization issues: visual and degradation style shifts, diagnostic pattern\ndiversity, and data imbalance. To tackle these issues, we propose a novel\nunified framework named Generalizable Diabetic Retinopathy Grading Network\n(GDRNet). GDRNet consists of three vital components: fundus visual-artifact\naugmentation (FundusAug), dynamic hybrid-supervised loss (DahLoss), and\ndomain-class-aware re-balancing (DCR). FundusAug generates realistic augmented\nimages via visual transformation and image degradation, while DahLoss jointly\nleverages pixel-level consistency and image-level semantics to capture the\ndiverse diagnostic patterns and build generalizable feature representations.\nMoreover, DCR mitigates the data imbalance from a domain-class view and avoids\nundesired over-emphasis on rare domain-class pairs. Finally, we design a\npublicly available benchmark for fair evaluations. 
Extensive comparison\nexperiments against advanced methods and exhaustive ablation studies\ndemonstrate the effectiveness and generalization ability of GDRNet.\n","authors":["Haoxuan Che","Yuhan Cheng","Haibo Jin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.04378v3.pdf","comment":"Early Accepted by MICCAI 2023, the 26th International Conference on\n Medical Image Computing and Computer Assisted Intervention"},{"id":"http://arxiv.org/abs/2305.18310v2","updated":"2023-07-21T09:12:17Z","published":"2023-05-17T14:14:31Z","title":"Motion-Scenario Decoupling for Rat-Aware Video Position Prediction:\n Strategy and Benchmark","summary":" Recently significant progress has been made in human action recognition and\nbehavior prediction using deep learning techniques, leading to improved\nvision-based semantic understanding. However, there is still a lack of\nhigh-quality motion datasets for small bio-robotics, which presents more\nchallenging scenarios for long-term movement prediction and behavior control\nbased on third-person observation. In this study, we introduce RatPose, a\nbio-robot motion prediction dataset constructed by considering the influence\nfactors of individuals and environments based on predefined annotation rules.\nTo enhance the robustness of motion prediction against these factors, we\npropose a Dual-stream Motion-Scenario Decoupling (\\textit{DMSD}) framework that\neffectively separates scenario-oriented and motion-oriented features and\ndesigns a scenario contrast loss and motion clustering loss for overall\ntraining. With such distinctive architecture, the dual-branch feature flow\ninformation is interacted and compensated in a decomposition-then-fusion\nmanner. Moreover, we demonstrate significant performance improvements of the\nproposed \\textit{DMSD} framework on different difficulty-level tasks. We also\nimplement long-term discretized trajectory prediction tasks to verify the\ngeneralization ability of the proposed dataset.\n","authors":["Xiaofeng Liu","Jiaxin Gao","Yaohua Liu","Risheng Liu","Nenggan Zheng"],"pdf_url":"https://arxiv.org/pdf/2305.18310v2.pdf","comment":"Rat, Video Position Prediction"},{"id":"http://arxiv.org/abs/2303.09975v4","updated":"2023-07-21T09:05:53Z","published":"2023-03-17T13:48:17Z","title":"MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image\n Segmentation","summary":" There has been exploding interest in embracing Transformer-based\narchitectures for medical image segmentation. However, the lack of large-scale\nannotated medical datasets make achieving performances equivalent to those in\nnatural images challenging. Convolutional networks, in contrast, have higher\ninductive biases and consequently, are easily trainable to high performance.\nRecently, the ConvNeXt architecture attempted to modernize the standard ConvNet\nby mirroring Transformer blocks. In this work, we improve upon this to design a\nmodernized and scalable convolutional architecture customized to challenges of\ndata-scarce medical settings. 
We introduce MedNeXt, a Transformer-inspired\nlarge kernel segmentation network which introduces - 1) A fully ConvNeXt 3D\nEncoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up\nand downsampling blocks to preserve semantic richness across scales, 3) A novel\ntechnique to iteratively increase kernel sizes by upsampling small kernel\nnetworks, to prevent performance saturation on limited medical data, 4)\nCompound scaling at multiple levels (depth, width, kernel size) of MedNeXt.\nThis leads to state-of-the-art performance on 4 tasks on CT and MRI modalities\nand varying dataset sizes, representing a modernized deep architecture for\nmedical image segmentation. Our code is made publicly available at:\nhttps://github.com/MIC-DKFZ/MedNeXt.\n","authors":["Saikat Roy","Gregor Koehler","Constantin Ulrich","Michael Baumgartner","Jens Petersen","Fabian Isensee","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.09975v4.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11438v1","updated":"2023-07-21T08:58:49Z","published":"2023-07-21T08:58:49Z","title":"Attention Consistency Refined Masked Frequency Forgery Representation\n for Generalizing Face Forgery Detection","summary":" Due to the successful development of deep image generation technology, visual\ndata forgery detection would play a more important role in social and economic\nsecurity. Existing forgery detection methods suffer from unsatisfactory\ngeneralization ability to determine the authenticity in the unseen domain. In\nthis paper, we propose a novel Attention Consistency Refined masked frequency\nforgery representation model toward generalizing face forgery detection\nalgorithm (ACMF). Most forgery technologies always bring in high-frequency\naware cues, which make it easy to distinguish source authenticity but difficult\nto generalize to unseen artifact types. The masked frequency forgery\nrepresentation module is designed to explore robust forgery cues by randomly\ndiscarding high-frequency information. In addition, we find that the forgery\nattention map inconsistency through the detection network could affect the\ngeneralizability. Thus, the forgery attention consistency is introduced to\nforce detectors to focus on similar attention regions for better generalization\nability. Experiment results on several public face forgery datasets\n(FaceForensic++, DFD, Celeb-DF, and WDF datasets) demonstrate the superior\nperformance of the proposed method compared with the state-of-the-art methods.\n","authors":["Decheng Liu","Tao Chen","Chunlei Peng","Nannan Wang","Ruimin Hu","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2307.11438v1.pdf","comment":"The source code and models are publicly available at\n https://github.com/chenboluo/ACMF"},{"id":"http://arxiv.org/abs/2307.11434v1","updated":"2023-07-21T08:55:23Z","published":"2023-07-21T08:55:23Z","title":"Batching for Green AI -- An Exploratory Study on Inference","summary":" The batch size is an essential parameter to tune during the development of\nnew neural networks. Amongst other quality indicators, it has a large degree of\ninfluence on the model's accuracy, generalisability, training times and\nparallelisability. This fact is generally known and commonly studied. However,\nduring the application phase of a deep learning model, when the model is\nutilised by an end-user for inference, we find that there is a disregard for\nthe potential benefits of introducing a batch size. 
In this study, we examine\nthe effect of input batching on the energy consumption and response times of\nfive fully-trained neural networks for computer vision that were considered\nstate-of-the-art at the time of their publication. The results suggest that\nbatching has a significant effect on both of these metrics. Furthermore, we\npresent a timeline of the energy efficiency and accuracy of neural networks\nover the past decade. We find that in general, energy consumption rises at a\nmuch steeper pace than accuracy and question the necessity of this evolution.\nAdditionally, we highlight one particular network, ShuffleNetV2(2018), that\nachieved a competitive performance for its time while maintaining a much lower\nenergy consumption. Nevertheless, we highlight that the results are model\ndependent.\n","authors":["Tim Yarally","Luís Cruz","Daniel Feitosa","June Sallou","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2307.11434v1.pdf","comment":"8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series\n on Software Engineering and Advanced Applications (SEAA) 2023"},{"id":"http://arxiv.org/abs/2307.09004v2","updated":"2023-07-21T08:41:23Z","published":"2023-07-18T06:44:20Z","title":"Ord2Seq: Regarding Ordinal Regression as Label Sequence Prediction","summary":" Ordinal regression refers to classifying object instances into ordinal\ncategories. It has been widely studied in many scenarios, such as medical\ndisease grading, movie rating, etc. Known methods focused only on learning\ninter-class ordinal relationships, but still incur limitations in\ndistinguishing adjacent categories thus far. In this paper, we propose a simple\nsequence prediction framework for ordinal regression called Ord2Seq, which, for\nthe first time, transforms each ordinal category label into a special label\nsequence and thus regards an ordinal regression task as a sequence prediction\nprocess. In this way, we decompose an ordinal regression task into a series of\nrecursive binary classification steps, so as to subtly distinguish adjacent\ncategories. Comprehensive experiments show the effectiveness of distinguishing\nadjacent categories for performance improvement and our new approach exceeds\nstate-of-the-art performances in four different scenarios. Codes are available\nat https://github.com/wjh892521292/Ord2Seq.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Tingting Chen","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09004v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2208.00657v2","updated":"2023-07-21T08:39:22Z","published":"2022-08-01T07:35:45Z","title":"SiamixFormer: a fully-transformer Siamese network with temporal Fusion\n for accurate building detection and change detection in bi-temporal remote\n sensing images","summary":" Building detection and change detection using remote sensing images can help\nurban and rescue planning. Moreover, they can be used for building damage\nassessment after natural disasters. Currently, most of the existing models for\nbuilding detection use only one image (pre-disaster image) to detect buildings.\nThis is based on the idea that post-disaster images reduce the model's\nperformance because of presence of destroyed buildings. In this paper, we\npropose a siamese model, called SiamixFormer, which uses pre- and post-disaster\nimages as input. Our model has two encoders and has a hierarchical transformer\narchitecture. 
The output of each stage in both encoders is given to a temporal\ntransformer for feature fusion in a way that query is generated from\npre-disaster images and (key, value) is generated from post-disaster images. To\nthis end, temporal features are also considered in feature fusion. Another\nadvantage of using temporal transformers in feature fusion is that they can\nbetter maintain large receptive fields generated by transformer encoders\ncompared with CNNs. Finally, the output of the temporal transformer is given to\na simple MLP decoder at each stage. The SiamixFormer model is evaluated on xBD,\nand WHU datasets, for building detection and on LEVIR-CD and CDD datasets for\nchange detection and could outperform the state-of-the-art.\n","authors":["Amir Mohammadian","Foad Ghaderi"],"pdf_url":"https://arxiv.org/pdf/2208.00657v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11418v1","updated":"2023-07-21T08:22:14Z","published":"2023-07-21T08:22:14Z","title":"FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural\n Radiance Fields","summary":" As recent advances in Neural Radiance Fields (NeRF) have enabled\nhigh-fidelity 3D face reconstruction and novel view synthesis, its manipulation\nalso became an essential task in 3D vision. However, existing manipulation\nmethods require extensive human labor, such as a user-provided semantic mask\nand manual attribute search unsuitable for non-expert users. Instead, our\napproach is designed to require a single text to manipulate a face\nreconstructed with NeRF. To do so, we first train a scene manipulator, a latent\ncode-conditional deformable NeRF, over a dynamic scene to control a face\ndeformation using the latent code. However, representing a scene deformation\nwith a single latent code is unfavorable for compositing local deformations\nobserved in different instances. As so, our proposed Position-conditional\nAnchor Compositor (PAC) learns to represent a manipulated scene with spatially\nvarying latent codes. Their renderings with the scene manipulator are then\noptimized to yield high cosine similarity to a target text in CLIP embedding\nspace for text-driven manipulation. To the best of our knowledge, our approach\nis the first to address the text-driven manipulation of a face reconstructed\nwith NeRF. Extensive results, comparisons, and ablation studies demonstrate the\neffectiveness of our approach.\n","authors":["Sungwon Hwang","Junha Hyung","Daejin Kim","Min-Jung Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2307.11418v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11413v1","updated":"2023-07-21T08:15:39Z","published":"2023-07-21T08:15:39Z","title":"A Video-based Detector for Suspicious Activity in Examination with\n OpenPose","summary":" Examinations are a crucial part of the learning process, and academic\ninstitutions invest significant resources into maintaining their integrity by\npreventing cheating from students or facilitators. However, cheating has become\nrampant in examination setups, compromising their integrity. The traditional\nmethod of relying on invigilators to monitor every student is impractical and\nineffective. To address this issue, there is a need to continuously record exam\nsessions to monitor students for suspicious activities. However, these\nrecordings are often too lengthy for invigilators to analyze effectively, and\nfatigue may cause them to miss significant details. To widen the coverage,\ninvigilators could use fixed overhead or wearable cameras. 
This paper\nintroduces a framework that uses automation to analyze videos and detect\nsuspicious activities during examinations efficiently and effectively. We\nutilized the OpenPose framework and Convolutional Neural Network (CNN) to\nidentify students exchanging objects during exams. This detection system is\nvital in preventing cheating and promoting academic integrity, fairness, and\nquality education for institutions.\n","authors":["Reuben Moyo","Stanley Ndebvu","Michael Zimba","Jimmy Mbelwa"],"pdf_url":"https://arxiv.org/pdf/2307.11413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11411v1","updated":"2023-07-21T08:10:26Z","published":"2023-07-21T08:10:26Z","title":"Deep Directly-Trained Spiking Neural Networks for Object Detection","summary":" Spiking neural networks (SNNs) are brain-inspired energy-efficient models\nthat encode information in spatiotemporal dynamics. Recently, deep SNNs trained\ndirectly have shown great success in achieving high performance on\nclassification tasks with very few time steps. However, how to design a\ndirectly-trained SNN for the regression task of object detection still remains\na challenging problem. To address this problem, we propose EMS-YOLO, a novel\ndirectly-trained SNN framework for object detection, which is the first trial\nto train a deep SNN with surrogate gradients for object detection rather than\nANN-SNN conversion strategies. Specifically, we design a full-spike residual\nblock, EMS-ResNet, which can effectively extend the depth of the\ndirectly-trained SNN with low power consumption. Furthermore, we theoretically\nanalyze and prove the EMS-ResNet could avoid gradient vanishing or exploding.\nThe results demonstrate that our approach outperforms the state-of-the-art\nANN-SNN conversion methods (at least 500 time steps) in extremely fewer time\nsteps (only 4 time steps). It is shown that our model could achieve comparable\nperformance to the ANN with the same architecture while consuming 5.83 times\nless energy on the frame-based COCO Dataset and the event-based Gen1 Dataset.\n","authors":["Qiaoyi Su","Yuhong Chou","Yifan Hu","Jianing Li","Shijie Mei","Ziyang Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2307.11411v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.11410v1","updated":"2023-07-21T08:09:47Z","published":"2023-07-21T08:09:47Z","title":"Subject-Diffusion:Open Domain Personalized Text-to-Image Generation\n without Test-time Fine-tuning","summary":" Recent progress in personalized image generation using diffusion models has\nbeen significant. However, development in the area of open-domain and\nnon-fine-tuning personalized image generation is proceeding rather slowly. In\nthis paper, we propose Subject-Diffusion, a novel open-domain personalized\nimage generation model that, in addition to not requiring test-time\nfine-tuning, also only requires a single reference image to support\npersonalized generation of single- or multi-subject in any domain. Firstly, we\nconstruct an automatic data labeling tool and use the LAION-Aesthetics dataset\nto construct a large-scale dataset consisting of 76M images and their\ncorresponding subject detection bounding boxes, segmentation masks and text\ndescriptions. Secondly, we design a new unified framework that combines text\nand image semantics by incorporating coarse location and fine-grained reference\nimage control to maximize subject fidelity and generalization. 
Furthermore, we\nalso adopt an attention control mechanism to support multi-subject generation.\nExtensive qualitative and quantitative results demonstrate that our method\noutperforms other SOTA frameworks in single, multiple, and human customized\nimage generation. Please refer to our\n\\href{https://oppo-mente-lab.github.io/subject_diffusion/}{project page}\n","authors":["Jian Ma","Junhao Liang","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2307.11410v1.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.11404v1","updated":"2023-07-21T07:56:32Z","published":"2023-07-21T07:56:32Z","title":"Latent-OFER: Detect, Mask, and Reconstruct with Latent Vectors for\n Occluded Facial Expression Recognition","summary":" Most research on facial expression recognition (FER) is conducted in highly\ncontrolled environments, but its performance is often unacceptable when applied\nto real-world situations. This is because when unexpected objects occlude the\nface, the FER network faces difficulties extracting facial features and\naccurately predicting facial expressions. Therefore, occluded FER (OFER) is a\nchallenging problem. Previous studies on occlusion-aware FER have typically\nrequired fully annotated facial images for training. However, collecting facial\nimages with various occlusions and expression annotations is time-consuming and\nexpensive. Latent-OFER, the proposed method, can detect occlusions, restore\noccluded parts of the face as if they were unoccluded, and recognize them,\nimproving FER accuracy. This approach involves three steps: First, the vision\ntransformer (ViT)-based occlusion patch detector masks the occluded position by\ntraining only latent vectors from the unoccluded patches using the support\nvector data description algorithm. Second, the hybrid reconstruction network\ngenerates the masking position as a complete image using the ViT and\nconvolutional neural network (CNN). Last, the expression-relevant latent vector\nextractor retrieves and uses expression-related information from all latent\nvectors by applying a CNN-based class activation map. This mechanism has a\nsignificant advantage in preventing performance degradation from occlusion by\nunseen objects. The experimental results on several databases demonstrate the\nsuperiority of the proposed method over state-of-the-art methods.\n","authors":["Isack Lee","Eungi Lee","Seok Bong Yoo"],"pdf_url":"https://arxiv.org/pdf/2307.11404v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.11397v1","updated":"2023-07-21T07:29:38Z","published":"2023-07-21T07:29:38Z","title":"Probabilistic Modeling of Inter- and Intra-observer Variability in\n Medical Image Segmentation","summary":" Medical image segmentation is a challenging task, particularly due to inter-\nand intra-observer variability, even between medical experts. In this paper, we\npropose a novel model, called Probabilistic Inter-Observer and iNtra-Observer\nvariation NetwOrk (Pionono). It captures the labeling behavior of each rater\nwith a multidimensional probability distribution and integrates this\ninformation with the feature maps of the image to produce probabilistic\nsegmentation predictions. The model is optimized by variational inference and\ncan be trained end-to-end. 
It outperforms state-of-the-art models such as\nSTAPLE, Probabilistic U-Net, and models based on confusion matrices.\nAdditionally, Pionono predicts multiple coherent segmentation maps that mimic\nthe rater's expert opinion, which provides additional valuable information for\nthe diagnostic process. Experiments on real-world cancer segmentation datasets\ndemonstrate the high accuracy and efficiency of Pionono, making it a powerful\ntool for medical image analysis.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2307.11397v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09815v2","updated":"2023-07-21T07:10:28Z","published":"2023-07-19T08:03:53Z","title":"LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network","summary":" Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent\nblur is a challenging task.~Existing blur map-based deblurring methods have\ndemonstrated promising results. In this paper, we propose, to the best of our\nknowledge, the first framework to introduce the contrastive language-image\npre-training framework (CLIP) to achieve accurate blur map estimation from DP\npairs unsupervisedly. To this end, we first carefully design text prompts to\nenable CLIP to understand blur-related geometric prior knowledge from the DP\npair. Then, we propose a format to input stereo DP pair to the CLIP without any\nfine-tuning, where the CLIP is pre-trained on monocular images. Given the\nestimated blur map, we introduce a blur-prior attention block, a blur-weighting\nloss and a blur-aware loss to recover the all-in-focus image. Our method\nachieves state-of-the-art performance in extensive experiments.\n","authors":["Hao Yang","Liyuan Pan","Yan Yang","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10577v2","updated":"2023-07-21T06:59:21Z","published":"2023-07-20T04:41:39Z","title":"Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced\n Perception based on Joint-Embedding & Contextual Label Affinity","summary":" Traditional computer vision models often require extensive manual effort for\ndata acquisition, annotation and validation, particularly when detecting subtle\nbehavioral nuances or events. The difficulty in distinguishing routine\nbehaviors from potential risks in real-world applications, such as\ndifferentiating routine shopping from potential shoplifting, further\ncomplicates the process. Moreover, these models may demonstrate high false\npositive rates and imprecise event detection when exposed to real-world\nscenarios that differ significantly from the conditions of the training data.\n To overcome these hurdles, we present Ethosight, a novel zero-shot computer\nvision system. Ethosight initiates with a clean slate based on user\nrequirements and semantic knowledge of interest. Using localized label affinity\ncalculations and a reasoning-guided iterative learning loop, Ethosight infers\nscene details and iteratively refines the label set. Reasoning mechanisms can\nbe derived from large language models like GPT4, symbolic reasoners like\nOpenNARS\\cite{wang2013}\\cite{wang2006}, or hybrid systems.\n Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases,\nspanning domains such as health, safety, and security. 
Detailed results and\ncase studies within the main body of this paper and an appendix underscore a\npromising trajectory towards enhancing the adaptability and resilience of\ncomputer vision models in detecting and extracting subtle and nuanced\nbehaviors.\n","authors":["Hugo Latapie","Kristinn R. Thorisson","Shan Yu","Vahagn Petrosyan","Patrick Hammer","Pei Wang","Brandon Kynoch","Hanning Chen","Tangrui Li"],"pdf_url":"https://arxiv.org/pdf/2307.10577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11386v1","updated":"2023-07-21T06:56:21Z","published":"2023-07-21T06:56:21Z","title":"CLR: Channel-wise Lightweight Reprogramming for Continual Learning","summary":" Continual learning aims to emulate the human ability to continually\naccumulate knowledge over sequential tasks. The main challenge is to maintain\nperformance on previously learned tasks after learning new tasks, i.e., to\navoid catastrophic forgetting. We propose a Channel-wise Lightweight\nReprogramming (CLR) approach that helps convolutional neural networks (CNNs)\novercome catastrophic forgetting during continual learning. We show that a CNN\nmodel trained on an old task (or self-supervised proxy task) could be\n``reprogrammed\" to solve a new task by using our proposed lightweight (very\ncheap) reprogramming parameter. With the help of CLR, we have a better\nstability-plasticity trade-off to solve continual learning problems: To\nmaintain stability and retain previous task ability, we use a common\ntask-agnostic immutable part as the shared ``anchor\" parameter set. We then add\ntask-specific lightweight reprogramming parameters to reinterpret the outputs\nof the immutable parts, to enable plasticity and integrate new knowledge. To\nlearn sequential tasks, we only train the lightweight reprogramming parameters\nto learn each new task. Reprogramming parameters are task-specific and\nexclusive to each task, which makes our method immune to catastrophic\nforgetting. To minimize the parameter requirement of reprogramming to learn new\ntasks, we make reprogramming lightweight by only adjusting essential kernels\nand learning channel-wise linear mappings from anchor parameters to\ntask-specific domain knowledge. We show that, for general CNNs, the CLR\nparameter increase is less than 0.6\\% for any new task. Our method outperforms\n13 state-of-the-art continual learning baselines on a new challenging sequence\nof 53 image classification datasets. Code and data are available at\nhttps://github.com/gyhandy/Channel-wise-Lightweight-Reprogramming\n","authors":["Yunhao Ge","Yuecheng Li","Shuo Ni","Jiaping Zhao","Ming-Hsuan Yang","Laurent Itti"],"pdf_url":"https://arxiv.org/pdf/2307.11386v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.06146v2","updated":"2023-07-21T06:34:54Z","published":"2023-03-10T18:59:33Z","title":"StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces","summary":" Recent advances in face manipulation using StyleGAN have produced impressive\nresults. However, StyleGAN is inherently limited to cropped aligned faces at a\nfixed image resolution it is pre-trained on. In this paper, we propose a simple\nand effective solution to this limitation by using dilated convolutions to\nrescale the receptive fields of shallow layers in StyleGAN, without altering\nany model parameters. This allows fixed-size small features at shallow layers\nto be extended into larger ones that can accommodate variable resolutions,\nmaking them more robust in characterizing unaligned faces. 
To enable real face\ninversion and manipulation, we introduce a corresponding encoder that provides\nthe first-layer feature of the extended StyleGAN in addition to the latent\nstyle code. We validate the effectiveness of our method using unaligned face\ninputs of various resolutions in a diverse set of face manipulation tasks,\nincluding facial attribute editing, super-resolution, sketch/mask-to-face\ntranslation, and face toonification.\n","authors":["Shuai Yang","Liming Jiang","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2303.06146v2.pdf","comment":"ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX\n Project page: https://www.mmlab-ntu.com/project/styleganex/"},{"id":"http://arxiv.org/abs/2307.11375v1","updated":"2023-07-21T06:17:09Z","published":"2023-07-21T06:17:09Z","title":"LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent\n Space","summary":" Data Augmentation (DA) is a technique to increase the quantity and diversity\nof the training data, and by that alleviate overfitting and improve\ngeneralisation. However, standard DA produces synthetic data for augmentation\nwith limited diversity. Generative Adversarial Networks (GANs) may unlock\nadditional information in a dataset by generating synthetic samples having the\nappearance of real images. However, these models struggle to simultaneously\naddress three key requirements: fidelity and high-quality samples; diversity\nand mode coverage; and fast sampling. Indeed, GANs generate high-quality\nsamples rapidly, but have poor mode coverage, limiting their adoption in DA\napplications. We propose LatentAugment, a DA strategy that overcomes the low\ndiversity of GANs, opening up for use in DA applications. Without external\nsupervision, LatentAugment modifies latent vectors and moves them into latent\nspace regions to maximise the synthetic images' diversity and fidelity. It is\nalso agnostic to the dataset and the downstream task. A wide set of experiments\nshows that LatentAugment improves the generalisation of a deep model\ntranslating from MRI-to-CT beating both standard DA as well GAN-based sampling.\nMoreover, still in comparison with GAN-based sampling, LatentAugment synthetic\nsamples show superior mode coverage and diversity. Code is available at:\nhttps://github.com/ltronchin/LatentAugment.\n","authors":["Lorenzo Tronchin","Minh H. Vu","Paolo Soda","Tommy Löfstedt"],"pdf_url":"https://arxiv.org/pdf/2307.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16649v4","updated":"2023-07-21T05:46:30Z","published":"2023-05-26T05:41:20Z","title":"FSD: Fully-Specialized Detector via Neural Architecture Search","summary":" Most generic object detectors are mainly built for standard object detection\ntasks such as COCO and PASCAL VOC. They might not work well and/or efficiently\non tasks of other domains consisting of images that are visually different from\nstandard datasets. To this end, many advances have been focused on adapting a\ngeneral-purposed object detector with limited domain-specific designs. However,\ndesigning a successful task-specific detector requires extraneous manual\nexperiments and parameter tuning through trial and error. In this paper, we\nfirst propose and examine a fully-automatic pipeline to design a\nfully-specialized detector (FSD) which mainly incorporates a\nneural-architectural-searched model by exploring ideal network structures over\nthe backbone and task-specific head. 
On the DeepLesion dataset, extensive\nresults show that FSD can achieve 3.1 mAP gain while using approximately 40%\nfewer parameters on binary lesion detection task and improved the mAP by around\n10% on multi-type lesion detection task via our region-aware graph modeling\ncompared with existing general-purposed medical lesion detection networks.\n","authors":["Zhe Huang","Yudian Li"],"pdf_url":"https://arxiv.org/pdf/2305.16649v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11364v1","updated":"2023-07-21T05:33:57Z","published":"2023-07-21T05:33:57Z","title":"Photo2Relief: Let Human in the Photograph Stand Out","summary":" In this paper, we propose a technique for making humans in photographs\nprotrude like reliefs. Unlike previous methods which mostly focus on the face\nand head, our method aims to generate art works that describe the whole body\nactivity of the character. One challenge is that there is no ground-truth for\nsupervised deep learning. We introduce a sigmoid variant function to manipulate\ngradients tactfully and train our neural networks by equipping with a loss\nfunction defined in gradient domain. The second challenge is that actual\nphotographs often across different light conditions. We used image-based\nrendering technique to address this challenge and acquire rendering images and\ndepth data under different lighting conditions. To make a clear division of\nlabor in network modules, a two-scale architecture is proposed to create\nhigh-quality relief from a single photograph. Extensive experimental results on\na variety of scenes show that our method is a highly effective solution for\ngenerating digital 2.5D artwork from photographs.\n","authors":["Zhongping Ji","Feifei Che","Hanshuo Liu","Ziyi Zhao","Yu-Wei Zhang","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11364v1.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.11360v1","updated":"2023-07-21T05:26:32Z","published":"2023-07-21T05:26:32Z","title":"ParGANDA: Making Synthetic Pedestrians A Reality For Object Detection","summary":" Object detection is the key technique to a number of Computer Vision\napplications, but it often requires large amounts of annotated data to achieve\ndecent results. Moreover, for pedestrian detection specifically, the collected\ndata might contain some personally identifiable information (PII), which is\nhighly restricted in many countries. This label intensive and privacy\nconcerning task has recently led to an increasing interest in training the\ndetection models using synthetically generated pedestrian datasets collected\nwith a photo-realistic video game engine. The engine is able to generate\nunlimited amounts of data with precise and consistent annotations, which gives\npotential for significant gains in the real-world applications. However, the\nuse of synthetic data for training introduces a synthetic-to-real domain shift\naggravating the final performance. To close the gap between the real and\nsynthetic data, we propose to use a Generative Adversarial Network (GAN), which\nperformsparameterized unpaired image-to-image translation to generate more\nrealistic images. The key benefit of using the GAN is its intrinsic preference\nof low-level changes to geometric ones, which means annotations of a given\nsynthetic image remain accurate even after domain translation is performed thus\neliminating the need for labeling real data. 
We extensively experimented with\nthe proposed method using MOTSynth dataset to train and MOT17 and MOT20\ndetection datasets to test, with experimental results demonstrating the\neffectiveness of this method. Our approach not only produces visually plausible\nsamples but also does not require any labels of the real domain thus making it\napplicable to the variety of downstream tasks.\n","authors":["Daria Reshetova","Guanhang Wu","Marcel Puyat","Chunhui Gu","Huizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04541v2","updated":"2023-07-21T05:08:44Z","published":"2023-07-10T13:09:42Z","title":"Learning Large Margin Sparse Embeddings for Open Set Medical Diagnosis","summary":" Fueled by deep learning, computer-aided diagnosis achieves huge advances.\nHowever, out of controlled lab environments, algorithms could face multiple\nchallenges. Open set recognition (OSR), as an important one, states that\ncategories unseen in training could appear in testing. In medical fields, it\ncould derive from incompletely collected training datasets and the constantly\nemerging new or rare diseases. OSR requires an algorithm to not only correctly\nclassify known classes, but also recognize unknown classes and forward them to\nexperts for further diagnosis. To tackle OSR, we assume that known classes\ncould densely occupy small parts of the embedding space and the remaining\nsparse regions could be recognized as unknowns. Following it, we propose Open\nMargin Cosine Loss (OMCL) unifying two mechanisms. The former, called Margin\nLoss with Adaptive Scale (MLAS), introduces angular margin for reinforcing\nintra-class compactness and inter-class separability, together with an adaptive\nscaling factor to strengthen the generalization capacity. The latter, called\nOpen-Space Suppression (OSS), opens the classifier by recognizing sparse\nembedding space as unknowns using proposed feature space descriptors. Besides,\nsince medical OSR is still a nascent field, two publicly available benchmark\ndatasets are proposed for comparison. Extensive ablation studies and feature\nvisualization demonstrate the effectiveness of each design. Compared with\nstate-of-the-art methods, MLAS achieves superior performances, measured by ACC,\nAUROC, and OSCR.\n","authors":["Mingyuan Liu","Lu Xu","Jicong Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.04541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10698v2","updated":"2023-07-21T05:05:52Z","published":"2023-07-20T08:39:20Z","title":"Reverse Knowledge Distillation: Training a Large Model using a Small One\n for Retinal Image Matching on Limited Data","summary":" Retinal image matching plays a crucial role in monitoring disease progression\nand treatment response. However, datasets with matched keypoints between\ntemporally separated pairs of images are not available in abundance to train\ntransformer-based model. We propose a novel approach based on reverse knowledge\ndistillation to train large models with limited data while preventing\noverfitting. Firstly, we propose architectural modifications to a CNN-based\nsemi-supervised method called SuperRetina that help us improve its results on a\npublicly available dataset. Then, we train a computationally heavier model\nbased on a vision transformer encoder using the lighter CNN-based model, which\nis counter-intuitive in the field knowledge-distillation research where\ntraining lighter models based on heavier ones is the norm. 
Surprisingly, such\nreverse knowledge distillation improves generalization even further. Our\nexperiments suggest that high-dimensional fitting in representation space may\nprevent overfitting unlike training directly to match the final output. We also\nprovide a public dataset with annotations for retinal image keypoint detection\nand matching to help the research community develop algorithms for retinal\nimage applications.\n","authors":["Sahar Almahfouz Nasser","Nihar Gupte","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2307.10698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10829v2","updated":"2023-07-21T04:46:07Z","published":"2023-07-10T12:18:18Z","title":"Exact Diffusion Inversion via Bi-directional Integration Approximation","summary":" Recently, different methods have been proposed to address the inconsistency\nissue of DDIM inversion to enable image editing, such as EDICT\n\\cite{Wallace23EDICT} and Null-text inversion \\cite{Mokady23NullTestInv}.\nHowever, the above methods introduce considerable computational overhead. In\nthis paper, we propose a new technique, named \\emph{bi-directional integration\napproximation} (BDIA), to perform exact diffusion inversion with neglible\ncomputational overhead. Suppose we would like to estimate the next diffusion\nstate $\\boldsymbol{z}_{i-1}$ at timestep $t_i$ with the historical information\n$(i,\\boldsymbol{z}_i)$ and $(i+1,\\boldsymbol{z}_{i+1})$. We first obtain the\nestimated Gaussian noise $\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i)$, and\nthen apply the DDIM update procedure twice for approximating the ODE\nintegration over the next time-slot $[t_i, t_{i-1}]$ in the forward manner and\nthe previous time-slot $[t_i, t_{t+1}]$ in the backward manner. The DDIM step\nfor the previous time-slot is used to refine the integration approximation made\nearlier when computing $\\boldsymbol{z}_i$. One nice property with BDIA-DDIM is\nthat the update expression for $\\boldsymbol{z}_{i-1}$ is a linear combination\nof $(\\boldsymbol{z}_{i+1}, \\boldsymbol{z}_i,\n\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i))$. This allows for exact\nbackward computation of $\\boldsymbol{z}_{i+1}$ given $(\\boldsymbol{z}_i,\n\\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. Experiments\non both image reconstruction and image editing were conducted, confirming our\nstatement. BDIA can also be applied to improve the performance of other ODE\nsolvers in addition to DDIM. In our work, it is found that applying BDIA to the\nEDM sampling procedure produces slightly better FID score over CIFAR10.\n","authors":["Guoqiang Zhang","J. P. Lewis","W. Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2307.10829v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.11328"},{"id":"http://arxiv.org/abs/2307.11342v1","updated":"2023-07-21T04:15:02Z","published":"2023-07-21T04:15:02Z","title":"Tuning Pre-trained Model via Moment Probing","summary":" Recently, efficient fine-tuning of large-scale pre-trained models has\nattracted increasing research interests, where linear probing (LP) as a\nfundamental module is involved in exploiting the final representations for\ntask-dependent classification. However, most of the existing methods focus on\nhow to effectively introduce a few of learnable parameters, and little work\npays attention to the commonly used LP module. 
In this paper, we propose a\nnovel Moment Probing (MP) method to further explore the potential of LP.\nDistinguished from LP which builds a linear classification head based on the\nmean of final features (e.g., word tokens for ViT) or classification tokens,\nour MP performs a linear classifier on feature distribution, which provides the\nstronger representation ability by exploiting richer statistical information\ninherent in features. Specifically, we represent feature distribution by its\ncharacteristic function, which is efficiently approximated by using first- and\nsecond-order moments of features. Furthermore, we propose a multi-head\nconvolutional cross-covariance (MHC$^3$) to compute second-order moments in an\nefficient and effective manner. By considering that MP could affect feature\nlearning, we introduce a partially shared module to learn two recalibrating\nparameters (PSRP) for backbones based on MP, namely MP$_{+}$. Extensive\nexperiments on ten benchmarks using various models show that our MP\nsignificantly outperforms LP and is competitive with counterparts at less\ntraining cost, while our MP$_{+}$ achieves state-of-the-art performance.\n","authors":["Mingze Gao","Qilong Wang","Zhenyi Lin","Pengfei Zhu","Qinghua Hu","Jingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11342v1.pdf","comment":"Accepted to ICCV 2023; Project Page:\n https://github.com/mingzeG/Moment-Probing"},{"id":"http://arxiv.org/abs/2307.11336v1","updated":"2023-07-21T03:50:23Z","published":"2023-07-21T03:50:23Z","title":"Character Time-series Matching For Robust License Plate Recognition","summary":" Automatic License Plate Recognition (ALPR) is becoming a popular study area\nand is applied in many fields such as transportation or smart city. However,\nthere are still several limitations when applying many current methods to\npractical problems due to the variation in real-world situations such as light\nchanges, unclear License Plate (LP) characters, and image quality. Almost\nrecent ALPR algorithms process on a single frame, which reduces accuracy in\ncase of worse image quality. This paper presents methods to improve license\nplate recognition accuracy by tracking the license plate in multiple frames.\nFirst, the Adaptive License Plate Rotation algorithm is applied to correctly\nalign the detected license plate. Second, we propose a method called Character\nTime-series Matching to recognize license plate characters from many\nconsequence frames. The proposed method archives high performance in the\nUFPR-ALPR dataset which is \\boldmath$96.7\\%$ accuracy in real-time on RTX A5000\nGPU card. We also deploy the algorithm for the Vietnamese ALPR system. The\naccuracy for license plate detection and character recognition are 0.881 and\n0.979 $mAP^{test}$@.5 respectively. 
The source code is available at\nhttps://github.com/chequanghuy/Character-Time-series-Matching.git\n","authors":["Quang Huy Che","Tung Do Thanh","Cuong Truong Van"],"pdf_url":"https://arxiv.org/pdf/2307.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11335v1","updated":"2023-07-21T03:47:28Z","published":"2023-07-21T03:47:28Z","title":"Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural\n Radiance Fields","summary":" Despite the tremendous progress in neural radiance fields (NeRF), we still\nface a dilemma of the trade-off between quality and efficiency, e.g., MipNeRF\npresents fine-detailed and anti-aliased renderings but takes days for training,\nwhile Instant-ngp can accomplish the reconstruction in a few minutes but\nsuffers from blurring or aliasing when rendering at various distances or\nresolutions due to ignoring the sampling area. To this end, we propose a novel\nTri-Mip encoding that enables both instant reconstruction and anti-aliased\nhigh-fidelity rendering for neural radiance fields. The key is to factorize the\npre-filtered 3D feature spaces in three orthogonal mipmaps. In this way, we can\nefficiently perform 3D area sampling by taking advantage of 2D pre-filtered\nfeature maps, which significantly elevates the rendering quality without\nsacrificing efficiency. To cope with the novel Tri-Mip representation, we\npropose a cone-casting rendering technique to efficiently sample anti-aliased\n3D features with the Tri-Mip encoding considering both pixel imaging and\nobserving distance. Extensive experiments on both synthetic and real-world\ndatasets demonstrate our method achieves state-of-the-art rendering quality and\nreconstruction speed while maintaining a compact representation that reduces\n25% model size compared against Instant-ngp.\n","authors":["Wenbo Hu","Yuling Wang","Lin Ma","Bangbang Yang","Lin Gao","Xiao Liu","Yuewen Ma"],"pdf_url":"https://arxiv.org/pdf/2307.11335v1.pdf","comment":"Accepted to ICCV 2023 Project page:\n https://wbhu.github.io/projects/Tri-MipRF"},{"id":"http://arxiv.org/abs/2307.11334v1","updated":"2023-07-21T03:43:07Z","published":"2023-07-21T03:43:07Z","title":"Improving Transferability of Adversarial Examples via Bayesian Attacks","summary":" This paper presents a substantial extension of our work published at ICLR.\nOur ICLR work advocated for enhancing transferability in adversarial examples\nby incorporating a Bayesian formulation into model parameters, which\neffectively emulates the ensemble of infinitely many deep neural networks,\nwhile, in this paper, we introduce a novel extension by incorporating the\nBayesian formulation into the model input as well, enabling the joint\ndiversification of both the model input and model parameters. Our empirical\nfindings demonstrate that: 1) the combination of Bayesian formulations for both\nthe model input and model parameters yields significant improvements in\ntransferability; 2) by introducing advanced approximations of the posterior\ndistribution over the model input, adversarial transferability achieves further\nenhancement, surpassing all state-of-the-arts when attacking without model\nfine-tuning. Moreover, we propose a principled approach to fine-tune model\nparameters in such an extended Bayesian formulation. The derived optimization\nobjective inherently encourages flat minima in the parameter space and input\nspace. 
Extensive experiments demonstrate that our method achieves a new\nstate-of-the-art on transfer-based attacks, improving the average success rate\non ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with\nour ICLR basic Bayesian method. We will make our code publicly available.\n","authors":["Qizhang Li","Yiwen Guo","Xiaochen Yang","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11323v1","updated":"2023-07-21T03:08:28Z","published":"2023-07-21T03:08:28Z","title":"HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework","summary":" In the field of autonomous driving, 3D object detection is a very important\nperception module. Although the current SOTA algorithm combines Camera and\nLidar sensors, limited by the high price of Lidar, the current mainstream\nlanding schemes are pure Camera sensors or Camera+Radar sensors. In this study,\nwe propose a new detection algorithm called HVDetFusion, which is a multi-modal\ndetection algorithm that not only supports pure camera data as input for\ndetection, but also can perform fusion input of radar data and camera data. The\ncamera stream does not depend on the input of Radar data, thus addressing the\ndownside of previous methods. In the pure camera stream, we modify the\nframework of Bevdet4D for better perception and more efficient inference, and\nthis stream has the whole 3D detection output. Further, to incorporate the\nbenefits of Radar signals, we use the prior information of different object\npositions to filter the false positive information of the original radar data,\naccording to the positioning information and radial velocity information\nrecorded by the radar sensors to supplement and fuse the BEV features generated\nby the original camera data, and the effect is further improved in the process\nof fusion training. Finally, HVDetFusion achieves the new state-of-the-art\n67.4\\% NDS on the challenging nuScenes test set among all camera-radar 3D\nobject detectors. The code is available at\nhttps://github.com/HVXLab/HVDetFusion\n","authors":["Kai Lei","Zhan Chen","Shuman Jia","Xiaoteng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11317v1","updated":"2023-07-21T02:57:40Z","published":"2023-07-21T02:57:40Z","title":"XLDA: Linear Discriminant Analysis for Scaling Continual Learning to\n Extreme Classification at the Edge","summary":" Streaming Linear Discriminant Analysis (LDA) while proven in\nClass-incremental Learning deployments at the edge with limited classes (upto\n1000), has not been proven for deployment in extreme classification scenarios.\nIn this paper, we present: (a) XLDA, a framework for Class-IL in edge\ndeployment where LDA classifier is proven to be equivalent to FC layer\nincluding in extreme classification scenarios, and (b) optimizations to enable\nXLDA-based training and inference for edge deployment where there is a\nconstraint on available compute resources. 
We show up to 42x speed up using a\nbatched training approach and up to 5x inference speedup with nearest neighbor\nsearch on extreme datasets like AliProducts (50k classes) and Google Landmarks\nV2 (81k classes)\n","authors":["Karan Shah","Vishruth Veerendranath","Anushka Hebbar","Raghavendra Bhat"],"pdf_url":"https://arxiv.org/pdf/2307.11317v1.pdf","comment":"Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop"},{"id":"http://arxiv.org/abs/2307.11315v1","updated":"2023-07-21T02:47:18Z","published":"2023-07-21T02:47:18Z","title":"Generating Image-Specific Text Improves Fine-grained Image\n Classification","summary":" Recent vision-language models outperform vision-only models on many image\nclassification tasks. However, because of the absence of paired text/image\ndescriptions, it remains difficult to fine-tune these models for fine-grained\nimage classification. In this work, we propose a method, GIST, for generating\nimage-specific fine-grained text descriptions from image-only datasets, and\nshow that these text descriptions can be used to improve classification. Key\nparts of our method include 1. prompting a pretrained large language model with\ndomain-specific prompts to generate diverse fine-grained text descriptions for\neach class and 2. using a pretrained vision-language model to match each image\nto label-preserving text descriptions that capture relevant visual features in\nthe image. We demonstrate the utility of GIST by fine-tuning vision-language\nmodels on the image-and-generated-text pairs to learn an aligned\nvision-language representation space for improved classification. We evaluate\nour learned representation space in full-shot and few-shot scenarios across\nfour diverse fine-grained classification datasets, each from a different\ndomain. Our method achieves an average improvement of $4.1\\%$ in accuracy over\nCLIP linear probes and an average of $1.1\\%$ improvement in accuracy over the\nprevious state-of-the-art image-text classification method on the full-shot\ndatasets. Our method achieves similar improvements across few-shot regimes.\nCode is available at https://github.com/emu1729/GIST.\n","authors":["Emily Mu","Kathleen M. Lewis","Adrian V. Dalca","John Guttag"],"pdf_url":"https://arxiv.org/pdf/2307.11315v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2212.03434v5","updated":"2023-07-21T02:34:02Z","published":"2022-12-07T03:39:18Z","title":"Name Your Colour For the Task: Artificially Discover Colour Naming via\n Colour Quantisation Transformer","summary":" The long-standing theory that a colour-naming system evolves under dual\npressure of efficient communication and perceptual mechanism is supported by\nmore and more linguistic studies, including analysing four decades of\ndiachronic data from the Nafaanra language. This inspires us to explore whether\nmachine learning could evolve and discover a similar colour-naming system via\noptimising the communication efficiency represented by high-level recognition\nperformance. Here, we propose a novel colour quantisation transformer,\nCQFormer, that quantises colour space while maintaining the accuracy of machine\nrecognition on the quantised images. Given an RGB image, Annotation Branch maps\nit into an index map before generating the quantised image with a colour\npalette; meanwhile the Palette Branch utilises a key-point detection way to\nfind proper colours in the palette among the whole colour space. 
By interacting\nwith colour annotation, CQFormer is able to balance both the machine vision\naccuracy and colour perceptual structure such as distinct and stable colour\ndistribution for discovered colour system. Very interestingly, we even observe\nthe consistent evolution pattern between our artificial colour system and basic\ncolour terms across human languages. Besides, our colour quantisation method\nalso offers an efficient quantisation method that effectively compresses the\nimage storage while maintaining high performance in high-level recognition\ntasks such as classification and detection. Extensive experiments demonstrate\nthe superior performance of our method with extremely low bit-rate colours,\nshowing potential to integrate into quantisation network to quantities from\nimage to network activation. The source code is available at\nhttps://github.com/ryeocthiv/CQFormer\n","authors":["Shenghan Su","Lin Gu","Yue Yang","Zenghui Zhang","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2212.03434v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10769v3","updated":"2023-07-21T02:32:23Z","published":"2023-04-21T06:35:54Z","title":"Deep Multiview Clustering by Contrasting Cluster Assignments","summary":" Multiview clustering (MVC) aims to reveal the underlying structure of\nmultiview data by categorizing data samples into clusters. Deep learning-based\nmethods exhibit strong feature learning capabilities on large-scale datasets.\nFor most existing deep MVC methods, exploring the invariant representations of\nmultiple views is still an intractable problem. In this paper, we propose a\ncross-view contrastive learning (CVCL) method that learns view-invariant\nrepresentations and produces clustering results by contrasting the cluster\nassignments among multiple views. Specifically, we first employ deep\nautoencoders to extract view-dependent features in the pretraining stage. Then,\na cluster-level CVCL strategy is presented to explore consistent semantic label\ninformation among the multiple views in the fine-tuning stage. Thus, the\nproposed CVCL method is able to produce more discriminative cluster assignments\nby virtue of this learning strategy. Moreover, we provide a theoretical\nanalysis of soft cluster assignment alignment. Extensive experimental results\nobtained on several datasets demonstrate that the proposed CVCL method\noutperforms several state-of-the-art approaches.\n","authors":["Jie Chen","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2304.10769v3.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.11308v1","updated":"2023-07-21T02:28:54Z","published":"2023-07-21T02:28:54Z","title":"DPM-OT: A New Diffusion Probabilistic Model Based on Optimal Transport","summary":" Sampling from diffusion probabilistic models (DPMs) can be viewed as a\npiecewise distribution transformation, which generally requires hundreds or\nthousands of steps of the inverse diffusion trajectory to get a high-quality\nimage. Recent progress in designing fast samplers for DPMs achieves a trade-off\nbetween sampling speed and sample quality by knowledge distillation or\nadjusting the variance schedule or the denoising equation. However, it can't be\noptimal in both aspects and often suffer from mode mixture in short steps. 
To\ntackle this problem, we innovatively regard inverse diffusion as an optimal\ntransport (OT) problem between latents at different stages and propose the\nDPM-OT, a unified learning framework for fast DPMs with a direct expressway\nrepresented by OT map, which can generate high-quality samples within around 10\nfunction evaluations. By calculating the semi-discrete optimal transport map\nbetween the data latents and the white noise, we obtain an expressway from the\nprior distribution to the data distribution, while significantly alleviating\nthe problem of mode mixture. In addition, we give the error bound of the\nproposed method, which theoretically guarantees the stability of the algorithm.\nExtensive experiments validate the effectiveness and advantages of DPM-OT in\nterms of speed and quality (FID and mode mixture), thus representing an\nefficient solution for generative modeling. Source codes are available at\nhttps://github.com/cognaclee/DPM-OT\n","authors":["Zezeng Li","ShengHao Li","Zhanpeng Wang","Na Lei","Zhongxuan Luo","Xianfeng Gu"],"pdf_url":"https://arxiv.org/pdf/2307.11308v1.pdf","comment":"iccv2023 accepted"},{"id":"http://arxiv.org/abs/2301.06262v3","updated":"2023-07-21T02:28:28Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematical\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focuses on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlook challenges between current academic research and real-world\napplications. The project page is\nhttps://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v3.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2307.11307v1","updated":"2023-07-21T02:28:20Z","published":"2023-07-21T02:28:20Z","title":"EndoSurf: Neural Surface Reconstruction of Deformable Tissues with\n Stereo Endoscope Videos","summary":" Reconstructing soft tissues from stereo endoscope videos is an essential\nprerequisite for many medical applications. Previous methods struggle to\nproduce high-quality geometry and appearance due to their inadequate\nrepresentations of 3D scenes. To address this issue, we propose a novel\nneural-field-based method, called EndoSurf, which effectively learns to\nrepresent a deforming surface from an RGBD sequence. In EndoSurf, we model\nsurface dynamics, shape, and texture with three neural fields. 
First, 3D points\nare transformed from the observed space to the canonical space using the\ndeformation field. The signed distance function (SDF) field and radiance field\nthen predict their SDFs and colors, respectively, with which RGBD images can be\nsynthesized via differentiable volume rendering. We constrain the learned shape\nby tailoring multiple regularization strategies and disentangling geometry and\nappearance. Experiments on public endoscope datasets demonstrate that EndoSurf\nsignificantly outperforms existing solutions, particularly in reconstructing\nhigh-fidelity shapes. Code is available at\nhttps://github.com/Ruyi-Zha/endosurf.git.\n","authors":["Ruyi Zha","Xuelian Cheng","Hongdong Li","Mehrtash Harandi","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2307.11307v1.pdf","comment":"MICCAI 2023 (Early Accept); Ruyi Zha and Xuelian Cheng made equal\n contributions. Corresponding author: Ruyi Zha (ruyi.zha@gmail.com)"},{"id":"http://arxiv.org/abs/2307.10711v2","updated":"2023-07-21T02:06:41Z","published":"2023-07-20T09:06:21Z","title":"AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of\n Diffusion Probabilistic Models","summary":" Existing customization methods require access to multiple reference examples\nto align pre-trained diffusion probabilistic models (DPMs) with user-provided\nconcepts. This paper aims to address the challenge of DPM customization when\nthe only available supervision is a differentiable metric defined on the\ngenerated contents. Since the sampling procedure of DPMs involves recursive\ncalls to the denoising UNet, na\\\"ive gradient backpropagation requires storing\nthe intermediate states of all iterations, resulting in extremely high memory\nconsumption. To overcome this issue, we propose a novel method AdjointDPM,\nwhich first generates new samples from diffusion models by solving the\ncorresponding probability-flow ODEs. It then uses the adjoint sensitivity\nmethod to backpropagate the gradients of the loss to the models' parameters\n(including conditioning signals, network weights, and initial noises) by\nsolving another augmented ODE. To reduce numerical errors in both the forward\ngeneration and gradient backpropagation processes, we further reparameterize\nthe probability-flow ODE and augmented ODE as simple non-stiff ODEs using\nexponential integration. Finally, we demonstrate the effectiveness of\nAdjointDPM on three interesting tasks: converting visual effects into\nidentification text embeddings, finetuning DPMs for specific types of\nstylization, and optimizing initial noise to generate adversarial samples for\nsecurity auditing.\n","authors":["Jiachun Pan","Jun Hao Liew","Vincent Y. F. Tan","Jiashi Feng","Hanshu Yan"],"pdf_url":"https://arxiv.org/pdf/2307.10711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04973v2","updated":"2023-07-21T01:40:31Z","published":"2023-02-09T23:25:28Z","title":"Invariant Slot Attention: Object Discovery with Slot-Centric Reference\n Frames","summary":" Automatically discovering composable abstractions from raw perceptual data is\na long-standing challenge in machine learning. Recent slot-based neural\nnetworks that learn about objects in a self-supervised manner have made\nexciting progress in this direction. However, they typically fall short at\nadequately capturing spatial symmetries present in the visual world, which\nleads to sample inefficiency, such as when entangling object appearance and\npose. 
In this paper, we present a simple yet highly effective method for\nincorporating spatial symmetries via slot-centric reference frames. We\nincorporate equivariance to per-object pose transformations into the attention\nand generation mechanism of Slot Attention by translating, scaling, and\nrotating position encodings. These changes result in little computational\noverhead, are easy to implement, and can result in large gains in terms of data\nefficiency and overall improvements to object discovery. We evaluate our method\non a wide range of synthetic object discovery benchmarks namely CLEVR,\nTetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising\nimprovements on the challenging real-world Waymo Open dataset.\n","authors":["Ondrej Biza","Sjoerd van Steenkiste","Mehdi S. M. Sajjadi","Gamaleldin F. Elsayed","Aravindh Mahendran","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2302.04973v2.pdf","comment":"Accepted at ICML 2023. Project page: https://invariantsa.github.io/"},{"id":"http://arxiv.org/abs/2307.11285v1","updated":"2023-07-21T01:04:52Z","published":"2023-07-21T01:04:52Z","title":"MAS: Towards Resource-Efficient Federated Multiple-Task Learning","summary":" Federated learning (FL) is an emerging distributed machine learning method\nthat empowers in-situ model training on decentralized edge devices. However,\nmultiple simultaneous FL tasks could overload resource-constrained devices. In\nthis work, we propose the first FL system to effectively coordinate and train\nmultiple simultaneous FL tasks. We first formalize the problem of training\nsimultaneous FL tasks. Then, we present our new approach, MAS (Merge and\nSplit), to optimize the performance of training multiple simultaneous FL tasks.\nMAS starts by merging FL tasks into an all-in-one FL task with a multi-task\narchitecture. After training for a few rounds, MAS splits the all-in-one FL\ntask into two or more FL tasks by using the affinities among tasks measured\nduring the all-in-one training. It then continues training each split of FL\ntasks based on model parameters from the all-in-one training. Extensive\nexperiments demonstrate that MAS outperforms other methods while reducing\ntraining time by 2x and reducing energy consumption by 40%. We hope this work\nwill inspire the community to further study and optimize training simultaneous\nFL tasks.\n","authors":["Weiming Zhuang","Yonggang Wen","Lingjuan Lyu","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11285v1.pdf","comment":"ICCV'23. arXiv admin note: substantial text overlap with\n arXiv:2207.04202"},{"id":"http://arxiv.org/abs/2307.11274v1","updated":"2023-07-21T00:15:56Z","published":"2023-07-21T00:15:56Z","title":"Screening Mammography Breast Cancer Detection","summary":" Breast cancer is a leading cause of cancer-related deaths, but current\nprograms are expensive and prone to false positives, leading to unnecessary\nfollow-up and patient anxiety. This paper proposes a solution to automated\nbreast cancer detection, to improve the efficiency and accuracy of screening\nprograms. Different methodologies were tested against the RSNA dataset of\nradiographic breast images of roughly 20,000 female patients and yielded an\naverage validation case pF1 score of 0.56 across methods.\n","authors":["Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11274v1.pdf","comment":"Released @ Apr 2023. 
For associated project files, see\n https://github.com/chakrabortyde/rsna-breast-cancer"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.06576v3","updated":"2023-07-21T16:06:32Z","published":"2023-07-13T06:25:22Z","title":"Going Beyond Local: Global Graph-Enhanced Personalized News\n Recommendations","summary":" Precisely recommending candidate news articles to users has always been a\ncore challenge for personalized news recommendation systems. Most recent works\nprimarily focus on using advanced natural language processing techniques to\nextract semantic information from rich textual data, employing content-based\nmethods derived from local historical news. However, this approach lacks a\nglobal perspective, failing to account for users' hidden motivations and\nbehaviors beyond semantic information. To address this challenge, we propose a\nnovel model called GLORY (Global-LOcal news Recommendation sYstem), which\ncombines global representations learned from other users with local\nrepresentations to enhance personalized recommendation systems. We accomplish\nthis by constructing a Global-aware Historical News Encoder, which includes a\nglobal news graph and employs gated graph neural networks to enrich news\nrepresentations, thereby fusing historical news representations by a historical\nnews aggregator. Similarly, we extend this approach to a Global Candidate News\nEncoder, utilizing a global entity graph and a candidate news aggregator to\nenhance candidate news representation. Evaluation results on two public news\ndatasets demonstrate that our method outperforms existing approaches.\nFurthermore, our model offers more diverse recommendations.\n","authors":["Boming Yang","Dairui Liu","Toyotaro Suzumura","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2307.06576v3.pdf","comment":"10 pages, Recsys 2023"},{"id":"http://arxiv.org/abs/2307.11650v1","updated":"2023-07-21T15:28:47Z","published":"2023-07-21T15:28:47Z","title":"Alleviating the Long-Tail Problem in Conversational Recommender Systems","summary":" Conversational recommender systems (CRS) aim to provide the recommendation\nservice via natural language conversations. To develop an effective CRS,\nhigh-quality CRS datasets are very crucial. However, existing CRS datasets\nsuffer from the long-tail issue, \\ie a large proportion of items are rarely (or\neven never) mentioned in the conversations, which are called long-tail items.\nAs a result, the CRSs trained on these datasets tend to recommend frequent\nitems, and the diversity of the recommended items would be largely reduced,\nmaking users easier to get bored.\n To address this issue, this paper presents \\textbf{LOT-CRS}, a novel\nframework that focuses on simulating and utilizing a balanced CRS dataset (\\ie\ncovering all the items evenly) for improving \\textbf{LO}ng-\\textbf{T}ail\nrecommendation performance of CRSs. In our approach, we design two pre-training\ntasks to enhance the understanding of simulated conversation for long-tail\nitems, and adopt retrieval-augmented fine-tuning with label smoothness strategy\nto further improve the recommendation of long-tail items. 
Extensive experiments\non two public CRS datasets have demonstrated the effectiveness and\nextensibility of our approach, especially on long-tail recommendation.\n","authors":["Zhipeng Zhao","Kun Zhou","Xiaolei Wang","Wayne Xin Zhao","Fan Pan","Zhao Cao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2307.11650v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2307.11496v1","updated":"2023-07-21T11:04:20Z","published":"2023-07-21T11:04:20Z","title":"Identifying document similarity using a fast estimation of the\n Levenshtein Distance based on compression and signatures","summary":" Identifying document similarity has many applications, e.g., source code\nanalysis or plagiarism detection. However, identifying similarities is not\ntrivial and can be time complex. For instance, the Levenshtein Distance is a\ncommon metric to define the similarity between two documents but has quadratic\nruntime which makes it impractical for large documents where large starts with\na few hundred kilobytes. In this paper, we present a novel concept that allows\nestimating the Levenshtein Distance: the algorithm first compresses documents\nto signatures (similar to hash values) using a user-defined compression ratio.\nSignatures can then be compared against each other (some constrains apply)\nwhere the outcome is the estimated Levenshtein Distance. Our evaluation shows\npromising results in terms of runtime efficiency and accuracy. In addition, we\nintroduce a significance score allowing examiners to set a threshold and\nidentify related documents.\n","authors":["Peter Coates","Frank Breitinger"],"pdf_url":"https://arxiv.org/pdf/2307.11496v1.pdf","comment":"In: Proceedings of the Digital Forensics Research Conference Europe\n (DFRWS EU). 2022"},{"id":"http://arxiv.org/abs/2307.10617v2","updated":"2023-07-21T09:49:15Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. 
The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.02250v2","updated":"2023-07-21T07:46:03Z","published":"2023-06-04T03:46:45Z","title":"Large Language Model Augmented Narrative Driven Recommendations","summary":" Narrative-driven recommendation (NDR) presents an information access problem\nwhere users solicit recommendations with verbose descriptions of their\npreferences and context, for example, travelers soliciting recommendations for\npoints of interest while describing their likes/dislikes and travel\ncircumstances. These requests are increasingly important with the rise of\nnatural language-based conversational interfaces for search and recommendation\nsystems. However, NDR lacks abundant training data for models, and current\nplatforms commonly do not support these requests. Fortunately, classical\nuser-item interaction datasets contain rich textual data, e.g., reviews, which\noften describe user preferences and context - this may be used to bootstrap\ntraining for NDR models. In this work, we explore using large language models\n(LLMs) for data augmentation to train NDR models. We use LLMs for authoring\nsynthetic narrative queries from user-item interactions with few-shot prompting\nand train retrieval models for NDR on synthetic queries and user-item\ninteraction data. Our experiments demonstrate that this is an effective\nstrategy for training small-parameter retrieval models that outperform other\nretrieval and LLM baselines for narrative-driven recommendation.\n","authors":["Sheshera Mysore","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2306.02250v2.pdf","comment":"RecSys 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. 
Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11325v1","updated":"2023-07-21T03:23:17Z","published":"2023-07-21T03:23:17Z","title":"Analysis of Elephant Movement in Sub-Saharan Africa: Ecological,\n Climatic, and Conservation Perspectives","summary":" The interaction between elephants and their environment has profound\nimplications for both ecology and conservation strategies. This study presents\nan analytical approach to decipher the intricate patterns of elephant movement\nin Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal\nvariations and rainfall patterns. Despite the complexities surrounding these\ninfluential factors, our analysis provides a holistic view of elephant\nmigratory behavior in the context of the dynamic African landscape. Our\ncomprehensive approach enables us to predict the potential impact of these\necological determinants on elephant migration, a critical step in establishing\ninformed conservation strategies. This projection is particularly crucial given\nthe impacts of global climate change on seasonal and rainfall patterns, which\ncould substantially influence elephant movements in the future. The findings of\nour work aim to not only advance the understanding of movement ecology but also\nfoster a sustainable coexistence of humans and elephants in Sub-Saharan Africa.\nBy predicting potential elephant routes, our work can inform strategies to\nminimize human-elephant conflict, effectively manage land use, and enhance\nanti-poaching efforts. This research underscores the importance of integrating\nmovement ecology and climatic variables for effective wildlife management and\nconservation planning.\n","authors":["Matthew Hines","Gregory Glatzer","Shreya Ghosh","Prasenjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2307.11325v1.pdf","comment":"11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on\n Computing and Sustainable Societies (COMPASS 2023)"},{"id":"http://arxiv.org/abs/2307.10479v2","updated":"2023-07-21T19:24:10Z","published":"2023-07-19T22:20:06Z","title":"Fast Approximate Nearest Neighbor Search with a Dynamic Exploration\n Graph using Continuous Refinement","summary":" For approximate nearest neighbor search, graph-based algorithms have shown to\noffer the best trade-off between accuracy and search time. We propose the\nDynamic Exploration Graph (DEG) which significantly outperforms existing\nalgorithms in terms of search and exploration efficiency by combining two new\nideas: First, a single undirected even regular graph is incrementally built by\npartially replacing existing edges to integrate new vertices and to update old\nneighborhoods at the same time. Secondly, an edge optimization algorithm is\nused to continuously improve the quality of the graph. 
Combining this ongoing\nrefinement with the graph construction process leads to a well-organized graph\nstructure at all times, resulting in: (1) increased search efficiency, (2)\npredictable index size, (3) guaranteed connectivity and therefore reachability\nof all vertices, and (4) a dynamic graph structure. In addition we investigate\nhow well existing graph-based search systems can handle indexed queries where\nthe seed vertex of a search is the query itself. Such exploration tasks,\ndespite their good starting point, are not necessarily easy. High efficiency in\napproximate nearest neighbor search (ANNS) does not automatically imply good\nperformance in exploratory search. Extensive experiments show that our new\nDynamic Exploration Graph outperforms existing algorithms significantly for\nindexed and unindexed queries.\n","authors":["Nico Hezel","Kai Uwe Barthel","Konstantin Schall","Klaus Jung"],"pdf_url":"https://arxiv.org/pdf/2307.10479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11848v1","updated":"2023-07-21T18:35:24Z","published":"2023-07-21T18:35:24Z","title":"MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through\n Multi-Answer Open-Domain Question Answering","summary":" Check-worthy claim detection aims at providing plausible misinformation to\ndownstream fact-checking systems or human experts to check. This is a crucial\nstep toward accelerating the fact-checking process. Many efforts have been put\ninto how to identify check-worthy claims from a small scale of pre-collected\nclaims, but how to efficiently detect check-worthy claims directly from a\nlarge-scale information source, such as Twitter, remains underexplored. To fill\nthis gap, we introduce MythQA, a new multi-answer open-domain question\nanswering(QA) task that involves contradictory stance mining for query-based\nlarge-scale check-worthy claim detection. The idea behind this is that\ncontradictory claims are a strong indicator of misinformation that merits\nscrutiny by the appropriate authorities. To study this task, we construct\nTweetMythQA, an evaluation dataset containing 522 factoid multi-answer\nquestions based on controversial topics. Each question is annotated with\nmultiple answers. Moreover, we collect relevant tweets for each distinct\nanswer, then classify them into three categories: \"Supporting\", \"Refuting\", and\n\"Neutral\". In total, we annotated 5.3K tweets. Contradictory evidence is\ncollected for all answers in the dataset. Finally, we present a baseline system\nfor MythQA and evaluate existing NLP models for each system component using the\nTweetMythQA dataset. We provide initial benchmarks and identify key challenges\nfor future models to improve upon. Code and data are available at:\nhttps://github.com/TonyBY/Myth-QA\n","authors":["Yang Bai","Anthony Colas","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11848v1.pdf","comment":"Accepted by SIGIR 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.11749v1","updated":"2023-07-21T17:59:15Z","published":"2023-07-21T17:59:15Z","title":"Differentially Private Heavy Hitter Detection using Federated Analytics","summary":" In this work, we study practical heuristics to improve the performance of\nprefix-tree based algorithms for differentially private heavy hitter detection.\nOur model assumes each user has multiple data points and the goal is to learn\nas many of the most frequent data points as possible across all users' data\nwith aggregate and local differential privacy. 
We propose an adaptive\nhyperparameter tuning algorithm that improves the performance of the algorithm\nwhile satisfying computational, communication and privacy constraints. We\nexplore the impact of different data-selection schemes as well as the impact of\nintroducing deny lists during multiple runs of the algorithm. We test these\nimprovements using extensive experimentation on the Reddit\ndataset~\\cite{caldas2018leaf} on the task of learning the most frequent words.\n","authors":["Karan Chadha","Junye Chen","John Duchi","Vitaly Feldman","Hanieh Hashemi","Omid Javidbakht","Audra McMillan","Kunal Talwar"],"pdf_url":"https://arxiv.org/pdf/2307.11749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03327v2","updated":"2023-07-21T17:54:14Z","published":"2023-03-06T17:54:33Z","title":"Tight Bounds for $γ$-Regret via the Decision-Estimation Coefficient","summary":" In this work, we give a statistical characterization of the $\\gamma$-regret\nfor arbitrary structured bandit problems, the regret which arises when\ncomparing against a benchmark that is $\\gamma$ times the optimal solution. The\n$\\gamma$-regret emerges in structured bandit problems over a function class\n$\\mathcal{F}$ where finding an exact optimum of $f \\in \\mathcal{F}$ is\nintractable. Our characterization is given in terms of the $\\gamma$-DEC, a\nstatistical complexity parameter for the class $\\mathcal{F}$, which is a\nmodification of the constrained Decision-Estimation Coefficient (DEC) of Foster\net al., 2023 (and closely related to the original offset DEC of Foster et al.,\n2021). Our lower bound shows that the $\\gamma$-DEC is a fundamental limit for\nany model class $\\mathcal{F}$: for any algorithm, there exists some $f \\in\n\\mathcal{F}$ for which the $\\gamma$-regret of that algorithm scales (nearly)\nwith the $\\gamma$-DEC of $\\mathcal{F}$. We provide an upper bound showing that\nthere exists an algorithm attaining a nearly matching $\\gamma$-regret. Due to\nsignificant challenges in applying the prior results on the DEC to the\n$\\gamma$-regret case, both our lower and upper bounds require novel techniques\nand a new algorithm.\n","authors":["Margalit Glasgow","Alexander Rakhlin"],"pdf_url":"https://arxiv.org/pdf/2303.03327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11732v1","updated":"2023-07-21T17:45:28Z","published":"2023-07-21T17:45:28Z","title":"Advancing Ad Auction Realism: Practical Insights & Modeling Implications","summary":" This paper proposes a learning model of online ad auctions that allows for\nthe following four key realistic characteristics of contemporary online\nauctions: (1) ad slots can have different values and click-through rates\ndepending on users' search queries, (2) the number and identity of competing\nadvertisers are unobserved and change with each auction, (3) advertisers only\nreceive partial, aggregated feedback, and (4) payment rules are only partially\nspecified. We model advertisers as agents governed by an adversarial bandit\nalgorithm, independent of auction mechanism intricacies. Our objective is to\nsimulate the behavior of advertisers for counterfactual analysis, prediction,\nand inference purposes. Our findings reveal that, in such richer environments,\n\"soft floors\" can enhance key performance metrics even when bidders are drawn\nfrom the same population. 
We further demonstrate how to infer advertiser value\ndistributions from observed bids, thereby affirming the practical efficacy of\nour approach even in a more realistic auction setting.\n","authors":["Ming Chen","Sareh Nabi","Marciano Siniscalchi"],"pdf_url":"https://arxiv.org/pdf/2307.11732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11730v1","updated":"2023-07-21T17:43:50Z","published":"2023-07-21T17:43:50Z","title":"Mitigating Communications Threats in Decentralized Federated Learning\n through Moving Target Defense","summary":" The rise of Decentralized Federated Learning (DFL) has enabled the training\nof machine learning models across federated participants, fostering\ndecentralized model aggregation and reducing dependence on a server. However,\nthis approach introduces unique communication security challenges that have yet\nto be thoroughly addressed in the literature. These challenges primarily\noriginate from the decentralized nature of the aggregation process, the varied\nroles and responsibilities of the participants, and the absence of a central\nauthority to oversee and mitigate threats. Addressing these challenges, this\npaper first delineates a comprehensive threat model, highlighting the potential\nrisks of DFL communications. In response to these identified risks, this work\nintroduces a security module designed for DFL platforms to counter\ncommunication-based attacks. The module combines security techniques such as\nsymmetric and asymmetric encryption with Moving Target Defense (MTD)\ntechniques, including random neighbor selection and IP/port switching. The\nsecurity module is implemented in a DFL platform called Fedstellar, allowing\nthe deployment and monitoring of the federation. A DFL scenario has been\ndeployed, involving eight physical devices implementing three security\nconfigurations: (i) a baseline with no security, (ii) an encrypted\nconfiguration, and (iii) a configuration integrating both encryption and MTD\ntechniques. The effectiveness of the security module is validated through\nexperiments with the MNIST dataset and eclipse attacks. The results indicated\nan average F1 score of 95%, with moderate increases in CPU usage (up to 63.2%\n+-3.5%) and network traffic (230 MB +-15 MB) under the most secure\nconfiguration, mitigating the risks posed by eavesdropping or eclipse attacks.\n","authors":["Enrique Tomás Martínez Beltrán","Pedro Miguel Sánchez Sánchez","Sergio López Bernal","Gérôme Bovet","Manuel Gil Pérez","Gregorio Martínez Pérez","Alberto Huertas Celdrán"],"pdf_url":"https://arxiv.org/pdf/2307.11730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10870v2","updated":"2023-07-21T17:37:26Z","published":"2023-02-21T18:34:51Z","title":"On Provable Copyright Protection for Generative Models","summary":" There is a growing concern that learned conditional generative models may\noutput samples that are substantially similar to some copyrighted data $C$ that\nwas in their training set. We give a formal definition of $\\textit{near\naccess-freeness (NAF)}$ and prove bounds on the probability that a model\nsatisfying this definition outputs a sample similar to $C$, even if $C$ is\nincluded in its training set. Roughly speaking, a generative model $p$ is\n$\\textit{$k$-NAF}$ if for every potentially copyrighted data $C$, the output of\n$p$ diverges by at most $k$-bits from the output of a model $q$ that\n$\\textit{did not access $C$ at all}$. 
We also give generative model learning\nalgorithms, which efficiently modify the original generative model learning\nalgorithm in a black box manner, that output generative models with strong\nbounds on the probability of sampling protected content. Furthermore, we\nprovide promising experiments for both language (transformers) and image\n(diffusion) generative models, showing minimal degradation in output quality\nwhile ensuring strong protections against sampling protected content.\n","authors":["Nikhil Vyas","Sham Kakade","Boaz Barak"],"pdf_url":"https://arxiv.org/pdf/2302.10870v2.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2307.10496v2","updated":"2023-07-21T17:34:51Z","published":"2023-07-19T23:29:40Z","title":"A Competitive Learning Approach for Specialized Models: A Solution for\n Complex Physical Systems with Distinct Functional Regimes","summary":" Complex systems in science and engineering sometimes exhibit behavior that\nchanges across different regimes. Traditional global models struggle to capture\nthe full range of this complex behavior, limiting their ability to accurately\nrepresent the system. In response to this challenge, we propose a novel\ncompetitive learning approach for obtaining data-driven models of physical\nsystems. The primary idea behind the proposed approach is to employ dynamic\nloss functions for a set of models that are trained concurrently on the data.\nEach model competes for each observation during training, allowing for the\nidentification of distinct functional regimes within the dataset. To\ndemonstrate the effectiveness of the learning approach, we coupled it with\nvarious regression methods that employ gradient-based optimizers for training.\nThe proposed approach was tested on various problems involving model discovery\nand function approximation, demonstrating its ability to successfully identify\nfunctional regimes, discover true governing equations, and reduce test errors.\n","authors":["Okezzi F. Ukorigho","Opeoluwa Owoyele"],"pdf_url":"https://arxiv.org/pdf/2307.10496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08736v2","updated":"2023-07-21T17:21:57Z","published":"2022-12-16T22:18:48Z","title":"A Neural Network Warm-Start Approach for the Inverse Acoustic Obstacle\n Scattering Problem","summary":" We consider the inverse acoustic obstacle problem for sound-soft star-shaped\nobstacles in two dimensions wherein the boundary of the obstacle is determined\nfrom measurements of the scattered field at a collection of receivers outside\nthe object. One of the standard approaches for solving this problem is to\nreformulate it as an optimization problem: finding the boundary of the domain\nthat minimizes the $L^2$ distance between computed values of the scattered\nfield and the given measurement data. The optimization problem is\ncomputationally challenging since the local set of convexity shrinks with\nincreasing frequency and results in an increasing number of local minima in the\nvicinity of the true solution. In many practical experimental settings, low\nfrequency measurements are unavailable due to limitations of the experimental\nsetup or the sensors used for measurement. Thus, obtaining a good initial guess\nfor the optimization problem plays a vital role in this environment.\n We present a neural network warm-start approach for solving the inverse\nscattering problem, where an initial guess for the optimization problem is\nobtained using a trained neural network. 
We demonstrate the effectiveness of\nour method with several numerical examples. For high frequency problems, this\napproach outperforms traditional iterative methods such as Gauss-Newton\ninitialized without any prior (i.e., initialized using a unit circle), or\ninitialized using the solution of a direct method such as the linear sampling\nmethod. The algorithm remains robust to noise in the scattered field\nmeasurements and also converges to the true solution for limited aperture data.\nHowever, the number of training samples required to train the neural network\nscales exponentially in frequency and the complexity of the obstacles\nconsidered. We conclude with a discussion of this phenomenon and potential\ndirections for future research.\n","authors":["Mo Zhou","Jiequn Han","Manas Rachh","Carlos Borges"],"pdf_url":"https://arxiv.org/pdf/2212.08736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15471v3","updated":"2023-07-21T17:20:16Z","published":"2023-03-25T10:21:13Z","title":"Embedding Contextual Information through Reward Shaping in Multi-Agent\n Learning: A Case Study from Google Football","summary":" Artificial Intelligence has been used to help human complete difficult tasks\nin complicated environments by providing optimized strategies for\ndecision-making or replacing the manual labour. In environments including\nmultiple agents, such as football, the most common methods to train agents are\nImitation Learning and Multi-Agent Reinforcement Learning (MARL). However, the\nagents trained by Imitation Learning cannot outperform the expert demonstrator,\nwhich makes humans hardly get new insights from the learnt policy. Besides,\nMARL is prone to the credit assignment problem. In environments with sparse\nreward signal, this method can be inefficient. The objective of our research is\nto create a novel reward shaping method by embedding contextual information in\nreward function to solve the aforementioned challenges. We demonstrate this in\nthe Google Research Football (GRF) environment. We quantify the contextual\ninformation extracted from game state observation and use this quantification\ntogether with original sparse reward to create the shaped reward. The\nexperiment results in the GRF environment prove that our reward shaping method\nis a useful addition to state-of-the-art MARL algorithms for training agents in\nenvironments with sparse reward signal.\n","authors":["Chaoyi Gu","Varuna De Silva","Corentin Artaud","Rafael Pina"],"pdf_url":"https://arxiv.org/pdf/2303.15471v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11714v1","updated":"2023-07-21T17:19:01Z","published":"2023-07-21T17:19:01Z","title":"Convergence of SGD for Training Neural Networks with Sliced Wasserstein\n Losses","summary":" Optimal Transport has sparked vivid interest in recent years, in particular\nthanks to the Wasserstein distance, which provides a geometrically sensible and\nintuitive way of comparing probability measures. For computational reasons, the\nSliced Wasserstein (SW) distance was introduced as an alternative to the\nWasserstein distance, and has seen uses for training generative Neural Networks\n(NNs). While convergence of Stochastic Gradient Descent (SGD) has been observed\npractically in such a setting, there is to our knowledge no theoretical\nguarantee for this observation. Leveraging recent works on convergence of SGD\non non-smooth and non-convex functions by Bianchi et al. 
(2022), we aim to\nbridge that knowledge gap, and provide a realistic context under which\nfixed-step SGD trajectories for the SW loss on NN parameters converge. More\nprecisely, we show that the trajectories approach the set of (sub)-gradient\nflow equations as the step decreases. Under stricter assumptions, we show a\nmuch stronger convergence result for noised and projected SGD schemes, namely\nthat the long-run limits of the trajectories approach a set of generalised\ncritical points of the loss function.\n","authors":["Eloi Tanguy"],"pdf_url":"https://arxiv.org/pdf/2307.11714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11704v1","updated":"2023-07-21T17:00:06Z","published":"2023-07-21T17:00:06Z","title":"JoinGym: An Efficient Query Optimization Environment for Reinforcement\n Learning","summary":" In this paper, we present \\textsc{JoinGym}, an efficient and lightweight\nquery optimization environment for reinforcement learning (RL). Join order\nselection (JOS) is a classic NP-hard combinatorial optimization problem from\ndatabase query optimization and can serve as a practical testbed for the\ngeneralization capabilities of RL algorithms. We describe how to formulate each\nof the left-deep and bushy variants of the JOS problem as a Markov Decision\nProcess (MDP), and we provide an implementation adhering to the standard\nGymnasium API. We highlight that our implementation \\textsc{JoinGym} is\ncompletely based on offline traces of all possible joins, which enables RL\npractitioners to easily and quickly test their methods on a realistic data\nmanagement problem without needing to setup any systems. Moreover, we also\nprovide all possible join traces on $3300$ novel SQL queries generated from the\nIMDB dataset. Upon benchmarking popular RL algorithms, we find that at least\none method can obtain near-optimal performance on train-set queries but their\nperformance degrades by several orders of magnitude on test-set queries. This\ngap motivates further research for RL algorithms that generalize well in\nmulti-task combinatorial optimization problems.\n","authors":["Kaiwen Wang","Junxiong Wang","Yueying Li","Nathan Kallus","Immanuel Trummer","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2307.11704v1.pdf","comment":"We will make all the queries available soon"},{"id":"http://arxiv.org/abs/2307.10490v2","updated":"2023-07-21T16:51:15Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. 
We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11695v1","updated":"2023-07-21T16:50:10Z","published":"2023-07-21T16:50:10Z","title":"Using simulation to calibrate real data acquisition in veterinary\n medicine","summary":" This paper explores the innovative use of simulation environments to enhance\ndata acquisition and diagnostics in veterinary medicine, focusing specifically\non gait analysis in dogs. The study harnesses the power of Blender and the\nBlenderproc library to generate synthetic datasets that reflect diverse\nanatomical, environmental, and behavioral conditions. The generated data,\nrepresented in graph form and standardized for optimal analysis, is utilized to\ntrain machine learning algorithms for identifying normal and abnormal gaits.\nTwo distinct datasets with varying degrees of camera angle granularity are\ncreated to further investigate the influence of camera perspective on model\naccuracy. Preliminary results suggest that this simulation-based approach holds\npromise for advancing veterinary diagnostics by enabling more precise data\nacquisition and more effective machine learning models. By integrating\nsynthetic and real-world patient data, the study lays a robust foundation for\nimproving overall effectiveness and efficiency in veterinary medicine.\n","authors":["Krystian Strzałka","Szymon Mazurek","Maciej Wielgosz","Paweł Russek","Jakub Caputa","Daria Łukasik","Jan Krupiński","Jakub Grzeszczyk","Michał Karwatowski","Rafał Frączek","Ernest Jamro","Marcin Pietroń","Sebastian Koryciak","Agnieszka Dąbrowska-Boruch","Kazimierz Wiatr"],"pdf_url":"https://arxiv.org/pdf/2307.11695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11672v1","updated":"2023-07-21T16:18:58Z","published":"2023-07-21T16:18:58Z","title":"Fast Adaptive Test-Time Defense with Robust Features","summary":" Adaptive test-time defenses are used to improve the robustness of deep neural\nnetworks to adversarial examples. However, existing methods significantly\nincrease the inference time due to additional optimization on the model\nparameters or the input at test time. In this work, we propose a novel adaptive\ntest-time defense strategy that is easy to integrate with any existing (robust)\ntraining procedure without additional test-time computation. Based on the\nnotion of robustness of features that we present, the key idea is to project\nthe trained models to the most robust feature space, thereby reducing the\nvulnerability to adversarial attacks in non-robust directions. We theoretically\nshow that the top eigenspace of the feature matrix are more robust for a\ngeneralized additive model and support our argument for a large width neural\nnetwork with the Neural Tangent Kernel (NTK) equivalence. 
We conduct extensive\nexperiments on CIFAR-10 and CIFAR-100 datasets for several robustness\nbenchmarks, including the state-of-the-art methods in RobustBench, and observe\nthat the proposed method outperforms existing adaptive test-time defenses at\nmuch lower computation costs.\n","authors":["Anurag Singh","Mahalakshmi Sabanayagam","Krikamol Muandet","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2307.11672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17282v2","updated":"2023-07-21T16:15:21Z","published":"2023-05-26T22:01:47Z","title":"Universal consistency of the $k$-NN rule in metric spaces and Nagata\n dimension. II","summary":" We continue to investigate the $k$ nearest neighbour learning rule in\nseparable metric spaces. Thanks to the results of C\\'erou and Guyader (2006)\nand Preiss (1983), this rule is known to be universally consistent in every\nmetric space $X$ that is sigma-finite dimensional in the sense of Nagata. Here\nwe show that the rule is strongly universally consistent in such spaces in the\nabsence of ties. Under the tie-breaking strategy applied by Devroye,\nGy\\\"{o}rfi, Krzy\\.{z}ak, and Lugosi (1994) in the Euclidean setting, we manage\nto show the strong universal consistency in non-Archimedian metric spaces (that\nis, those of Nagata dimension zero). Combining the theorem of C\\'erou and\nGuyader with results of Assouad and Quentin de Gromard (2006), one deduces that\nthe $k$-NN rule is universally consistent in metric spaces having finite\ndimension in the sense of de Groot. In particular, the $k$-NN rule is\nuniversally consistent in the Heisenberg group which is not sigma-finite\ndimensional in the sense of Nagata as follows from an example independently\nconstructed by Kor\\'anyi and Reimann (1995) and Sawyer and Wheeden (1992).\n","authors":["Sushma Kumari","Vladimir G. Pestov"],"pdf_url":"https://arxiv.org/pdf/2305.17282v2.pdf","comment":"Latex 2e, 17 pages. The Heisenberg group is now presented in more\n detail, with some proofs and more references added, and a discussion of open\n problems added at the end"},{"id":"http://arxiv.org/abs/2307.11668v1","updated":"2023-07-21T16:12:46Z","published":"2023-07-21T16:12:46Z","title":"An Efficient Interior-Point Method for Online Convex Optimization","summary":" A new algorithm for regret minimization in online convex optimization is\ndescribed. The regret of the algorithm after $T$ time periods is $O(\\sqrt{T\n\\log T})$ - which is the minimum possible up to a logarithmic term. In\naddition, the new algorithm is adaptive, in the sense that the regret bounds\nhold not only for the time periods $1,\\ldots,T$ but also for every sub-interval\n$s,s+1,\\ldots,t$. 
The running time of the algorithm matches that of newly\nintroduced interior point algorithms for regret minimization: in\n$n$-dimensional space, during each iteration the new algorithm essentially\nsolves a system of linear equations of order $n$, rather than solving some\nconstrained convex optimization problem in $n$ dimensions and possibly many\nconstraints.\n","authors":["Elad Hazan","Nimrod Megiddo"],"pdf_url":"https://arxiv.org/pdf/2307.11668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.14778v2","updated":"2023-07-21T16:05:34Z","published":"2021-09-30T00:58:27Z","title":"CALDA: Improving Multi-Source Time Series Domain Adaptation with\n Contrastive Adversarial Learning","summary":" Unsupervised domain adaptation (UDA) provides a strategy for improving\nmachine learning performance in data-rich (target) domains where ground truth\nlabels are inaccessible but can be found in related (source) domains. In cases\nwhere meta-domain information such as label distributions is available, weak\nsupervision can further boost performance. We propose a novel framework, CALDA,\nto tackle these two problems. CALDA synergistically combines the principles of\ncontrastive learning and adversarial learning to robustly support multi-source\nUDA (MS-UDA) for time series data. Similar to prior methods, CALDA utilizes\nadversarial learning to align source and target feature representations. Unlike\nprior approaches, CALDA additionally leverages cross-source label information\nacross domains. CALDA pulls examples with the same label close to each other,\nwhile pushing apart examples with different labels, reshaping the space through\ncontrastive learning. Unlike prior contrastive adaptation methods, CALDA\nrequires neither data augmentation nor pseudo labeling, which may be more\nchallenging for time series. We empirically validate our proposed approach.\nBased on results from human activity recognition, electromyography, and\nsynthetic datasets, we find utilizing cross-source information improves\nperformance over prior time series and contrastive methods. Weak supervision\nfurther improves performance, even in the presence of noise, allowing CALDA to\noffer generalizable strategies for MS-UDA. Code is available at:\nhttps://github.com/floft/calda\n","authors":["Garrett Wilson","Janardhan Rao Doppa","Diane J. Cook"],"pdf_url":"https://arxiv.org/pdf/2109.14778v2.pdf","comment":"Accepted at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. 
We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11655v1","updated":"2023-07-21T15:43:32Z","published":"2023-07-21T15:43:32Z","title":"Bandits with Deterministically Evolving States","summary":" We propose a model for learning with bandit feedback while accounting for\ndeterministically evolving and unobservable states that we call Bandits with\nDeterministically Evolving States. The workhorse applications of our model are\nlearning for recommendation systems and learning for online ads. In both cases,\nthe reward that the algorithm obtains at each round is a function of the\nshort-term reward of the action chosen and how ``healthy'' the system is (i.e.,\nas measured by its state). For example, in recommendation systems, the reward\nthat the platform obtains from a user's engagement with a particular type of\ncontent depends not only on the inherent features of the specific content, but\nalso on how the user's preferences have evolved as a result of interacting with\nother types of content on the platform. Our general model accounts for the\ndifferent rate $\\lambda \\in [0,1]$ at which the state evolves (e.g., how fast a\nuser's preferences shift as a result of previous content consumption) and\nencompasses standard multi-armed bandits as a special case. The goal of the\nalgorithm is to minimize a notion of regret against the best-fixed sequence of\narms pulled. We analyze online learning algorithms for any possible\nparametrization of the evolution rate $\\lambda$. Specifically, the regret rates\nobtained are: for $\\lambda \\in [0, 1/T^2]$: $\\widetilde O(\\sqrt{KT})$; for\n$\\lambda = T^{-a/b}$ with $b < a < 2b$: $\\widetilde O (T^{b/a})$; for $\\lambda\n\\in (1/T, 1 - 1/\\sqrt{T}): \\widetilde O (K^{1/3}T^{2/3})$; and for $\\lambda \\in\n[1 - 1/\\sqrt{T}, 1]: \\widetilde O (K\\sqrt{T})$.\n","authors":["Khashayar Khosravi","Renato Paes Leme","Chara Podimata","Apostolis Tsorvantzis"],"pdf_url":"https://arxiv.org/pdf/2307.11655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09208v3","updated":"2023-07-21T15:27:34Z","published":"2022-05-18T20:34:25Z","title":"Torchhd: An Open Source Python Library to Support Research on\n Hyperdimensional Computing and Vector Symbolic Architectures","summary":" Hyperdimensional computing (HD), also known as vector symbolic architectures\n(VSA), is a framework for computing with distributed representations by\nexploiting properties of random high-dimensional vector spaces. The commitment\nof the scientific community to aggregate and disseminate research in this\nparticularly multidisciplinary area has been fundamental for its advancement.\nJoining these efforts, we present Torchhd, a high-performance open source\nPython library for HD/VSA. 
Torchhd seeks to make HD/VSA more accessible and\nserves as an efficient foundation for further research and application\ndevelopment. The easy-to-use library builds on top of PyTorch and features\nstate-of-the-art HD/VSA functionality, clear documentation, and implementation\nexamples from well-known publications. Comparing publicly available code with\ntheir corresponding Torchhd implementation shows that experiments can run up to\n100x faster. Torchhd is available at:\nhttps://github.com/hyperdimensional-computing/torchhd.\n","authors":["Mike Heddes","Igor Nunes","Pere Vergés","Denis Kleyko","Danny Abraham","Tony Givargis","Alexandru Nicolau","Alexander Veidenbaum"],"pdf_url":"https://arxiv.org/pdf/2205.09208v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08647v4","updated":"2023-07-21T14:59:16Z","published":"2023-02-17T01:32:44Z","title":"Multiresolution Graph Transformers and Wavelet Positional Encoding for\n Learning Hierarchical Structures","summary":" Contemporary graph learning algorithms are not well-defined for large\nmolecules since they do not consider the hierarchical interactions among the\natoms, which are essential to determine the molecular properties of\nmacromolecules. In this work, we propose Multiresolution Graph Transformers\n(MGT), the first graph transformer architecture that can learn to represent\nlarge molecules at multiple scales. MGT can learn to produce representations\nfor the atoms and group them into meaningful functional groups or repeating\nunits. We also introduce Wavelet Positional Encoding (WavePE), a new positional\nencoding method that can guarantee localization in both spectral and spatial\ndomains. Our proposed model achieves competitive results on two macromolecule\ndatasets consisting of polymers and peptides, and one drug-like molecule\ndataset. Importantly, our model outperforms other state-of-the-art methods and\nachieves chemical accuracy in estimating molecular properties (e.g., GAP, HOMO\nand LUMO) calculated by Density Functional Theory (DFT) in the polymers\ndataset. Furthermore, the visualizations, including clustering results on\nmacromolecules and low-dimensional spaces of their representations, demonstrate\nthe capability of our methodology in learning to represent long-range and\nhierarchical structures. Our PyTorch implementation is publicly available at\nhttps://github.com/HySonLab/Multires-Graph-Transformer\n","authors":["Nhat Khang Ngo","Truong Son Hy","Risi Kondor"],"pdf_url":"https://arxiv.org/pdf/2302.08647v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11629v1","updated":"2023-07-21T14:53:12Z","published":"2023-07-21T14:53:12Z","title":"Scalable Multi-agent Skill Discovery based on Kronecker Graphs","summary":" Covering skill (a.k.a., option) discovery has been developed to improve the\nexploration of RL in single-agent scenarios with sparse reward signals, through\nconnecting the most distant states in the embedding space provided by the\nFiedler vector of the state transition graph. Given that joint state space\ngrows exponentially with the number of agents in multi-agent systems, existing\nresearches still relying on single-agent option discovery either become\nprohibitive or fail to directly discover joint options that improve the\nconnectivity of the joint state space. In this paper, we show how to directly\ncompute multi-agent options with collaborative exploratory behaviors while\nstill enjoying the ease of decomposition. 
Our key idea is to approximate the\njoint state space as a Kronecker graph, based on which we can directly estimate\nits Fiedler vector using the Laplacian spectrum of individual agents'\ntransition graphs. Further, considering that directly computing the Laplacian\nspectrum is intractable for tasks with infinite-scale state spaces, we further\npropose a deep learning extension of our method by estimating eigenfunctions\nthrough NN-based representation learning techniques. The evaluation on\nmulti-agent tasks built with simulators like Mujoco, shows that the proposed\nalgorithm can successfully identify multi-agent options, and significantly\noutperforms the state-of-the-art. Codes are available at:\nhttps://github.itap.purdue.edu/Clan-labs/Scalable_MAOD_via_KP.\n","authors":["Jiayu Chen","Jingdi Chen","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2307.11629v1.pdf","comment":"Accepted to NeurIPS 2022. arXiv admin note: substantial text overlap\n with arXiv:2201.08227"},{"id":"http://arxiv.org/abs/2307.11620v1","updated":"2023-07-21T14:37:54Z","published":"2023-07-21T14:37:54Z","title":"Offline Multi-Agent Reinforcement Learning with Implicit Global-to-Local\n Value Regularization","summary":" Offline reinforcement learning (RL) has received considerable attention in\nrecent years due to its attractive capability of learning policies from offline\ndatasets without environmental interactions. Despite some success in the\nsingle-agent setting, offline multi-agent RL (MARL) remains to be a challenge.\nThe large joint state-action space and the coupled multi-agent behaviors pose\nextra complexities for offline policy optimization. Most existing offline MARL\nstudies simply apply offline data-related regularizations on individual agents,\nwithout fully considering the multi-agent system at the global level. In this\nwork, we present OMIGA, a new offline m ulti-agent RL algorithm with implicit\nglobal-to-local v alue regularization. OMIGA provides a principled framework to\nconvert global-level value regularization into equivalent implicit local value\nregularizations and simultaneously enables in-sample learning, thus elegantly\nbridging multi-agent value decomposition and policy learning with offline\nregularizations. Based on comprehensive experiments on the offline multi-agent\nMuJoCo and StarCraft II micro-management tasks, we show that OMIGA achieves\nsuperior performance over the state-of-the-art offline MARL methods in almost\nall tasks.\n","authors":["Xiangsen Wang","Haoran Xu","Yinan Zheng","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2307.11620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11617v1","updated":"2023-07-21T14:36:40Z","published":"2023-07-21T14:36:40Z","title":"Robust Fully-Asynchronous Methods for Distributed Training over General\n Architecture","summary":" Perfect synchronization in distributed machine learning problems is\ninefficient and even impossible due to the existence of latency, package losses\nand stragglers. We propose a Robust Fully-Asynchronous Stochastic Gradient\nTracking method (R-FAST), where each device performs local computation and\ncommunication at its own pace without any form of synchronization. 
Different\nfrom existing asynchronous distributed algorithms, R-FAST can eliminate the\nimpact of data heterogeneity across devices and allow for packet losses by\nemploying a robust gradient tracking strategy that relies on properly designed\nauxiliary variables for tracking and buffering the overall gradient vector.\nMore importantly, the proposed method utilizes two spanning-tree graphs for\ncommunication so long as both share at least one common root, enabling flexible\ndesigns in communication architectures. We show that R-FAST converges in\nexpectation to a neighborhood of the optimum with a geometric rate for smooth\nand strongly convex objectives; and to a stationary point with a sublinear rate\nfor general non-convex settings. Extensive experiments demonstrate that R-FAST\nruns 1.5-2 times faster than synchronous benchmark algorithms, such as\nRing-AllReduce and D-PSGD, while still achieving comparable accuracy, and\noutperforms existing asynchronous SOTA algorithms, such as AD-PSGD and OSGP,\nespecially in the presence of stragglers.\n","authors":["Zehan Zhu","Ye Tian","Yan Huang","Jinming Xu","Shibo He"],"pdf_url":"https://arxiv.org/pdf/2307.11617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11609v1","updated":"2023-07-21T14:25:22Z","published":"2023-07-21T14:25:22Z","title":"Persistent Ballistic Entanglement Spreading with Optimal Control in\n Quantum Spin Chains","summary":" Entanglement propagation provides a key routine to understand quantum\nmany-body dynamics in and out of equilibrium. In this work, we uncover that the\n``variational entanglement-enhancing'' field (VEEF) robustly induces a\npersistent ballistic spreading of entanglement in quantum spin chains. The VEEF\nis time dependent, and is optimally controlled to maximize the bipartite\nentanglement entropy (EE) of the final state. Such a linear growth persists\ntill the EE reaches the genuine saturation $\\tilde{S} = - \\log_{2}\n2^{-\\frac{N}{2}}=\\frac{N}{2}$ with $N$ the total number of spins. The EE\nsatisfies $S(t) = v t$ for the time $t \\leq \\frac{N}{2v}$, with $v$ the\nvelocity. These results are in sharp contrast with the behaviors without VEEF,\nwhere the EE generally approaches a sub-saturation known as the Page value\n$\\tilde{S}_{P} =\\tilde{S} - \\frac{1}{2\\ln{2}}$ in the long-time limit, and the\nentanglement growth deviates from being linear before the Page value is\nreached. The dependence between the velocity and interactions is explored, with\n$v \\simeq 2.76$, $4.98$, and $5.75$ for the spin chains with Ising, XY, and\nHeisenberg interactions, respectively. We further show that the nonlinear\ngrowth of EE emerges with the presence of long-range interactions.\n","authors":["Ying Lu","Pei Shi","Xiao-Han Wang","Jie Hu","Shi-Ju Ran"],"pdf_url":"https://arxiv.org/pdf/2307.11609v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.11608v1","updated":"2023-07-21T14:25:06Z","published":"2023-07-21T14:25:06Z","title":"Learning minimal representations of stochastic processes with\n variational autoencoders","summary":" Stochastic processes have found numerous applications in science, as they are\nbroadly used to model a variety of natural phenomena. Due to their intrinsic\nrandomness and uncertainty, they are however difficult to characterize. Here,\nwe introduce an unsupervised machine learning approach to determine the minimal\nset of parameters required to effectively describe the dynamics of a stochastic\nprocess. 
Our method builds upon an extended $\\beta$-variational autoencoder\narchitecture. By means of simulated datasets corresponding to paradigmatic\ndiffusion models, we showcase its effectiveness in extracting the minimal\nrelevant parameters that accurately describe these dynamics. Furthermore, the\nmethod enables the generation of new trajectories that faithfully replicate the\nexpected stochastic behavior. Overall, our approach enables for the autonomous\ndiscovery of unknown parameters describing stochastic processes, hence\nenhancing our comprehension of complex phenomena across various fields.\n","authors":["Gabriel Fernández-Fernández","Carlo Manzo","Maciej Lewenstein","Alexandre Dauphin","Gorka Muñoz-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.11608v1.pdf","comment":"9 pages, 5 figures, 1 table. Code available at\n https://github.com/GabrielFernandezFernandez/SPIVAE"},{"id":"http://arxiv.org/abs/2307.11607v1","updated":"2023-07-21T14:23:41Z","published":"2023-07-21T14:23:41Z","title":"Finding Optimal Diverse Feature Sets with Alternative Feature Selection","summary":" Feature selection is popular for obtaining small, interpretable, yet highly\naccurate prediction models. Conventional feature-selection methods typically\nyield one feature set only, which might not suffice in some scenarios. For\nexample, users might be interested in finding alternative feature sets with\nsimilar prediction quality, offering different explanations of the data. In\nthis article, we introduce alternative feature selection and formalize it as an\noptimization problem. In particular, we define alternatives via constraints and\nenable users to control the number and dissimilarity of alternatives. Next, we\nanalyze the complexity of this optimization problem and show NP-hardness.\nFurther, we discuss how to integrate conventional feature-selection methods as\nobjectives. Finally, we evaluate alternative feature selection with 30\nclassification datasets. We observe that alternative feature sets may indeed\nhave high prediction quality, and we analyze several factors influencing this\noutcome.\n","authors":["Jakob Bach"],"pdf_url":"https://arxiv.org/pdf/2307.11607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00211v2","updated":"2023-07-21T13:57:09Z","published":"2022-12-01T01:40:03Z","title":"A Unified Algorithm Framework for Unsupervised Discovery of Skills based\n on Determinantal Point Process","summary":" Learning rich skills through temporal abstractions without supervision of\nexternal rewards is at the frontier of Reinforcement Learning research.\nExisting works mainly fall into two distinctive categories: variational and\nLaplacian-based skill (a.k.a., option) discovery. The former maximizes the\ndiversity of the discovered options through a mutual information loss but\noverlooks coverage of the state space, while the latter focuses on improving\nthe coverage of options by increasing connectivity during exploration, but does\nnot consider diversity. In this paper, we propose a unified framework that\nquantifies diversity and coverage through a novel use of the Determinantal\nPoint Process (DPP) and enables unsupervised option discovery explicitly\noptimizing both objectives. Specifically, we define the DPP kernel matrix with\nthe Laplacian spectrum of the state transition graph and use the expected mode\nnumber in the trajectories as the objective to capture and enhance both\ndiversity and coverage of the learned options. 
The proposed option discovery\nalgorithm is extensively evaluated using challenging tasks built with Mujoco\nand Atari, demonstrating that our proposed algorithm substantially outperforms\nSOTA baselines from both diversity- and coverage-driven categories. The codes\nare available at https://github.com/LucasCJYSDL/ODPP.\n","authors":["Jiayu Chen","Vaneet Aggarwal","Tian Lan"],"pdf_url":"https://arxiv.org/pdf/2212.00211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11588v1","updated":"2023-07-21T13:51:45Z","published":"2023-07-21T13:51:45Z","title":"Transferability of Convolutional Neural Networks in Stationary Learning\n Tasks","summary":" Recent advances in hardware and big data acquisition have accelerated the\ndevelopment of deep learning techniques. For an extended period of time,\nincreasing the model complexity has led to performance improvements for various\ntasks. However, this trend is becoming unsustainable and there is a need for\nalternative, computationally lighter methods. In this paper, we introduce a\nnovel framework for efficient training of convolutional neural networks (CNNs)\nfor large-scale spatial problems. To accomplish this we investigate the\nproperties of CNNs for tasks where the underlying signals are stationary. We\nshow that a CNN trained on small windows of such signals achieves a nearly\nperformance on much larger windows without retraining. This claim is supported\nby our theoretical analysis, which provides a bound on the performance\ndegradation. Additionally, we conduct thorough experimental analysis on two\ntasks: multi-target tracking and mobile infrastructure on demand. Our results\nshow that the CNN is able to tackle problems with many hundreds of agents after\nbeing trained with fewer than ten. Thus, CNN architectures provide solutions to\nthese problems at previously computationally intractable scales.\n","authors":["Damian Owerko","Charilaos I. Kanatsoulis","Jennifer Bondarchuk","Donald J. Bucci Jr","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2307.11588v1.pdf","comment":"14 pages, 7 figures, for associated code see\n https://github.com/damowerko/mtt"},{"id":"http://arxiv.org/abs/2307.11584v1","updated":"2023-07-21T13:48:11Z","published":"2023-07-21T13:48:11Z","title":"A Change of Heart: Improving Speech Emotion Recognition through\n Speech-to-Text Modality Conversion","summary":" Speech Emotion Recognition (SER) is a challenging task. In this paper, we\nintroduce a modality conversion concept aimed at enhancing emotion recognition\nperformance on the MELD dataset. We assess our approach through two\nexperiments: first, a method named Modality-Conversion that employs automatic\nspeech recognition (ASR) systems, followed by a text classifier; second, we\nassume perfect ASR output and investigate the impact of modality conversion on\nSER, this method is called Modality-Conversion++. Our findings indicate that\nthe first method yields substantial results, while the second method\noutperforms state-of-the-art (SOTA) speech-based approaches in terms of SER\nweighted-F1 (WF1) score on the MELD dataset. 
This research highlights the\npotential of modality conversion for tasks that can be conducted in alternative\nmodalities.\n","authors":["Zeinab Sadat Taghavi","Ali Satvaty","Hossein Sameti"],"pdf_url":"https://arxiv.org/pdf/2307.11584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.08227v3","updated":"2023-07-21T13:42:59Z","published":"2022-01-20T15:33:08Z","title":"Learning Multi-agent Skills for Tabular Reinforcement Learning using\n Factor Graphs","summary":" Covering skill (a.k.a., option) discovery has been developed to improve the\nexploration of reinforcement learning in single-agent scenarios with sparse\nreward signals, through connecting the most distant states in the embedding\nspace provided by the Fiedler vector of the state transition graph. However,\nthese option discovery methods cannot be directly extended to multi-agent\nscenarios, since the joint state space grows exponentially with the number of\nagents in the system. Thus, existing researches on adopting options in\nmulti-agent scenarios still rely on single-agent option discovery and fail to\ndirectly discover the joint options that can improve the connectivity of the\njoint state space of agents. In this paper, we show that it is indeed possible\nto directly compute multi-agent options with collaborative exploratory\nbehaviors among the agents, while still enjoying the ease of decomposition. Our\nkey idea is to approximate the joint state space as a Kronecker graph -- the\nKronecker product of individual agents' state transition graphs, based on which\nwe can directly estimate the Fiedler vector of the joint state space using the\nLaplacian spectrum of individual agents' transition graphs. This decomposition\nenables us to efficiently construct multi-agent joint options by encouraging\nagents to connect the sub-goal joint states which are corresponding to the\nminimum or maximum values of the estimated joint Fiedler vector. The evaluation\nbased on multi-agent collaborative tasks shows that the proposed algorithm can\nsuccessfully identify multi-agent options, and significantly outperforms prior\nworks using single-agent options or no options, in terms of both faster\nexploration and higher cumulative rewards.\n","authors":["Jiayu Chen","Jingdi Chen","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2201.08227v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03269v2","updated":"2023-07-21T13:27:13Z","published":"2022-10-07T00:40:59Z","title":"Multi-agent Deep Covering Skill Discovery","summary":" The use of skills (a.k.a., options) can greatly accelerate exploration in\nreinforcement learning, especially when only sparse reward signals are\navailable. While option discovery methods have been proposed for individual\nagents, in multi-agent reinforcement learning settings, discovering\ncollaborative options that can coordinate the behavior of multiple agents and\nencourage them to visit the under-explored regions of their joint state space\nhas not been considered. In this case, we propose Multi-agent Deep Covering\nOption Discovery, which constructs the multi-agent options through minimizing\nthe expected cover time of the multiple agents' joint state space. Also, we\npropose a novel framework to adopt the multi-agent options in the MARL process.\nIn practice, a multi-agent task can usually be divided into some sub-tasks,\neach of which can be completed by a sub-group of the agents. 
Therefore, our\nalgorithm framework first leverages an attention mechanism to find\ncollaborative agent sub-groups that would benefit most from coordinated\nactions. Then, a hierarchical algorithm, namely HA-MSAC, is developed to learn\nthe multi-agent options for each sub-group to complete their sub-tasks first,\nand then to integrate them through a high-level policy as the solution of the\nwhole task. This hierarchical option construction allows our framework to\nstrike a balance between scalability and effective collaboration among the\nagents. The evaluation based on multi-agent collaborative tasks shows that the\nproposed algorithm can effectively capture the agent interactions with the\nattention mechanism, successfully identify multi-agent options, and\nsignificantly outperforms prior works using single-agent options or no options,\nin terms of both faster exploration and higher task rewards.\n","authors":["Jiayu Chen","Marina Haliem","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2210.03269v2.pdf","comment":"This paper was presented in part at the ICML Reinforcement Learning\n for Real Life Workshop, July 2021"},{"id":"http://arxiv.org/abs/2305.18453v2","updated":"2023-07-21T13:26:21Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Medical Image Synthesis","summary":" The demand for artificial intelligence (AI) in healthcare is rapidly\nincreasing. However, significant challenges arise from data scarcity and\nprivacy concerns, particularly in medical imaging. While existing generative\nmodels have achieved success in image synthesis and image-to-image translation\ntasks, there remains a gap in the generation of 3D semantic medical images. To\naddress this gap, we introduce Med-DDPM, a diffusion model specifically\ndesigned for semantic 3D medical image synthesis, effectively tackling data\nscarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation\nof semantic conditioning, enabling precise control during the image generation\nprocess. Our model outperforms Generative Adversarial Networks (GANs) in terms\nof stability and performance, generating diverse and anatomically coherent\nimages with high visual fidelity. Comparative analysis against state-of-the-art\naugmentation techniques demonstrates that Med-DDPM produces comparable results,\nhighlighting its potential as a data augmentation tool for enhancing model\naccuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis\nby delivering high-quality and anatomically coherent images. Furthermore, the\nintegration of semantic conditioning with Med-DDPM holds promise for image\nanonymization in the field of biomedical imaging, showcasing the capabilities\nof the model in addressing challenges related to data scarcity and privacy\nconcerns.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11565v1","updated":"2023-07-21T13:17:22Z","published":"2023-07-21T13:17:22Z","title":"FMT: Removing Backdoor Feature Maps via Feature Map Testing in Deep\n Neural Networks","summary":" Deep neural networks have been widely used in many critical applications,\nsuch as autonomous vehicles and medical diagnosis. However, their security is\nthreatened by backdoor attack, which is achieved by adding artificial patterns\nto specific training data. 
Existing defense strategies primarily focus on using\nreverse engineering to reproduce the backdoor trigger generated by attackers\nand subsequently repair the DNN model by adding the trigger into inputs and\nfine-tuning the model with ground-truth labels. However, once the trigger\ngenerated by the attackers is complex and invisible, the defender can not\nsuccessfully reproduce the trigger. Consequently, the DNN model will not be\nrepaired since the trigger is not effectively removed.\n In this work, we propose Feature Map Testing~(FMT). Different from existing\ndefense strategies, which focus on reproducing backdoor triggers, FMT tries to\ndetect the backdoor feature maps, which are trained to extract backdoor\ninformation from the inputs. After detecting these backdoor feature maps, FMT\nwill erase them and then fine-tune the model with a secure subset of training\ndata. Our experiments demonstrate that, compared to existing defense\nstrategies, FMT can effectively reduce the Attack Success Rate (ASR) even\nagainst the most complex and invisible attack triggers. Second, unlike\nconventional defense methods that tend to exhibit low Robust Accuracy (i.e.,\nthe model's accuracy on the poisoned data), FMT achieves higher RA, indicating\nits superiority in maintaining model performance while mitigating the effects\nof backdoor attacks~(e.g., FMT obtains 87.40\\% RA in CIFAR10). Third, compared\nto existing feature map pruning techniques, FMT can cover more backdoor feature\nmaps~(e.g., FMT removes 83.33\\% of backdoor feature maps from the model in the\nCIFAR10 \\& BadNet scenario).\n","authors":["Dong Huang","Qingwen Bu","Yahao Qing","Yichao Fu","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.11565v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2301.09559v2","updated":"2023-07-21T13:13:01Z","published":"2023-01-23T17:20:25Z","title":"SpArX: Sparse Argumentative Explanations for Neural Networks","summary":" Neural networks (NNs) have various applications in AI, but explaining their\ndecisions remains challenging. Existing approaches often focus on explaining\nhow changing individual inputs affects NNs' outputs. However, an explanation\nthat is consistent with the input-output behaviour of an NN is not necessarily\nfaithful to the actual mechanics thereof. In this paper, we exploit\nrelationships between multi-layer perceptrons (MLPs) and quantitative\nargumentation frameworks (QAFs) to create argumentative explanations for the\nmechanics of MLPs. Our SpArX method first sparsifies the MLP while maintaining\nas much of the original structure as possible. It then translates the sparse\nMLP into an equivalent QAF to shed light on the underlying decision process of\nthe MLP, producing global and/or local explanations. 
We demonstrate\nexperimentally that SpArX can give more faithful explanations than existing\napproaches, while simultaneously providing deeper insights into the actual\nreasoning process of MLPs.\n","authors":["Hamed Ayoobi","Nico Potyka","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2301.09559v2.pdf","comment":"Accepted at the European Conference on Artificial Intelligence (ECAI)\n 2023 Conference"},{"id":"http://arxiv.org/abs/2307.11552v1","updated":"2023-07-21T12:58:03Z","published":"2023-07-21T12:58:03Z","title":"A multi-modal representation of El Niño Southern Oscillation Diversity","summary":" The El Ni\\~no-Southern Oscillation (ENSO) is characterized by alternating\nperiods of warm (El Ni\\~no) and cold (La Ni\\~na) sea surface temperature\nanomalies (SSTA) in the equatorial Pacific. Although El Ni\\~no and La Ni\\~na\nare well-defined climate patterns, no two events are alike. To date, ENSO\ndiversity has been described primarily in terms of the longitudinal location of\npeak SSTA, used to define a bimodal classification of events in Eastern Pacific\n(EP) and Central Pacific (CP) types. Here, we use low-dimensional\nrepresentations of Pacific SSTAs to argue that binary categorical memberships\nare unsuitable to describe ENSO events. Using fuzzy unsupervised clustering, we\nrecover the four known ENSO categories, along with a fifth category: an Extreme\nEl Ni\\~no. We show that Extreme El Ni\\~nos differ both in their intensity and\ntemporal evolution from canonical EP El Ni\\~nos. We also find that CP La\nNi\\~nas, EP El Ni\\~nos, and Extreme El Ni\\~nos contribute the most to\ninterdecadal ENSO variability.\n","authors":["Jakob Schlör","Felix Strnad","Antonietta Capotondi","Bedartha Goswami"],"pdf_url":"https://arxiv.org/pdf/2307.11552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11546v1","updated":"2023-07-21T12:47:28Z","published":"2023-07-21T12:47:28Z","title":"Towards practical reinforcement learning for tokamak magnetic control","summary":" Reinforcement learning (RL) has shown promising results for real-time control\nsystems, including the domain of plasma magnetic control. However, there are\nstill significant drawbacks compared to traditional feedback control approaches\nfor magnetic confinement. In this work, we address key drawbacks of the RL\nmethod; achieving higher control accuracy for desired plasma properties,\nreducing the steady-state error, and decreasing the required time to learn new\ntasks. We build on top of \\cite{degrave2022magnetic}, and present algorithmic\nimprovements to the agent architecture and training procedure. We present\nsimulation results that show up to 65\\% improvement in shape accuracy, achieve\nsubstantial reduction in the long-term bias of the plasma current, and\nadditionally reduce the training time required to learn new tasks by a factor\nof 3 or more. We present new experiments using the upgraded RL-based\ncontrollers on the TCV tokamak, which validate the simulation results achieved,\nand point the way towards routinely achieving accurate discharges using the RL\napproach.\n","authors":["Brendan D. Tracey","Andrea Michi","Yuri Chervonyi","Ian Davies","Cosmin Paduraru","Nevena Lazic","Federico Felici","Timo Ewalds","Craig Donner","Cristian Galperti","Jonas Buchli","Michael Neunert","Andrea Huber","Jonathan Evens","Paula Kurylowicz","Daniel J. 
Mankowitz","Martin Riedmiller","The TCV Team"],"pdf_url":"https://arxiv.org/pdf/2307.11546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11532v1","updated":"2023-07-21T12:26:42Z","published":"2023-07-21T12:26:42Z","title":"Training Latency Minimization for Model-Splitting Allowed Federated Edge\n Learning","summary":" To alleviate the shortage of computing power faced by clients in training\ndeep neural networks (DNNs) using federated learning (FL), we leverage the edge\ncomputing and split learning to propose a model-splitting allowed FL (SFL)\nframework, with the aim to minimize the training latency without loss of test\naccuracy. Under the synchronized global update setting, the latency to complete\na round of global training is determined by the maximum latency for the clients\nto complete a local training session. Therefore, the training latency\nminimization problem (TLMP) is modelled as a minimizing-maximum problem. To\nsolve this mixed integer nonlinear programming problem, we first propose a\nregression method to fit the quantitative-relationship between the cut-layer\nand other parameters of an AI-model, and thus, transform the TLMP into a\ncontinuous problem. Considering that the two subproblems involved in the TLMP,\nnamely, the cut-layer selection problem for the clients and the computing\nresource allocation problem for the parameter-server are relative independence,\nan alternate-optimization-based algorithm with polynomial time complexity is\ndeveloped to obtain a high-quality solution to the TLMP. Extensive experiments\nare performed on a popular DNN-model EfficientNetV2 using dataset MNIST, and\nthe results verify the validity and improved performance of the proposed SFL\nframework.\n","authors":["Yao Wen","Guopeng Zhang","Kezhi Wang","Kun Yang"],"pdf_url":"https://arxiv.org/pdf/2307.11532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01639v2","updated":"2023-07-21T12:16:41Z","published":"2023-06-02T15:59:47Z","title":"Reduction of finite sampling noise in quantum neural networks","summary":" Quantum neural networks (QNNs) use parameterized quantum circuits with\ndata-dependent inputs and generate outputs through the evaluation of\nexpectation values. Calculating these expectation values necessitates repeated\ncircuit evaluations, thus introducing fundamental finite-sampling noise even on\nerror-free quantum computers. We reduce this noise by introducing the variance\nregularization, a technique for reducing the variance of the expectation value\nduring the quantum model training. This technique requires no additional\ncircuit evaluations if the QNN is properly constructed. Our empirical findings\ndemonstrate the reduced variance speeds up the training and lowers the output\nnoise as well as decreases the number of necessary evaluations of gradient\ncircuits. This regularization method is benchmarked on the regression of\nmultiple functions. We show that in our examples, it lowers the variance by an\norder of magnitude on average and leads to a significantly reduced noise level\nof the QNN. We finally demonstrate QNN training on a real quantum device and\nevaluate the impact of error mitigation. Here, the optimization is feasible\nonly due to the reduced number of necessary shots in the gradient evaluation\nresulting from the reduced variance.\n","authors":["David A. 
Kreplin","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2306.01639v2.pdf","comment":"11 pages, 10 figures; refined section 5"},{"id":"http://arxiv.org/abs/2306.07308v3","updated":"2023-07-21T11:52:28Z","published":"2023-06-12T13:48:37Z","title":"Self-Supervised Hyperspectral Inpainting with the Optimisation inspired\n Deep Neural Network Prior","summary":" Hyperspectral Image (HSI)s cover hundreds or thousands of narrow spectral\nbands, conveying a wealth of spatial and spectral information. However, due to\nthe instrumental errors and the atmospheric changes, the HSI obtained in\npractice are often contaminated by noise and dead pixels(lines), resulting in\nmissing information that may severely compromise the subsequent applications.\nWe introduce here a novel HSI missing pixel prediction algorithm, called Low\nRank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP\nis able to predict missing pixels and bands even when all spectral bands of the\nimage are missing. The proposed LRS-PnP algorithm is further extended to a\nself-supervised model by combining the LRS-PnP with the Deep Image Prior (DIP),\ncalled LRS-PnP-DIP. In a series of experiments with real data, It is shown that\nthe LRS-PnP-DIP either achieves state-of-the-art inpainting performance\ncompared to other learning-based methods, or outperforms them.\n","authors":["Shuo Li","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2306.07308v3.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2303.06067v2","updated":"2023-07-21T11:40:45Z","published":"2023-03-10T16:48:54Z","title":"Modeling Events and Interactions through Temporal Processes -- A Survey","summary":" In real-world scenario, many phenomena produce a collection of events that\noccur in continuous time. Point Processes provide a natural mathematical\nframework for modeling these sequences of events. In this survey, we\ninvestigate probabilistic models for modeling event sequences through temporal\nprocesses. We revise the notion of event modeling and provide the mathematical\nfoundations that characterize the literature on the topic. We define an\nontology to categorize the existing approaches in terms of three families:\nsimple, marked, and spatio-temporal point processes. For each family, we\nsystematically review the existing approaches based based on deep learning.\nFinally, we analyze the scenarios where the proposed techniques can be used for\naddressing prediction and modeling aspects.\n","authors":["Angelica Liguori","Luciano Caroprese","Marco Minici","Bruno Veloso","Francesco Spinnato","Mirco Nanni","Giuseppe Manco","Joao Gama"],"pdf_url":"https://arxiv.org/pdf/2303.06067v2.pdf","comment":"Image replacements"},{"id":"http://arxiv.org/abs/2304.14118v2","updated":"2023-07-21T11:36:40Z","published":"2023-04-27T12:05:34Z","title":"Learning Neural PDE Solvers with Parameter-Guided Channel Attention","summary":" Scientific Machine Learning (SciML) is concerned with the development of\nlearned emulators of physical systems governed by partial differential\nequations (PDE). In application domains such as weather forecasting, molecular\ndynamics, and inverse design, ML-based surrogate models are increasingly used\nto augment or replace inefficient and often non-differentiable numerical\nsimulation algorithms. 
While a number of ML-based methods for approximating the\nsolutions of PDEs have been proposed in recent years, they typically do not\nadapt to the parameters of the PDEs, making it difficult to generalize to PDE\nparameters not seen during training. We propose a Channel Attention mechanism\nguided by PDE Parameter Embeddings (CAPE) component for neural surrogate models\nand a simple yet effective curriculum learning strategy. The CAPE module can be\ncombined with neural PDE solvers allowing them to adapt to unseen PDE\nparameters. The curriculum learning strategy provides a seamless transition\nbetween teacher-forcing and fully auto-regressive training. We compare CAPE in\nconjunction with the curriculum learning strategy using a popular PDE benchmark\nand obtain consistent and significant improvements over the baseline models.\nThe experiments also show several advantages of CAPE, such as its increased\nability to generalize to unseen PDE parameters without large increases\ninference time and parameter count.\n","authors":["Makoto Takamoto","Francesco Alesiani","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2304.14118v2.pdf","comment":"accepted for publication in ICML2023"},{"id":"http://arxiv.org/abs/2306.00988v2","updated":"2023-07-21T11:27:10Z","published":"2023-06-01T17:59:57Z","title":"Continual Learning for Abdominal Multi-Organ and Tumor Segmentation","summary":" The ability to dynamically extend a model to new data and classes is critical\nfor multiple organ and tumor segmentation. However, due to privacy regulations,\naccessing previous data and annotations can be problematic in the medical\ndomain. This poses a significant barrier to preserving the high segmentation\naccuracy of the old classes when learning from new classes because of the\ncatastrophic forgetting problem. In this paper, we first empirically\ndemonstrate that simply using high-quality pseudo labels can fairly mitigate\nthis problem in the setting of organ segmentation. Furthermore, we put forward\nan innovative architecture designed specifically for continuous organ and tumor\nsegmentation, which incurs minimal computational overhead. Our proposed design\ninvolves replacing the conventional output layer with a suite of lightweight,\nclass-specific heads, thereby offering the flexibility to accommodate newly\nemerging classes. These heads enable independent predictions for newly\nintroduced and previously learned classes, effectively minimizing the impact of\nnew classes on old ones during the course of continual learning. We further\npropose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings\ninto the organ-specific heads. These embeddings encapsulate the semantic\ninformation of each class, informed by extensive image-text co-training. The\nproposed method is evaluated on both in-house and public abdominal CT datasets\nunder organ and tumor segmentation tasks. 
Empirical results suggest that the\nproposed design improves the segmentation performance of a baseline neural\nnetwork on newly-introduced and previously-learned classes along the learning\ntrajectory.\n","authors":["Yixiao Zhang","Xinyi Li","Huimiao Chen","Alan Yuille","Yaoyao Liu","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.00988v2.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2307.11503v1","updated":"2023-07-21T11:19:00Z","published":"2023-07-21T11:19:00Z","title":"General regularization in covariate shift adaptation","summary":" Sample reweighting is one of the most widely used methods for correcting the\nerror of least squares learning algorithms in reproducing kernel Hilbert spaces\n(RKHS), that is caused by future data distributions that are different from the\ntraining data distribution. In practical situations, the sample weights are\ndetermined by values of the estimated Radon-Nikod\\'ym derivative, of the future\ndata distribution w.r.t.~the training data distribution. In this work, we\nreview known error bounds for reweighted kernel regression in RKHS and obtain,\nby combination, novel results. We show under weak smoothness conditions, that\nthe amount of samples, needed to achieve the same order of accuracy as in the\nstandard supervised learning without differences in data distributions, is\nsmaller than proven by state-of-the-art analyses.\n","authors":["Duc Hoan Nguyen","Sergei V. Pereverzyev","Werner Zellinger"],"pdf_url":"https://arxiv.org/pdf/2307.11503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11494v1","updated":"2023-07-21T10:56:36Z","published":"2023-07-21T10:56:36Z","title":"Predict, Refine, Synthesize: Self-Guiding Diffusion Models for\n Probabilistic Time Series Forecasting","summary":" Diffusion models have achieved state-of-the-art performance in generative\nmodeling tasks across various domains. Prior works on time series diffusion\nmodels have primarily focused on developing conditional models tailored to\nspecific forecasting or imputation tasks. In this work, we explore the\npotential of task-agnostic, unconditional diffusion models for several time\nseries applications. We propose TSDiff, an unconditionally trained diffusion\nmodel for time series. Our proposed self-guidance mechanism enables\nconditioning TSDiff for downstream tasks during inference, without requiring\nauxiliary networks or altering the training procedure. We demonstrate the\neffectiveness of our method on three different time series tasks: forecasting,\nrefinement, and synthetic data generation. First, we show that TSDiff is\ncompetitive with several task-specific conditional forecasting methods\n(predict). Second, we leverage the learned implicit probability density of\nTSDiff to iteratively refine the predictions of base forecasters with reduced\ncomputational overhead over reverse diffusion (refine). 
Notably, the generative\nperformance of the model remains intact -- downstream forecasters trained on\nsynthetic samples from TSDiff outperform forecasters that are trained on\nsamples from other state-of-the-art generative time series models, occasionally\neven outperforming models trained on real data (synthesize).\n","authors":["Marcel Kollovieh","Abdul Fatir Ansari","Michael Bohlke-Schneider","Jasper Zschiegner","Hao Wang","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11487v1","updated":"2023-07-21T10:45:08Z","published":"2023-07-21T10:45:08Z","title":"A New Deep State-Space Analysis Framework for Patient Latent State\n Estimation and Classification from EHR Time Series Data","summary":" Many diseases, including cancer and chronic conditions, require extended\ntreatment periods and long-term strategies. Machine learning and AI research\nfocusing on electronic health records (EHRs) have emerged to address this need.\nEffective treatment strategies involve more than capturing sequential changes\nin patient test values. It requires an explainable and clinically interpretable\nmodel by capturing the patient's internal state over time.\n In this study, we propose the \"deep state-space analysis framework,\" using\ntime-series unsupervised learning of EHRs with a deep state-space model. This\nframework enables learning, visualizing, and clustering of temporal changes in\npatient latent states related to disease progression.\n We evaluated our framework using time-series laboratory data from 12,695\ncancer patients. By estimating latent states, we successfully discover latent\nstates related to prognosis. By visualization and cluster analysis, the\ntemporal transition of patient status and test items during state transitions\ncharacteristic of each anticancer drug were identified. Our framework surpasses\nexisting methods in capturing interpretable latent space. It can be expected to\nenhance our comprehension of disease progression from EHRs, aiding treatment\nadjustments and prognostic determinations.\n","authors":["Aya Nakamura","Ryosuke Kojima","Yuji Okamoto","Eiichiro Uchino","Yohei Mineharu","Yohei Harada","Mayumi Kamada","Manabu Muto","Motoko Yanagita","Yasushi Okuno"],"pdf_url":"https://arxiv.org/pdf/2307.11487v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.06092v3","updated":"2023-07-21T10:04:23Z","published":"2023-07-12T11:35:37Z","title":"Quantitative CLTs in Deep Neural Networks","summary":" We study the distribution of a fully connected neural network with random\nGaussian weights and biases in which the hidden layer widths are proportional\nto a large constant $n$. Under mild assumptions on the non-linearity, we obtain\nquantitative bounds on normal approximations valid at large but finite $n$ and\nany fixed network depth. Our theorems show both for the finite-dimensional\ndistributions and the entire process, that the distance between a random fully\nconnected network (and its derivatives) to the corresponding infinite width\nGaussian process scales like $n^{-\\gamma}$ for $\\gamma>0$, with the exponent\ndepending on the metric used to measure discrepancy. 
Our bounds are strictly\nstronger in terms of their dependence on network width than any previously\navailable in the literature; in the one-dimensional case, we also prove that\nthey are optimal, i.e., we establish matching lower bounds.\n","authors":["Stefano Favaro","Boris Hanin","Domenico Marinucci","Ivan Nourdin","Giovanni Peccati"],"pdf_url":"https://arxiv.org/pdf/2307.06092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11465v1","updated":"2023-07-21T10:01:55Z","published":"2023-07-21T10:01:55Z","title":"A Deep Learning Approach for Overall Survival Analysis with Missing\n Values","summary":" One of the most challenging fields where Artificial Intelligence (AI) can be\napplied is lung cancer research, specifically non-small cell lung cancer\n(NSCLC). In particular, overall survival (OS) is a vital indicator of patient\nstatus, helping to identify subgroups with diverse survival probabilities,\nenabling tailored treatment and improved OS rates. In this analysis, there are\ntwo challenges to take into account. First, few studies effectively exploit the\ninformation available from each patient, leveraging both uncensored (i.e.,\ndead) and censored (i.e., survivors) patients, considering also the death\ntimes. Second, the handling of incomplete data is a common issue in the medical\nfield. This problem is typically tackled through the use of imputation methods.\nOur objective is to present an AI model able to overcome these limits,\neffectively learning from both censored and uncensored patients and their\navailable features, for the prediction of OS for NSCLC patients. We present a\nnovel approach to survival analysis in the context of NSCLC, which exploits the\nstrengths of the transformer architecture accounting for only available\nfeatures without requiring any imputation strategy. By making use of ad-hoc\nlosses for OS, it accounts for both censored and uncensored patients,\nconsidering risks over time. We evaluated the results over a period of 6 years\nusing different time granularities obtaining a Ct-index, a time-dependent\nvariant of the C-index, of 71.97, 77.58 and 80.72 for time units of 1 month, 1\nyear and 2 years, respectively, outperforming all state-of-the-art methods\nregardless of the imputation method used.\n","authors":["Camillo Maria Caruso","Valerio Guarrasi","Sara Ramella","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2307.11465v1.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.11462v1","updated":"2023-07-21T09:55:44Z","published":"2023-07-21T09:55:44Z","title":"Improve Long-term Memory Learning Through Rescaling the Error Temporally","summary":" This paper studies the error metric selection for long-term memory learning\nin sequence modelling. We examine the bias towards short-term memory in\ncommonly used errors, including mean absolute/squared error. Our findings show\nthat all temporally positive-weighted errors are biased towards short-term\nmemory in learning linear functionals. To reduce this bias and improve\nlong-term memory learning, we propose the use of a temporally rescaled error.\nIn addition to reducing the bias towards short-term memory, this approach can\nalso alleviate the vanishing gradient issue. We conduct numerical experiments\non different long-memory tasks and sequence models to validate our claims.\nNumerical results confirm the importance of appropriate temporally rescaled\nerror for effective long-term memory learning. 
To the best of our knowledge,\nthis is the first work that quantitatively analyzes different errors' memory\nbias towards short-term memory in sequence modelling.\n","authors":["Shida Wang","Zhanglu Yan"],"pdf_url":"https://arxiv.org/pdf/2307.11462v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10617v2","updated":"2023-07-21T09:49:15Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.05825v2","updated":"2023-07-21T09:47:20Z","published":"2023-07-11T22:16:13Z","title":"Bayesian taut splines for estimating the number of modes","summary":" The number of modes in a probability density function is representative of\nthe model's complexity and can also be viewed as the number of existing\nsubpopulations. Despite its relevance, little research has been devoted to its\nestimation. Focusing on the univariate setting, we propose a novel approach\ntargeting prediction accuracy inspired by some overlooked aspects of the\nproblem. We argue for the need for structure in the solutions, the subjective\nand uncertain nature of modes, and the convenience of a holistic view blending\nglobal and local density properties. Our method builds upon a combination of\nflexible kernel estimators and parsimonious compositional splines. Feature\nexploration, model selection and mode testing are implemented in the Bayesian\ninference paradigm, providing soft solutions and allowing to incorporate expert\njudgement in the process. 
The usefulness of our proposal is illustrated through\na case study in sports analytics, showcasing multiple companion visualisation\ntools. A thorough simulation study demonstrates that traditional\nmodality-driven approaches paradoxically struggle to provide accurate results.\nIn this context, our method emerges as a top-tier alternative offering\ninnovative solutions for analysts.\n","authors":["José E. Chacón","Javier Fernández Serrano"],"pdf_url":"https://arxiv.org/pdf/2307.05825v2.pdf","comment":"20 pages, 8 figures (manuscript) + 19 pages, 16 figures\n (supplementary material)"},{"id":"http://arxiv.org/abs/2307.10926v2","updated":"2023-07-21T09:47:01Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquaux","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.02953v2","updated":"2023-07-21T09:26:06Z","published":"2023-07-06T12:39:06Z","title":"SegNetr: Rethinking the local-global interactions and skip connections\n in U-shaped networks","summary":" Recently, U-shaped networks have dominated the field of medical image\nsegmentation due to their simple and easily tuned structure. However, existing\nU-shaped segmentation networks: 1) mostly focus on designing complex\nself-attention modules to compensate for the lack of long-term dependence based\non convolution operation, which increases the overall number of parameters and\ncomputational complexity of the network; 2) simply fuse the features of encoder\nand decoder, ignoring the connection between their spatial locations. In this\npaper, we rethink the above problem and build a lightweight medical image\nsegmentation network, called SegNetr. Specifically, we introduce a novel\nSegNetr block that can perform local-global interactions dynamically at any\nstage and with only linear complexity. 
At the same time, we design a general\ninformation retention skip connection (IRSC) to preserve the spatial location\ninformation of encoder features and achieve accurate fusion with the decoder\nfeatures. We validate the effectiveness of SegNetr on four mainstream medical\nimage segmentation datasets, with 59\\% and 76\\% fewer parameters and GFLOPs\nthan vanilla U-Net, while achieving segmentation performance comparable to\nstate-of-the-art methods. Notably, the components proposed in this paper can be\napplied to other U-shaped networks to improve their segmentation performance.\n","authors":["Junlong Cheng","Chengrui Gao","Fengjie Wang","Min Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.02953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04246v2","updated":"2023-07-21T09:15:42Z","published":"2023-02-08T18:26:10Z","title":"Shortcut Detection with Variational Autoencoders","summary":" For real-world applications of machine learning (ML), it is essential that\nmodels make predictions based on well-generalizing features rather than\nspurious correlations in the data. The identification of such spurious\ncorrelations, also known as shortcuts, is a challenging problem and has so far\nbeen scarcely addressed. In this work, we present a novel approach to detect\nshortcuts in image and audio datasets by leveraging variational autoencoders\n(VAEs). The disentanglement of features in the latent space of VAEs allows us\nto discover feature-target correlations in datasets and semi-automatically\nevaluate them for ML shortcuts. We demonstrate the applicability of our method\non several real-world datasets and identify shortcuts that have not been\ndiscovered before.\n","authors":["Nicolas M. Müller","Simon Roschmann","Shahbaz Khan","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2302.04246v2.pdf","comment":"Accepted at the ICML 2023 Workshop on Spurious Correlations,\n Invariance and Stability"},{"id":"http://arxiv.org/abs/2303.09975v4","updated":"2023-07-21T09:05:53Z","published":"2023-03-17T13:48:17Z","title":"MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image\n Segmentation","summary":" There has been exploding interest in embracing Transformer-based\narchitectures for medical image segmentation. However, the lack of large-scale\nannotated medical datasets make achieving performances equivalent to those in\nnatural images challenging. Convolutional networks, in contrast, have higher\ninductive biases and consequently, are easily trainable to high performance.\nRecently, the ConvNeXt architecture attempted to modernize the standard ConvNet\nby mirroring Transformer blocks. In this work, we improve upon this to design a\nmodernized and scalable convolutional architecture customized to challenges of\ndata-scarce medical settings. We introduce MedNeXt, a Transformer-inspired\nlarge kernel segmentation network which introduces - 1) A fully ConvNeXt 3D\nEncoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up\nand downsampling blocks to preserve semantic richness across scales, 3) A novel\ntechnique to iteratively increase kernel sizes by upsampling small kernel\nnetworks, to prevent performance saturation on limited medical data, 4)\nCompound scaling at multiple levels (depth, width, kernel size) of MedNeXt.\nThis leads to state-of-the-art performance on 4 tasks on CT and MRI modalities\nand varying dataset sizes, representing a modernized deep architecture for\nmedical image segmentation. 
Our code is made publicly available at:\nhttps://github.com/MIC-DKFZ/MedNeXt.\n","authors":["Saikat Roy","Gregor Koehler","Constantin Ulrich","Michael Baumgartner","Jens Petersen","Fabian Isensee","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.09975v4.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11436v1","updated":"2023-07-21T08:57:16Z","published":"2023-07-21T08:57:16Z","title":"Neural Operators for Delay-Compensating Control of Hyperbolic PIDEs","summary":" The recently introduced DeepONet operator-learning framework for PDE control\nis extended from the results for basic hyperbolic and parabolic PDEs to an\nadvanced hyperbolic class that involves delays on both the state and the system\noutput or input. The PDE backstepping design produces gain functions that are\noutputs of a nonlinear operator, mapping functions on a spatial domain into\nfunctions on a spatial domain, and where this gain-generating operator's inputs\nare the PDE's coefficients. The operator is approximated with a DeepONet neural\nnetwork to a degree of accuracy that is provably arbitrarily tight. Once we\nproduce this approximation-theoretic result in infinite dimension, with it we\nestablish stability in closed loop under feedback that employs approximate\ngains. In addition to supplying such results under full-state feedback, we also\ndevelop DeepONet-approximated observers and output-feedback laws and prove\ntheir own stabilizing properties under neural operator approximations. With\nnumerical simulations we illustrate the theoretical results and quantify the\nnumerical effort savings, which are of two orders of magnitude, thanks to\nreplacing the numerical PDE solving with the DeepONet.\n","authors":["Jie Qi","Jing Zhang","Miroslav Krstic"],"pdf_url":"https://arxiv.org/pdf/2307.11436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11434v1","updated":"2023-07-21T08:55:23Z","published":"2023-07-21T08:55:23Z","title":"Batching for Green AI -- An Exploratory Study on Inference","summary":" The batch size is an essential parameter to tune during the development of\nnew neural networks. Amongst other quality indicators, it has a large degree of\ninfluence on the model's accuracy, generalisability, training times and\nparallelisability. This fact is generally known and commonly studied. However,\nduring the application phase of a deep learning model, when the model is\nutilised by an end-user for inference, we find that there is a disregard for\nthe potential benefits of introducing a batch size. In this study, we examine\nthe effect of input batching on the energy consumption and response times of\nfive fully-trained neural networks for computer vision that were considered\nstate-of-the-art at the time of their publication. The results suggest that\nbatching has a significant effect on both of these metrics. Furthermore, we\npresent a timeline of the energy efficiency and accuracy of neural networks\nover the past decade. We find that in general, energy consumption rises at a\nmuch steeper pace than accuracy and question the necessity of this evolution.\nAdditionally, we highlight one particular network, ShuffleNetV2(2018), that\nachieved a competitive performance for its time while maintaining a much lower\nenergy consumption. 
Nevertheless, we highlight that the results are model\ndependent.\n","authors":["Tim Yarally","Luís Cruz","Daniel Feitosa","June Sallou","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2307.11434v1.pdf","comment":"8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series\n on Software Engineering and Advanced Applications (SEAA) 2023"},{"id":"http://arxiv.org/abs/2307.11432v1","updated":"2023-07-21T08:52:08Z","published":"2023-07-21T08:52:08Z","title":"An Analysis of Multi-Agent Reinforcement Learning for Decentralized\n Inventory Control Systems","summary":" Most solutions to the inventory management problem assume a centralization of\ninformation that is incompatible with organisational constraints in real supply\nchain networks. The inventory management problem is a well-known planning\nproblem in operations research, concerned with finding the optimal re-order\npolicy for nodes in a supply chain. While many centralized solutions to the\nproblem exist, they are not applicable to real-world supply chains made up of\nindependent entities. The problem can however be naturally decomposed into\nsub-problems, each associated with an independent entity, turning it into a\nmulti-agent system. Therefore, a decentralized data-driven solution to\ninventory management problems using multi-agent reinforcement learning is\nproposed where each entity is controlled by an agent. Three multi-agent\nvariations of the proximal policy optimization algorithm are investigated\nthrough simulations of different supply chain networks and levels of\nuncertainty. The centralized training decentralized execution framework is\ndeployed, which relies on offline centralization during simulation-based policy\nidentification, but enables decentralization when the policies are deployed\nonline to the real system. Results show that using multi-agent proximal policy\noptimization with a centralized critic leads to performance very close to that\nof a centralized data-driven solution and outperforms a distributed model-based\nsolution in most cases while respecting the information constraints of the\nsystem.\n","authors":["Marwan Mousa","Damien van de Berg","Niki Kotecha","Ehecatl Antonio del Rio-Chanona","Max Mowbray"],"pdf_url":"https://arxiv.org/pdf/2307.11432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15342v2","updated":"2023-07-21T08:51:09Z","published":"2023-05-24T16:55:49Z","title":"Is Your Model \"MADD\"? A Novel Metric to Evaluate Algorithmic Fairness\n for Predictive Student Models","summary":" Predictive student models are increasingly used in learning environments due\nto their ability to enhance educational outcomes and support stakeholders in\nmaking informed decisions. However, predictive models can be biased and produce\nunfair outcomes, leading to potential discrimination against some students and\npossible harmful long-term implications. This has prompted research on fairness\nmetrics meant to capture and quantify such biases. Nonetheless, so far,\nexisting fairness metrics used in education are predictive\nperformance-oriented, focusing on assessing biased outcomes across groups of\nstudents, without considering the behaviors of the models nor the severity of\nthe biases in the outcomes. Therefore, we propose a novel metric, the Model\nAbsolute Density Distance (MADD), to analyze models' discriminatory behaviors\nindependently from their predictive performance. 
We also provide a\ncomplementary visualization-based analysis to enable fine-grained human\nassessment of how the models discriminate between groups of students. We\nevaluate our approach on the common task of predicting student success in\nonline courses, using several common predictive classification models on an\nopen educational dataset. We also compare our metric to the only predictive\nperformance-oriented fairness metric developed in education, ABROCA. Results on\nthis dataset show that: (1) fair predictive performance does not guarantee fair\nmodels' behaviors and thus fair outcomes, (2) there is no direct relationship\nbetween data bias and predictive performance bias nor discriminatory behaviors\nbias, and (3) trained on the same data, models exhibit different discriminatory\nbehaviors, according to different sensitive features too. We thus recommend\nusing the MADD on models that show satisfying predictive performance, to gain a\nfiner-grained understanding on how they behave and to refine models selection\nand their usage.\n","authors":["Mélina Verger","Sébastien Lallé","François Bouchet","Vanda Luengo"],"pdf_url":"https://arxiv.org/pdf/2305.15342v2.pdf","comment":"12 pages, conference"},{"id":"http://arxiv.org/abs/2307.11423v1","updated":"2023-07-21T08:33:55Z","published":"2023-07-21T08:33:55Z","title":"Attention to Entropic Communication","summary":" The concept of attention, numerical weights that emphasize the importance of\nparticular data, has proven to be very relevant in artificial intelligence.\nRelative entropy (RE, aka Kullback-Leibler divergence) plays a central role in\ncommunication theory. Here we combine these concepts, attention and RE. RE\nguides optimal encoding of messages in bandwidth-limited communication as well\nas optimal message decoding via the maximum entropy principle (MEP). In the\ncoding scenario, RE can be derived from four requirements, namely being\nanalytical, local, proper, and calibrated. Weighted RE, used for attention\nsteering in communications, turns out to be improper. To see how proper\nattention communication can emerge, we analyze a scenario of a message sender\nwho wants to ensure that the receiver of the message can perform well-informed\nactions. If the receiver decodes the message using the MEP, the sender only\nneeds to know the receiver's utility function to inform optimally, but not the\nreceiver's initial knowledge state. In case only the curvature of the utility\nfunction maxima are known, it becomes desirable to accurately communicate an\nattention function, in this case a by this curvature weighted and re-normalized\nprobability function. Entropic attention communication is here proposed as the\ndesired generalization of entropic communication that permits weighting while\nbeing proper, thereby aiding the design of optimal communication protocols in\ntechnical applications and helping to understand human communication. 
For\nexample, our analysis shows how to derive the level of cooperation expected\nunder misaligned interests of otherwise honest communication partners.\n","authors":["Torsten Enßlin","Carolin Weidinger","Philipp Frank"],"pdf_url":"https://arxiv.org/pdf/2307.11423v1.pdf","comment":"23 pages, 4 figures, submitted"},{"id":"http://arxiv.org/abs/2306.09087v2","updated":"2023-07-21T08:32:35Z","published":"2023-06-15T12:33:39Z","title":"Deep learning based Meta-modeling for Multi-objective Technology\n Optimization of Electrical Machines","summary":" Optimization of rotating electrical machines is both time- and\ncomputationally expensive. Because of the different parametrization, design\noptimization is commonly executed separately for each machine technology. In\nthis paper, we present the application of a variational auto-encoder (VAE) to\noptimize two different machine technologies simultaneously, namely an\nasynchronous machine and a permanent magnet synchronous machine. After\ntraining, we employ a deep neural network and a decoder as meta-models to\npredict global key performance indicators (KPIs) and generate associated new\ndesigns, respectively, through unified latent space in the optimization loop.\nNumerical results demonstrate concurrent parametric multi-objective technology\noptimization in the high-dimensional design space. The VAE-based approach is\nquantitatively compared to a classical deep learning-based direct approach for\nKPIs prediction.\n","authors":["Vivek Parekh","Dominik Flore","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2306.09087v2.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2306.09260v2","updated":"2023-07-21T08:18:51Z","published":"2023-06-07T14:22:41Z","title":"IsoEx: an explainable unsupervised approach to process event logs cyber\n investigation","summary":" 39 seconds. That is the timelapse between two consecutive cyber attacks as of\n2023. Meaning that by the time you are done reading this abstract, about 1 or 2\nadditional cyber attacks would have occurred somewhere in the world. In this\ncontext of highly increased frequency of cyber threats, Security Operation\nCenters (SOC) and Computer Emergency Response Teams (CERT) can be overwhelmed.\nIn order to relieve the cybersecurity teams in their investigative effort and\nhelp them focus on more added-value tasks, machine learning approaches and\nmethods started to emerge. This paper introduces a novel method, IsoEx, for\ndetecting anomalous and potentially problematic command lines during the\ninvestigation of contaminated devices. IsoEx is built around a set of features\nthat leverages the log structure of the command line, as well as its\nparent/child relationship, to achieve a greater accuracy than traditional\nmethods. To detect anomalies, IsoEx resorts to an unsupervised anomaly\ndetection technique that is both highly sensitive and lightweight. A key\ncontribution of the paper is its emphasis on interpretability, achieved through\nthe features themselves and the application of eXplainable Artificial\nIntelligence (XAI) techniques and visualizations. This is critical to ensure\nthe adoption of the method by SOC and CERT teams, as the paper argues that the\ncurrent literature on machine learning for log investigation has not adequately\naddressed the issue of explainability. 
This method was proven efficient in a\nreal-life environment as it was built to support a company\\'s SOC and CERT\n","authors":["Pierre Lavieille","Ismail Alaoui Hassani Atlas"],"pdf_url":"https://arxiv.org/pdf/2306.09260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11408v1","updated":"2023-07-21T08:07:16Z","published":"2023-07-21T08:07:16Z","title":"Direct and inverse modeling of soft robots by learning a condensed FEM\n model","summary":" The Finite Element Method (FEM) is a powerful modeling tool for predicting\nthe behavior of soft robots. However, its use for control can be difficult for\nnon-specialists of numerical computation: it requires an optimization of the\ncomputation to make it real-time. In this paper, we propose a learning-based\napproach to obtain a compact but sufficiently rich mechanical representation.\nOur choice is based on nonlinear compliance data in the actuator/effector space\nprovided by a condensation of the FEM model. We demonstrate that this compact\nmodel can be learned with a reasonable amount of data and, at the same time, be\nvery efficient in terms of modeling, since we can deduce the direct and inverse\nkinematics of the robot. We also show how to couple some models learned\nindividually in particular on an example of a gripper composed of two soft\nfingers. Other results are shown by comparing the inverse model derived from\nthe full FEM model and the one from the compact learned version. This work\nopens new perspectives, namely for the embedded control of soft robots, but\nalso for their design. These perspectives are also discussed in the paper.\n","authors":["Etienne Ménager","Tanguy Navez","Olivier Goury","Christian Duriez"],"pdf_url":"https://arxiv.org/pdf/2307.11408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09382v3","updated":"2023-07-21T07:59:06Z","published":"2023-06-15T12:59:04Z","title":"Sound Demixing Challenge 2023 Music Demixing Track Technical Report:\n TFC-TDF-UNet v3","summary":" In this report, we present our award-winning solutions for the Music Demixing\nTrack of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a\ntime-efficient music source separation model that achieves state-of-the-art\nresults on the MUSDB benchmark. We then give full details regarding our\nsolutions for each Leaderboard, including a loss masking approach for\nnoise-robust training. Code for reproducing model training and final\nsubmissions is available at github.com/kuielab/sdx23.\n","authors":["Minseok Kim","Jun Hyung Lee","Soonyoung Jung"],"pdf_url":"https://arxiv.org/pdf/2306.09382v3.pdf","comment":"5 pages, 4 tables"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. 
The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11397v1","updated":"2023-07-21T07:29:38Z","published":"2023-07-21T07:29:38Z","title":"Probabilistic Modeling of Inter- and Intra-observer Variability in\n Medical Image Segmentation","summary":" Medical image segmentation is a challenging task, particularly due to inter-\nand intra-observer variability, even between medical experts. In this paper, we\npropose a novel model, called Probabilistic Inter-Observer and iNtra-Observer\nvariation NetwOrk (Pionono). It captures the labeling behavior of each rater\nwith a multidimensional probability distribution and integrates this\ninformation with the feature maps of the image to produce probabilistic\nsegmentation predictions. The model is optimized by variational inference and\ncan be trained end-to-end. It outperforms state-of-the-art models such as\nSTAPLE, Probabilistic U-Net, and models based on confusion matrices.\nAdditionally, Pionono predicts multiple coherent segmentation maps that mimic\nthe rater's expert opinion, which provides additional valuable information for\nthe diagnostic process. Experiments on real-world cancer segmentation datasets\ndemonstrate the high accuracy and efficiency of Pionono, making it a powerful\ntool for medical image analysis.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2307.11397v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.06146v2","updated":"2023-07-21T06:34:54Z","published":"2023-03-10T18:59:33Z","title":"StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces","summary":" Recent advances in face manipulation using StyleGAN have produced impressive\nresults. However, StyleGAN is inherently limited to cropped aligned faces at a\nfixed image resolution it is pre-trained on. In this paper, we propose a simple\nand effective solution to this limitation by using dilated convolutions to\nrescale the receptive fields of shallow layers in StyleGAN, without altering\nany model parameters. This allows fixed-size small features at shallow layers\nto be extended into larger ones that can accommodate variable resolutions,\nmaking them more robust in characterizing unaligned faces. To enable real face\ninversion and manipulation, we introduce a corresponding encoder that provides\nthe first-layer feature of the extended StyleGAN in addition to the latent\nstyle code. 
We validate the effectiveness of our method using unaligned face\ninputs of various resolutions in a diverse set of face manipulation tasks,\nincluding facial attribute editing, super-resolution, sketch/mask-to-face\ntranslation, and face toonification.\n","authors":["Shuai Yang","Liming Jiang","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2303.06146v2.pdf","comment":"ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX\n Project page: https://www.mmlab-ntu.com/project/styleganex/"},{"id":"http://arxiv.org/abs/2307.11379v1","updated":"2023-07-21T06:34:41Z","published":"2023-07-21T06:34:41Z","title":"Towards Better Fairness-Utility Trade-off: A Comprehensive\n Measurement-Based Reinforcement Learning Framework","summary":" Machine learning is widely used to make decisions with societal impact such\nas bank loan approving, criminal sentencing, and resume filtering. How to\nensure its fairness while maintaining utility is a challenging but crucial\nissue. Fairness is a complex and context-dependent concept with over 70\ndifferent measurement metrics. Since existing regulations are often vague in\nterms of which metric to use and different organizations may prefer different\nfairness metrics, it is important to have means of improving fairness\ncomprehensively. Existing mitigation techniques often target at one specific\nfairness metric and have limitations in improving multiple notions of fairness\nsimultaneously. In this work, we propose CFU (Comprehensive Fairness-Utility),\na reinforcement learning-based framework, to efficiently improve the\nfairness-utility trade-off in machine learning classifiers. A comprehensive\nmeasurement that can simultaneously consider multiple fairness notions as well\nas utility is established, and new metrics are proposed based on an in-depth\nanalysis of the relationship between different fairness metrics. The reward\nfunction of CFU is constructed with comprehensive measurement and new metrics.\nWe conduct extensive experiments to evaluate CFU on 6 tasks, 3 machine learning\nmodels, and 15 fairness-utility measurements. The results demonstrate that CFU\ncan improve the classifier on multiple fairness metrics without sacrificing its\nutility. It outperforms all state-of-the-art techniques and has witnessed a\n37.5% improvement on average.\n","authors":["Simiao Zhang","Jitao Bai","Menghong Guan","Yihao Huang","Yueling Zhang","Jun Sun","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2307.11379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.07493v3","updated":"2023-07-21T06:28:40Z","published":"2022-05-16T07:53:42Z","title":"Multi-scale Attention Flow for Probabilistic Time Series Forecasting","summary":" The probability prediction of multivariate time series is a notoriously\nchallenging but practical task. On the one hand, the challenge is how to\neffectively capture the cross-series correlations between interacting time\nseries, to achieve accurate distribution modeling. On the other hand, we should\nconsider how to capture the contextual information within time series more\naccurately to model multivariate temporal dynamics of time series. In this\nwork, we proposed a novel non-autoregressive deep learning model, called\nMulti-scale Attention Normalizing Flow(MANF), where we integrate multi-scale\nattention and relative position information and the multivariate data\ndistribution is represented by the conditioned normalizing flow. 
Additionally,\ncompared with autoregressive modeling methods, our model avoids the influence\nof cumulative error and does not increase the time complexity. Extensive\nexperiments demonstrate that our model achieves state-of-the-art performance on\nmany popular multivariate datasets.\n","authors":["Shibo Feng","Chunyan Miao","Ke Xu","Jiaxiang Wu","Pengcheng Wu","Yang Zhang","Peilin Zhao"],"pdf_url":"https://arxiv.org/pdf/2205.07493v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11375v1","updated":"2023-07-21T06:17:09Z","published":"2023-07-21T06:17:09Z","title":"LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent\n Space","summary":" Data Augmentation (DA) is a technique to increase the quantity and diversity\nof the training data, and by that alleviate overfitting and improve\ngeneralisation. However, standard DA produces synthetic data for augmentation\nwith limited diversity. Generative Adversarial Networks (GANs) may unlock\nadditional information in a dataset by generating synthetic samples having the\nappearance of real images. However, these models struggle to simultaneously\naddress three key requirements: fidelity and high-quality samples; diversity\nand mode coverage; and fast sampling. Indeed, GANs generate high-quality\nsamples rapidly, but have poor mode coverage, limiting their adoption in DA\napplications. We propose LatentAugment, a DA strategy that overcomes the low\ndiversity of GANs, opening up for use in DA applications. Without external\nsupervision, LatentAugment modifies latent vectors and moves them into latent\nspace regions to maximise the synthetic images' diversity and fidelity. It is\nalso agnostic to the dataset and the downstream task. A wide set of experiments\nshows that LatentAugment improves the generalisation of a deep model\ntranslating from MRI-to-CT beating both standard DA as well GAN-based sampling.\nMoreover, still in comparison with GAN-based sampling, LatentAugment synthetic\nsamples show superior mode coverage and diversity. Code is available at:\nhttps://github.com/ltronchin/LatentAugment.\n","authors":["Lorenzo Tronchin","Minh H. Vu","Paolo Soda","Tommy Löfstedt"],"pdf_url":"https://arxiv.org/pdf/2307.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11373v1","updated":"2023-07-21T06:12:39Z","published":"2023-07-21T06:12:39Z","title":"Diverse Offline Imitation via Fenchel Duality","summary":" There has been significant recent progress in the area of unsupervised skill\ndiscovery, with various works proposing mutual information based objectives, as\na source of intrinsic motivation. Prior works predominantly focused on\ndesigning algorithms that require online access to the environment. In\ncontrast, we develop an \\textit{offline} skill discovery algorithm. Our problem\nformulation considers the maximization of a mutual information objective\nconstrained by a KL-divergence. More precisely, the constraints ensure that the\nstate occupancy of each skill remains close to the state occupancy of an\nexpert, within the support of an offline dataset with good state-action\ncoverage. 
Our main contribution is to connect Fenchel duality, reinforcement\nlearning and unsupervised skill discovery, and to give a simple offline\nalgorithm for learning diverse skills that are aligned with an expert.\n","authors":["Marin Vlastelica","Pavel Kolev","Jin Cheng","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2307.11373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11371v1","updated":"2023-07-21T06:03:43Z","published":"2023-07-21T06:03:43Z","title":"Random Separating Hyperplane Theorem and Learning Polytopes","summary":" The Separating Hyperplane theorem is a fundamental result in Convex Geometry\nwith myriad applications. Our first result, Random Separating Hyperplane\nTheorem (RSH), is a strengthening of this for polytopes. $\\rsh$ asserts that if\nthe distance between $a$ and a polytope $K$ with $k$ vertices and unit diameter\nin $\\Re^d$ is at least $\\delta$, where $\\delta$ is a fixed constant in $(0,1)$,\nthen a randomly chosen hyperplane separates $a$ and $K$ with probability at\nleast $1/poly(k)$ and margin at least $\\Omega \\left(\\delta/\\sqrt{d} \\right)$.\nAn immediate consequence of our result is the first near optimal bound on the\nerror increase in the reduction from a Separation oracle to an Optimization\noracle over a polytope.\n RSH has algorithmic applications in learning polytopes. We consider a\nfundamental problem, denoted the ``Hausdorff problem'', of learning a unit\ndiameter polytope $K$ within Hausdorff distance $\\delta$, given an optimization\noracle for $K$. Using RSH, we show that with polynomially many random queries\nto the optimization oracle, $K$ can be approximated within error $O(\\delta)$.\nTo our knowledge this is the first provable algorithm for the Hausdorff\nProblem. Building on this result, we show that if the vertices of $K$ are\nwell-separated, then an optimization oracle can be used to generate a list of\npoints, each within Hausdorff distance $O(\\delta)$ of $K$, with the property\nthat the list contains a point close to each vertex of $K$. Further, we show\nhow to prune this list to generate a (unique) approximation to each vertex of\nthe polytope. We prove that in many latent variable settings, e.g., topic\nmodeling, LDA, optimization oracles do exist provided we project to a suitable\nSVD subspace. Thus, our work yields the first efficient algorithm for finding\napproximations to the vertices of the latent polytope under the\nwell-separatedness assumption.\n","authors":["Chiranjib Bhattacharyya","Ravindran Kannan","Amit Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.11371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11357v1","updated":"2023-07-21T05:17:21Z","published":"2023-07-21T05:17:21Z","title":"Bridging the Reality Gap of Reinforcement Learning based Traffic Signal\n Control using Domain Randomization and Meta Learning","summary":" Reinforcement Learning (RL) has been widely explored in Traffic Signal\nControl (TSC) applications, however, still no such system has been deployed in\npractice. A key barrier to progress in this area is the reality gap, the\ndiscrepancy that results from differences between simulation models and their\nreal-world equivalents. In this paper, we address this challenge by first\npresenting a comprehensive analysis of potential simulation parameters that\ncontribute to this reality gap. We then also examine two promising strategies\nthat can bridge this gap: Domain Randomization (DR) and Model-Agnostic\nMeta-Learning (MAML). 
Both strategies were trained with a traffic simulation\nmodel of an intersection. In addition, the model was embedded in LemgoRL, a\nframework that integrates realistic, safety-critical requirements into the\ncontrol system. Subsequently, we evaluated the performance of the two methods\non a separate model of the same intersection that was developed with a\ndifferent traffic simulator. In this way, we mimic the reality gap. Our\nexperimental results show that both DR and MAML outperform a state-of-the-art\nRL algorithm, therefore highlighting their potential to mitigate the reality\ngap in RLbased TSC systems.\n","authors":["Arthur Müller","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2307.11357v1.pdf","comment":"Paper was accepted by the ITSC 2023 (26th IEEE International\n Conference on Intelligent Transportation Systems)"},{"id":"http://arxiv.org/abs/2307.09484v2","updated":"2023-07-21T05:13:55Z","published":"2023-06-06T12:45:15Z","title":"MolFM: A Multimodal Molecular Foundation Model","summary":" Molecular knowledge resides within three different modalities of information\nsources: molecular structures, biomedical documents, and knowledge bases.\nEffective incorporation of molecular knowledge from these modalities holds\nparamount significance in facilitating biomedical research. However, existing\nmultimodal molecular foundation models exhibit limitations in capturing\nintricate connections between molecular structures and texts, and more\nimportantly, none of them attempt to leverage a wealth of molecular expertise\nderived from knowledge graphs. In this study, we introduce MolFM, a multimodal\nmolecular foundation model designed to facilitate joint representation learning\nfrom molecular structures, biomedical texts, and knowledge graphs. We propose\ncross-modal attention between atoms of molecular structures, neighbors of\nmolecule entities and semantically related texts to facilitate cross-modal\ncomprehension. We provide theoretical analysis that our cross-modal\npre-training captures local and global molecular knowledge by minimizing the\ndistance in the feature space between different modalities of the same\nmolecule, as well as molecules sharing similar structures or functions. MolFM\nachieves state-of-the-art performance on various downstream tasks. On\ncross-modal retrieval, MolFM outperforms existing models with 12.13% and 5.04%\nabsolute gains under the zero-shot and fine-tuning settings, respectively.\nFurthermore, qualitative analysis showcases MolFM's implicit ability to provide\ngrounding from molecular substructures and knowledge graphs. Code and models\nare available on https://github.com/BioFM/OpenBioMed.\n","authors":["Yizhen Luo","Kai Yang","Massimo Hong","Xing Yi Liu","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2307.09484v2.pdf","comment":"31 pages, 15 figures, and 15 tables"},{"id":"http://arxiv.org/abs/2307.11353v1","updated":"2023-07-21T05:05:55Z","published":"2023-07-21T05:05:55Z","title":"What can a Single Attention Layer Learn? A Study Through the Random\n Features Lens","summary":" Attention layers -- which map a sequence of inputs to a sequence of outputs\n-- are core building blocks of the Transformer architecture which has achieved\nsignificant breakthroughs in modern artificial intelligence. This paper\npresents a rigorous theoretical study on the learning and generalization of a\nsingle multi-head attention layer, with a sequence of key vectors and a\nseparate query vector as input. 
We consider the random feature setting where\nthe attention layer has a large number of heads, with randomly sampled frozen\nquery and key matrices, and trainable value matrices. We show that such a\nrandom-feature attention layer can express a broad class of target functions\nthat are permutation invariant to the key vectors. We further provide\nquantitative excess risk bounds for learning these target functions from finite\nsamples, using random feature attention with finitely many heads.\n Our results feature several implications unique to the attention structure\ncompared with existing random features theory for neural networks, such as (1)\nAdvantages in the sample complexity over standard two-layer random-feature\nnetworks; (2) Concrete and natural classes of functions that can be learned\nefficiently by a random-feature attention layer; and (3) The effect of the\nsampling distribution of the query-key weight matrix (the product of the query\nand key matrix), where Gaussian random weights with a non-zero mean result in\nbetter sample complexities over the zero-mean counterpart for learning certain\nnatural target functions. Experiments on simulated data corroborate our\ntheoretical findings and further illustrate the interplay between the sample\nsize and the complexity of the target function.\n","authors":["Hengyu Fu","Tianyu Guo","Yu Bai","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2307.11353v1.pdf","comment":"41pages, 5 figures"},{"id":"http://arxiv.org/abs/2106.06134v4","updated":"2023-07-21T05:02:21Z","published":"2021-06-11T02:44:00Z","title":"Is Homophily a Necessity for Graph Neural Networks?","summary":" Graph neural networks (GNNs) have shown great prowess in learning\nrepresentations suitable for numerous graph-based machine learning tasks. When\napplied to semi-supervised node classification, GNNs are widely believed to\nwork well due to the homophily assumption (\"like attracts like\"), and fail to\ngeneralize to heterophilous graphs where dissimilar nodes connect. Recent works\ndesign new architectures to overcome such heterophily-related limitations,\nciting poor baseline performance and new architecture improvements on a few\nheterophilous graph benchmark datasets as evidence for this notion. In our\nexperiments, we empirically find that standard graph convolutional networks\n(GCNs) can actually achieve better performance than such carefully designed\nmethods on some commonly used heterophilous graphs. This motivates us to\nreconsider whether homophily is truly necessary for good GNN performance. We\nfind that this claim is not quite true, and in fact, GCNs can achieve strong\nperformance on heterophilous graphs under certain conditions. Our work\ncarefully characterizes these conditions, and provides supporting theoretical\nunderstanding and empirical observations. Finally, we examine existing\nheterophilous graphs benchmarks and reconcile how the GCN (under)performs on\nthem based on this understanding.\n","authors":["Yao Ma","Xiaorui Liu","Neil Shah","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2106.06134v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11352v1","updated":"2023-07-21T04:59:23Z","published":"2023-07-21T04:59:23Z","title":"Model-based Offline Reinforcement Learning with Count-based Conservatism","summary":" In this paper, we propose a model-based offline reinforcement learning method\nthat integrates count-based conservatism, named $\\texttt{Count-MORL}$. 
Our\nmethod utilizes the count estimates of state-action pairs to quantify model\nestimation error, marking, to the best of our knowledge, the first algorithm to\ndemonstrate the efficacy of count-based conservatism in model-based offline\ndeep RL. For our proposed method, we first show that the estimation error is\ninversely proportional to the frequency of state-action pairs. Secondly, we\ndemonstrate that the learned policy under the count-based conservative model\noffers near-optimal performance guarantees. Through extensive numerical\nexperiments, we validate that $\texttt{Count-MORL}$ with a hash code\nimplementation significantly outperforms existing offline RL algorithms on the\nD4RL benchmark datasets. The code is accessible at\n$\href{https://github.com/oh-lab/Count-MORL}{https://github.com/oh-lab/Count-MORL}$.\n","authors":["Byeongchan Kim","Min-hwan Oh"],"pdf_url":"https://arxiv.org/pdf/2307.11352v1.pdf","comment":"Accepted in ICML 2023"},{"id":"http://arxiv.org/abs/2307.11351v1","updated":"2023-07-21T04:55:03Z","published":"2023-07-21T04:55:03Z","title":"Bounded P-values in Parametric Programming-based Selective Inference","summary":" Selective inference (SI) has been actively studied as a promising framework\nfor statistical hypothesis testing for data-driven hypotheses. The basic idea\nof SI is to make inferences conditional on the event that a hypothesis is\nselected. In order to perform SI, this event must be characterized in a\ntractable form. When the selection event is too difficult to characterize,\nadditional conditions are introduced for tractability. These additional\nconditions often cause a loss of power, and this issue is referred to as\nover-conditioning. Parametric programming-based SI (PP-based SI) has been\nproposed as one way to address the over-conditioning issue. The main problem of\nPP-based SI is its high computational cost due to the need to exhaustively\nexplore the data space. In this study, we introduce a procedure to reduce the\ncomputational cost while guaranteeing the desired precision, by proposing a\nmethod to compute the upper and lower bounds of p-values. We also propose\nthree types of search strategies that efficiently improve these bounds. We\ndemonstrate the effectiveness of the proposed method in hypothesis testing\nproblems for feature selection in linear models and attention region\nidentification in deep neural networks.\n","authors":["Tomohiro Shiraishi","Daiki Miwa","Vo Nguyen Le Duy","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2307.11351v1.pdf","comment":"47 pages, 14 figures"},{"id":"http://arxiv.org/abs/2302.09738v5","updated":"2023-07-21T04:19:43Z","published":"2023-02-20T03:31:11Z","title":"Simplifying Momentum-based Positive-definite Submanifold Optimization\n with Applications to Deep Learning","summary":" Riemannian submanifold optimization with momentum is computationally\nchallenging because, to ensure that the iterates remain on the submanifold, we\noften need to solve difficult differential equations. Here, we simplify such\ndifficulties for a class of structured symmetric positive-definite matrices\nwith the affine-invariant metric. We do so by proposing a generalized version\nof the Riemannian normal coordinates that dynamically orthonormalizes the\nmetric and locally converts the problem into an unconstrained problem in the\nEuclidean space. 
We use our approach to simplify existing approaches for\nstructured covariances and develop matrix-inverse-free $2^\\text{nd}$-order\noptimizers for deep learning with low precision by using only matrix\nmultiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL\n","authors":["Wu Lin","Valentin Duruisseaux","Melvin Leok","Frank Nielsen","Mohammad Emtiyaz Khan","Mark Schmidt"],"pdf_url":"https://arxiv.org/pdf/2302.09738v5.pdf","comment":"An updated version of the ICML 2023 paper. Updated the main text and\n added more numerical results for DNNs including a new baseline method and\n improving existing baseline methods"},{"id":"http://arxiv.org/abs/2307.11334v1","updated":"2023-07-21T03:43:07Z","published":"2023-07-21T03:43:07Z","title":"Improving Transferability of Adversarial Examples via Bayesian Attacks","summary":" This paper presents a substantial extension of our work published at ICLR.\nOur ICLR work advocated for enhancing transferability in adversarial examples\nby incorporating a Bayesian formulation into model parameters, which\neffectively emulates the ensemble of infinitely many deep neural networks,\nwhile, in this paper, we introduce a novel extension by incorporating the\nBayesian formulation into the model input as well, enabling the joint\ndiversification of both the model input and model parameters. Our empirical\nfindings demonstrate that: 1) the combination of Bayesian formulations for both\nthe model input and model parameters yields significant improvements in\ntransferability; 2) by introducing advanced approximations of the posterior\ndistribution over the model input, adversarial transferability achieves further\nenhancement, surpassing all state-of-the-arts when attacking without model\nfine-tuning. Moreover, we propose a principled approach to fine-tune model\nparameters in such an extended Bayesian formulation. The derived optimization\nobjective inherently encourages flat minima in the parameter space and input\nspace. Extensive experiments demonstrate that our method achieves a new\nstate-of-the-art on transfer-based attacks, improving the average success rate\non ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with\nour ICLR basic Bayesian method. We will make our code publicly available.\n","authors":["Qizhang Li","Yiwen Guo","Xiaochen Yang","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11333v1","updated":"2023-07-21T03:41:55Z","published":"2023-07-21T03:41:55Z","title":"Demystifying Local and Global Fairness Trade-offs in Federated Learning\n Using Partial Information Decomposition","summary":" In this paper, we present an information-theoretic perspective to group\nfairness trade-offs in federated learning (FL) with respect to sensitive\nattributes, such as gender, race, etc. Existing works mostly focus on either\n\\emph{global fairness} (overall disparity of the model across all clients) or\n\\emph{local fairness} (disparity of the model at each individual client),\nwithout always considering their trade-offs. There is a lack of understanding\nof the interplay between global and local fairness in FL, and if and when one\nimplies the other. To address this gap, we leverage a body of work in\ninformation theory called partial information decomposition (PID) which first\nidentifies three sources of unfairness in FL, namely, \\emph{Unique Disparity},\n\\emph{Redundant Disparity}, and \\emph{Masked Disparity}. 
Using canonical\nexamples, we demonstrate how these three disparities contribute to global and\nlocal fairness. This decomposition helps us derive fundamental limits and\ntrade-offs between global or local fairness, particularly under data\nheterogeneity, as well as, derive conditions under which one implies the other.\nWe also present experimental results on benchmark datasets to support our\ntheoretical findings. This work offers a more nuanced understanding of the\nsources of disparity in FL that can inform the use of local disparity\nmitigation techniques, and their convergence and effectiveness when deployed in\npractice.\n","authors":["Faisal Hamman","Sanghamitra Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.11333v1.pdf","comment":"Accepted at ICML Workshop on Federated Learning and Analytics in\n Practice"},{"id":"http://arxiv.org/abs/2307.11332v1","updated":"2023-07-21T03:40:53Z","published":"2023-07-21T03:40:53Z","title":"Beyond Convergence: Identifiability of Machine Learning and Deep\n Learning Models","summary":" Machine learning (ML) and deep learning models are extensively used for\nparameter optimization and regression problems. However, not all inverse\nproblems in ML are ``identifiable,'' indicating that model parameters may not\nbe uniquely determined from the available data and the data model's\ninput-output relationship. In this study, we investigate the notion of model\nparameter identifiability through a case study focused on parameter estimation\nfrom motion sensor data. Utilizing a bipedal-spring mass human walk dynamics\nmodel, we generate synthetic data representing diverse gait patterns and\nconditions. Employing a deep neural network, we attempt to estimate\nsubject-wise parameters, including mass, stiffness, and equilibrium leg length.\nThe results show that while certain parameters can be identified from the\nobservation data, others remain unidentifiable, highlighting that\nunidentifiability is an intrinsic limitation of the experimental setup,\nnecessitating a change in data collection and experimental scenarios. Beyond\nthis specific case study, the concept of identifiability has broader\nimplications in ML and deep learning. Addressing unidentifiability requires\nproven identifiable models (with theoretical support), multimodal data fusion\ntechniques, and advancements in model-based machine learning. Understanding and\nresolving unidentifiability challenges will lead to more reliable and accurate\napplications across diverse domains, transcending mere model convergence and\nenhancing the reliability of machine learning models.\n","authors":["Reza Sameni"],"pdf_url":"https://arxiv.org/pdf/2307.11332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.10736v3","updated":"2023-07-21T03:39:05Z","published":"2022-03-21T05:00:54Z","title":"The activity-weight duality in feed forward neural networks: The\n geometric determinants of generalization","summary":" One of the fundamental problems in machine learning is generalization. In\nneural network models with a large number of weights (parameters), many\nsolutions can be found to fit the training data equally well. The key question\nis which solution can describe testing data not in the training set. Here, we\nreport the discovery of an exact duality (equivalence) between changes in\nactivities in a given layer of neurons and changes in weights that connect to\nthe next layer of neurons in a densely connected layer in any feed forward\nneural network. 
The activity-weight (A-W) duality allows us to map variations\nin inputs (data) to variations of the corresponding dual weights. By using this\nmapping, we show that the generalization loss can be decomposed into a sum of\ncontributions from different eigen-directions of the Hessian matrix of the loss\nfunction at the solution in weight space. The contribution from a given\neigen-direction is the product of two geometric factors (determinants): the\nsharpness of the loss landscape and the standard deviation of the dual weights,\nwhich is found to scale with the weight norm of the solution. Our results\nprovide an unified framework, which we used to reveal how different\nregularization schemes (weight decay, stochastic gradient descent with\ndifferent batch sizes and learning rates, dropout), training data size, and\nlabeling noise affect generalization performance by controlling either one or\nboth of these two geometric determinants for generalization. These insights can\nbe used to guide development of algorithms for finding more generalizable\nsolutions in overparametrized neural networks.\n","authors":["Yu Feng","Yuhai Tu"],"pdf_url":"https://arxiv.org/pdf/2203.10736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11327v1","updated":"2023-07-21T03:24:55Z","published":"2023-07-21T03:24:55Z","title":"Systematic Adaptation of Communication-focused Machine Learning Models\n from Real to Virtual Environments for Human-Robot Collaboration","summary":" Virtual reality has proved to be useful in applications in several fields\nranging from gaming, medicine, and training to development of interfaces that\nenable human-robot collaboration. It empowers designers to explore applications\noutside of the constraints posed by the real world environment and develop\ninnovative solutions and experiences. Hand gestures recognition which has been\na topic of much research and subsequent commercialization in the real world has\nbeen possible because of the creation of large, labelled datasets. In order to\nutilize the power of natural and intuitive hand gestures in the virtual domain\nfor enabling embodied teleoperation of collaborative robots, similarly large\ndatasets must be created so as to keep the working interface easy to learn and\nflexible enough to add more gestures. Depending on the application, this may be\ncomputationally or economically prohibitive. Thus, the adaptation of trained\ndeep learning models that perform well in the real environment to the virtual\nmay be a solution to this challenge. This paper presents a systematic framework\nfor the real to virtual adaptation using limited size of virtual dataset along\nwith guidelines for creating a curated dataset. Finally, while hand gestures\nhave been considered as the communication mode, the guidelines and\nrecommendations presented are generic. These are applicable to other modes such\nas body poses and facial expressions which have large datasets available in the\nreal domain which must be adapted to the virtual one.\n","authors":["Debasmita Mukherjee","Ritwik Singhai","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2307.11327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11325v1","updated":"2023-07-21T03:23:17Z","published":"2023-07-21T03:23:17Z","title":"Analysis of Elephant Movement in Sub-Saharan Africa: Ecological,\n Climatic, and Conservation Perspectives","summary":" The interaction between elephants and their environment has profound\nimplications for both ecology and conservation strategies. 
This study presents\nan analytical approach to decipher the intricate patterns of elephant movement\nin Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal\nvariations and rainfall patterns. Despite the complexities surrounding these\ninfluential factors, our analysis provides a holistic view of elephant\nmigratory behavior in the context of the dynamic African landscape. Our\ncomprehensive approach enables us to predict the potential impact of these\necological determinants on elephant migration, a critical step in establishing\ninformed conservation strategies. This projection is particularly crucial given\nthe impacts of global climate change on seasonal and rainfall patterns, which\ncould substantially influence elephant movements in the future. The findings of\nour work aim to not only advance the understanding of movement ecology but also\nfoster a sustainable coexistence of humans and elephants in Sub-Saharan Africa.\nBy predicting potential elephant routes, our work can inform strategies to\nminimize human-elephant conflict, effectively manage land use, and enhance\nanti-poaching efforts. This research underscores the importance of integrating\nmovement ecology and climatic variables for effective wildlife management and\nconservation planning.\n","authors":["Matthew Hines","Gregory Glatzer","Shreya Ghosh","Prasenjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2307.11325v1.pdf","comment":"11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on\n Computing and Sustainable Societies (COMPASS 2023)"},{"id":"http://arxiv.org/abs/2307.11317v1","updated":"2023-07-21T02:57:40Z","published":"2023-07-21T02:57:40Z","title":"XLDA: Linear Discriminant Analysis for Scaling Continual Learning to\n Extreme Classification at the Edge","summary":" Streaming Linear Discriminant Analysis (LDA), while proven in\nClass-incremental Learning deployments at the edge with limited classes (up to\n1000), has not been proven for deployment in extreme classification scenarios.\nIn this paper, we present: (a) XLDA, a framework for Class-IL in edge\ndeployment where the LDA classifier is proven to be equivalent to an FC layer,\nincluding in extreme classification scenarios, and (b) optimizations to enable\nXLDA-based training and inference for edge deployment where there is a\nconstraint on available compute resources. We show up to a 42x speedup using a\nbatched training approach and up to a 5x inference speedup with nearest neighbor\nsearch on extreme datasets like AliProducts (50k classes) and Google Landmarks\nV2 (81k classes).\n","authors":["Karan Shah","Vishruth Veerendranath","Anushka Hebbar","Raghavendra Bhat"],"pdf_url":"https://arxiv.org/pdf/2307.11317v1.pdf","comment":"Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop"},{"id":"http://arxiv.org/abs/2307.10579v2","updated":"2023-07-21T02:54:25Z","published":"2023-07-20T04:45:59Z","title":"SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning","summary":" SecureBoost is a tree-boosting algorithm leveraging homomorphic encryption to\nprotect data privacy in the vertical federated learning setting. It is widely used\nin fields such as finance and healthcare due to its interpretability,\neffectiveness, and privacy-preserving capability. However, SecureBoost suffers\nfrom high computational complexity and risk of label leakage. To harness the\nfull potential of SecureBoost, hyperparameters of SecureBoost should be\ncarefully chosen to strike an optimal balance between utility, efficiency, and\nprivacy. 
Existing methods either set hyperparameters empirically or\nheuristically, which are far from optimal. To fill this gap, we propose a\nConstrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto\noptimal solutions, where each solution is a set of hyperparameters achieving an\noptimal tradeoff between utility loss, training cost, and privacy leakage. We\ndesign measurements of the three objectives. In particular, the privacy leakage\nis measured using our proposed instance clustering attack. Experimental results\ndemonstrate that CMOSB yields not only hyperparameters superior to the\nbaseline but also optimal sets of hyperparameters that can support the flexible\nrequirements of FL participants.\n","authors":["Ziyao Ren","Yan Kang","Lixin Fan","Linghua Yang","Tao Fan","Yongxin Tong","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10579v2.pdf","comment":"FL-ICAI'23"},{"id":"http://arxiv.org/abs/2307.11316v1","updated":"2023-07-21T02:51:41Z","published":"2023-07-21T02:51:41Z","title":"Making Pre-trained Language Models both Task-solvers and\n Self-calibrators","summary":" Pre-trained language models (PLMs) serve as backbones for various real-world\nsystems. For high-stakes applications, it is equally essential to have reasonable\nconfidence estimations in predictions. While the vanilla confidence scores of\nPLMs can already be effectively utilized, PLMs consistently become\noverconfident in their wrong predictions, which is not desirable in practice.\nPrevious work shows that introducing an extra calibration task can mitigate\nthis issue. The basic idea involves acquiring additional data to train models\nin predicting the confidence of their initial predictions. However, that work only\ndemonstrates the feasibility of this kind of method, assuming that there are\nabundant extra samples available for the introduced calibration task. In this\nwork, we consider the practical scenario in which we need to effectively utilize\ntraining samples to make PLMs both task-solvers and self-calibrators. Three\nchallenges are presented, including limited training samples, data imbalance,\nand distribution shifts. We first conduct pilot experiments to quantify various\ndecisive factors in the calibration task. Based on the empirical analysis\nresults, we propose a training algorithm, LM-TOAST, to tackle these challenges.\nExperimental results show that LM-TOAST can effectively utilize the training\ndata to make PLMs have reasonable confidence estimations while maintaining the\noriginal task performance. Further, we consider three downstream applications,\nnamely selective classification, adversarial defense, and model cascading, to\nshow the practical usefulness of LM-TOAST. The code will be made public at\n\url{https://github.com/Yangyi-Chen/LM-TOAST}.\n","authors":["Yangyi Chen","Xingyao Wang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2307.11316v1.pdf","comment":"Accepted to Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2307.11314v1","updated":"2023-07-21T02:47:03Z","published":"2023-07-21T02:47:03Z","title":"Neuromorphic Online Learning for Spatiotemporal Patterns with a\n Forward-only Timeline","summary":" Spiking neural networks (SNNs) are bio-plausible computing models with high\nenergy efficiency. The temporal dynamics of neurons and synapses enable them to\ndetect temporal patterns and generate sequences. 
While Backpropagation Through\nTime (BPTT) is traditionally used to train SNNs, it is not suitable for online\nlearning of embedded applications due to its high computation and memory cost\nas well as extended latency. Previous works have proposed online learning\nalgorithms, but they often utilize highly simplified spiking neuron models\nwithout synaptic dynamics and reset feedback, resulting in subpar performance.\nIn this work, we present Spatiotemporal Online Learning for Synaptic Adaptation\n(SOLSA), specifically designed for online learning of SNNs composed of Leaky\nIntegrate and Fire (LIF) neurons with exponentially decayed synapses and soft\nreset. The algorithm not only learns the synaptic weight but also adapts the\ntemporal filters associated to the synapses. Compared to the BPTT algorithm,\nSOLSA has much lower memory requirement and achieves a more balanced temporal\nworkload distribution. Moreover, SOLSA incorporates enhancement techniques such\nas scheduled weight update, early stop training and adaptive synapse filter,\nwhich speed up the convergence and enhance the learning performance. When\ncompared to other non-BPTT based SNN learning, SOLSA demonstrates an average\nlearning accuracy improvement of 14.2%. Furthermore, compared to BPTT, SOLSA\nachieves a 5% higher average learning accuracy with a 72% reduction in memory\ncost.\n","authors":["Zhenhang Zhang","Jingang Jin","Haowen Fang","Qinru Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.11314v1.pdf","comment":"9 pages,8 figures"},{"id":"http://arxiv.org/abs/2303.17555v2","updated":"2023-07-21T02:20:39Z","published":"2023-03-16T21:02:09Z","title":"Factoring the Matrix of Domination: A Critical Review and Reimagination\n of Intersectionality in AI Fairness","summary":" Intersectionality is a critical framework that, through inquiry and praxis,\nallows us to examine how social inequalities persist through domains of\nstructure and discipline. Given AI fairness' raison d'etre of \"fairness\", we\nargue that adopting intersectionality as an analytical framework is pivotal to\neffectively operationalizing fairness. Through a critical review of how\nintersectionality is discussed in 30 papers from the AI fairness literature, we\ndeductively and inductively: 1) map how intersectionality tenets operate within\nthe AI fairness paradigm and 2) uncover gaps between the conceptualization and\noperationalization of intersectionality. We find that researchers\noverwhelmingly reduce intersectionality to optimizing for fairness metrics over\ndemographic subgroups. They also fail to discuss their social context and when\nmentioning power, they mostly situate it only within the AI pipeline. We: 3)\noutline and assess the implications of these gaps for critical inquiry and\npraxis, and 4) provide actionable recommendations for AI fairness researchers\nto engage with intersectionality in their work by grounding it in AI\nepistemology.\n","authors":["Anaelia Ovalle","Arjun Subramonian","Vagrant Gautam","Gilbert Gee","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2303.17555v2.pdf","comment":"To appear at AIES 2023"},{"id":"http://arxiv.org/abs/2302.04973v2","updated":"2023-07-21T01:40:31Z","published":"2023-02-09T23:25:28Z","title":"Invariant Slot Attention: Object Discovery with Slot-Centric Reference\n Frames","summary":" Automatically discovering composable abstractions from raw perceptual data is\na long-standing challenge in machine learning. 
Recent slot-based neural\nnetworks that learn about objects in a self-supervised manner have made\nexciting progress in this direction. However, they typically fall short at\nadequately capturing spatial symmetries present in the visual world, which\nleads to sample inefficiency, such as when entangling object appearance and\npose. In this paper, we present a simple yet highly effective method for\nincorporating spatial symmetries via slot-centric reference frames. We\nincorporate equivariance to per-object pose transformations into the attention\nand generation mechanism of Slot Attention by translating, scaling, and\nrotating position encodings. These changes result in little computational\noverhead, are easy to implement, and can result in large gains in terms of data\nefficiency and overall improvements to object discovery. We evaluate our method\non a wide range of synthetic object discovery benchmarks namely CLEVR,\nTetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising\nimprovements on the challenging real-world Waymo Open dataset.\n","authors":["Ondrej Biza","Sjoerd van Steenkiste","Mehdi S. M. Sajjadi","Gamaleldin F. Elsayed","Aravindh Mahendran","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2302.04973v2.pdf","comment":"Accepted at ICML 2023. Project page: https://invariantsa.github.io/"},{"id":"http://arxiv.org/abs/2307.11289v1","updated":"2023-07-21T01:18:02Z","published":"2023-07-21T01:18:02Z","title":"PI-VEGAN: Physics Informed Variational Embedding Generative Adversarial\n Networks for Stochastic Differential Equations","summary":" We present a new category of physics-informed neural networks called physics\ninformed variational embedding generative adversarial network (PI-VEGAN), that\neffectively tackles the forward, inverse, and mixed problems of stochastic\ndifferential equations. In these scenarios, the governing equations are known,\nbut only a limited number of sensor measurements of the system parameters are\navailable. We integrate the governing physical laws into PI-VEGAN with\nautomatic differentiation, while introducing a variational encoder for\napproximating the latent variables of the actual distribution of the\nmeasurements. These latent variables are integrated into the generator to\nfacilitate accurate learning of the characteristics of the stochastic partial\nequations. Our model consists of three components, namely the encoder,\ngenerator, and discriminator, each of which is updated alternatively employing\nthe stochastic gradient descent algorithm. We evaluate the effectiveness of\nPI-VEGAN in addressing forward, inverse, and mixed problems that require the\nconcurrent calculation of system parameters and solutions. Numerical results\ndemonstrate that the proposed method achieves satisfactory stability and\naccuracy in comparison with the previous physics-informed generative\nadversarial network (PI-WGAN).\n","authors":["Ruisong Gao","Yufeng Wang","Min Yang","Chuanjun Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11289v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2307.11288v1","updated":"2023-07-21T01:17:31Z","published":"2023-07-21T01:17:31Z","title":"Kernelized Offline Contextual Dueling Bandits","summary":" Preference-based feedback is important for many applications where direct\nevaluation of a reward function is not feasible. 
A notable recent example\narises in reinforcement learning from human feedback on large language models.\nFor many of these applications, the cost of acquiring the human feedback can be\nsubstantial or even prohibitive. In this work, we take advantage of the fact\nthat often the agent can choose contexts at which to obtain human feedback in\norder to most efficiently identify a good policy, and introduce the offline\ncontextual dueling bandit setting. We give an upper-confidence-bound style\nalgorithm for this setting and prove a regret bound. We also give empirical\nconfirmation that this method outperforms a similar strategy that uses\nuniformly sampled contexts.\n","authors":["Viraj Mehta","Ojash Neopane","Vikramjeet Das","Sen Lin","Jeff Schneider","Willie Neiswanger"],"pdf_url":"https://arxiv.org/pdf/2307.11288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09702v5","updated":"2023-07-21T01:07:19Z","published":"2022-05-19T17:11:45Z","title":"Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency\n Analysis","summary":" Graph neural networks (GNNs) are among the most powerful tools in deep\nlearning. They routinely solve complex problems on unstructured networks, such\nas node classification, graph classification, or link prediction, with high\naccuracy. However, both inference and training of GNNs are complex, and they\nuniquely combine the features of irregular graph processing with dense and\nregular computations. This complexity makes it very challenging to execute GNNs\nefficiently on modern massively parallel architectures. To alleviate this, we\nfirst design a taxonomy of parallelism in GNNs, considering data and model\nparallelism, and different forms of pipelining. Then, we use this taxonomy to\ninvestigate the amount of parallelism in numerous GNN models, GNN-driven\nmachine learning tasks, software frameworks, or hardware accelerators. We use\nthe work-depth model, and we also assess communication volume and\nsynchronization. We specifically focus on the sparsity/density of the\nassociated tensors, in order to understand how to effectively apply techniques\nsuch as vectorization. We also formally analyze GNN pipelining, and we\ngeneralize the established Message-Passing class of GNN models to cover\narbitrary pipeline depths, facilitating future optimizations. Finally, we\ninvestigate different forms of asynchronicity, navigating the path for future\nasynchronous parallel GNN pipelines. The outcomes of our analysis are\nsynthesized in a set of insights that help to maximize GNN performance, and a\ncomprehensive list of challenges and opportunities for further research into\nefficient GNN computations. Our work will help to advance the design of future\nGNNs.\n","authors":["Maciej Besta","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2205.09702v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11285v1","updated":"2023-07-21T01:04:52Z","published":"2023-07-21T01:04:52Z","title":"MAS: Towards Resource-Efficient Federated Multiple-Task Learning","summary":" Federated learning (FL) is an emerging distributed machine learning method\nthat empowers in-situ model training on decentralized edge devices. However,\nmultiple simultaneous FL tasks could overload resource-constrained devices. In\nthis work, we propose the first FL system to effectively coordinate and train\nmultiple simultaneous FL tasks. We first formalize the problem of training\nsimultaneous FL tasks. 
Then, we present our new approach, MAS (Merge and\nSplit), to optimize the performance of training multiple simultaneous FL tasks.\nMAS starts by merging FL tasks into an all-in-one FL task with a multi-task\narchitecture. After training for a few rounds, MAS splits the all-in-one FL\ntask into two or more FL tasks by using the affinities among tasks measured\nduring the all-in-one training. It then continues training each split of FL\ntasks based on model parameters from the all-in-one training. Extensive\nexperiments demonstrate that MAS outperforms other methods while reducing\ntraining time by 2x and reducing energy consumption by 40%. We hope this work\nwill inspire the community to further study and optimize training simultaneous\nFL tasks.\n","authors":["Weiming Zhuang","Yonggang Wen","Lingjuan Lyu","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11285v1.pdf","comment":"ICCV'23. arXiv admin note: substantial text overlap with\n arXiv:2207.04202"},{"id":"http://arxiv.org/abs/2307.11280v1","updated":"2023-07-21T00:49:07Z","published":"2023-07-21T00:49:07Z","title":"Epsilon*: Privacy Metric for Machine Learning Models","summary":" We introduce Epsilon*, a new privacy metric for measuring the privacy risk of\na single model instance prior to, during, or after deployment of privacy\nmitigation strategies. The metric does not require access to the training data\nsampling or model training algorithm. Epsilon* is a function of true positive\nand false positive rates in a hypothesis test used by an adversary in a\nmembership inference attack. We distinguish between quantifying the privacy\nloss of a trained model instance and quantifying the privacy loss of the\ntraining mechanism which produces this model instance. Existing approaches in\nthe privacy auditing literature provide lower bounds for the latter, while our\nmetric provides a lower bound for the former by relying on an\n(${\\epsilon}$,${\\delta}$)-type of quantification of the privacy of the trained\nmodel instance. We establish a relationship between these lower bounds and show\nhow to implement Epsilon* to avoid numerical and noise amplification\ninstability. We further show in experiments on benchmark public data sets that\nEpsilon* is sensitive to privacy risk mitigation by training with differential\nprivacy (DP), where the value of Epsilon* is reduced by up to 800% compared to\nthe Epsilon* values of non-DP trained baseline models. This metric allows\nprivacy auditors to be independent of model owners, and enables all\ndecision-makers to visualize the privacy-utility landscape to make informed\ndecisions regarding the trade-offs between model privacy and utility.\n","authors":["Diana M. Negoescu","Humberto Gonzalez","Saad Eddin Al Orjany","Jilei Yang","Yuliia Lut","Rahul Tandra","Xiaowen Zhang","Xinyi Zheng","Zach Douglas","Vidita Nolkha","Parvez Ahammad","Gennady Samorodnitsky"],"pdf_url":"https://arxiv.org/pdf/2307.11280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11274v1","updated":"2023-07-21T00:15:56Z","published":"2023-07-21T00:15:56Z","title":"Screening Mammography Breast Cancer Detection","summary":" Breast cancer is a leading cause of cancer-related deaths, but current\nprograms are expensive and prone to false positives, leading to unnecessary\nfollow-up and patient anxiety. This paper proposes a solution to automated\nbreast cancer detection, to improve the efficiency and accuracy of screening\nprograms. 
Different methodologies were tested against the RSNA dataset of\nradiographic breast images of roughly 20,000 female patients and yielded an\naverage validation case pF1 score of 0.56 across methods.\n","authors":["Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11274v1.pdf","comment":"Released @ Apr 2023. For associated project files, see\n https://github.com/chakrabortyde/rsna-breast-cancer"},{"id":"http://arxiv.org/abs/2305.13503v2","updated":"2023-07-21T00:15:28Z","published":"2023-05-22T21:39:38Z","title":"Asynchronous Multi-Model Dynamic Federated Learning over Wireless\n Networks: Theory, Modeling, and Optimization","summary":" Federated learning (FL) has emerged as a key technique for distributed\nmachine learning (ML). Most literature on FL has focused on ML model training\nfor (i) a single task/model, with (ii) a synchronous scheme for uplink/downlink\ntransfer of model parameters, and (iii) a static data distribution setting\nacross devices. These assumptions are often not well representative of\nconditions encountered in practical FL environments. To address this, we\ndevelop DMA-FL, which considers dynamic FL with multiple downstream tasks to be\ntrained over an asynchronous model transmission architecture. We first\ncharacterize the convergence of ML model training under DMA-FL via introducing\na family of scheduling tensors and rectangular functions to capture the\nscheduling of devices. Our convergence analysis sheds light on the impact of\nresource allocation, device scheduling, and individual model states on the\nperformance of ML models. We then formulate a non-convex mixed integer\noptimization problem for jointly configuring the resource allocation and device\nscheduling to strike an efficient trade-off between energy consumption and ML\nperformance. We develop a solution methodology employing successive convex\napproximations with convergence guarantee to a stationary point. Through\nnumerical simulations, we reveal the advantages of DMA-FL in terms of model\nperformance and network resource savings.\n","authors":["Zhan-Lun Chang","Seyyedali Hosseinalipour","Mung Chiang","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2305.13503v2.pdf","comment":"Submission to IEEE Transactions on Cognitive Communications and\n Networking"}],"Multimedia":[{"id":"http://arxiv.org/abs/2304.14133v2","updated":"2023-07-21T12:06:17Z","published":"2023-04-27T12:28:29Z","title":"VERITE: A Robust Benchmark for Multimodal Misinformation Detection\n Accounting for Unimodal Bias","summary":" Multimedia content has become ubiquitous on social media platforms, leading\nto the rise of multimodal misinformation (MM) and the urgent need for effective\nstrategies to detect and prevent its spread. In recent years, the challenge of\nmultimodal misinformation detection (MMD) has garnered significant attention by\nresearchers and has mainly involved the creation of annotated, weakly\nannotated, or synthetically generated training datasets, along with the\ndevelopment of various deep learning MMD models. However, the problem of\nunimodal bias in MMD benchmarks -- where biased or unimodal methods outperform\ntheir multimodal counterparts on an inherently multimodal task -- has been\noverlooked. In this study, we systematically investigate and identify the\npresence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS),\nraising concerns about their suitability for reliable evaluation. 
To address\nthis issue, we introduce the \"VERification of Image-TExtpairs\" (VERITE)\nbenchmark for MMD which incorporates real-world data, excludes \"asymmetric\nmultimodal misinformation\" and utilizes \"modality balancing\". We conduct an\nextensive comparative study with a Transformer-based architecture that shows\nthe ability of VERITE to effectively address unimodal bias, rendering it a\nrobust evaluation framework for MMD. Furthermore, we introduce a new method --\ntermed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating\nrealistic synthetic training data that preserve crossmodal relations between\nlegitimate images and false human-written captions. By leveraging CHASMA in the\ntraining process, we observe consistent and notable improvements in predictive\nperformance on VERITE; with a 9.2% increase in accuracy. We release our code\nat: https://github.com/stevejpapad/image-text-verification\n","authors":["Stefanos-Iordanis Papadopoulos","Christos Koutlis","Symeon Papadopoulos","Panagiotis C. Petrantonakis"],"pdf_url":"https://arxiv.org/pdf/2304.14133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09382v3","updated":"2023-07-21T07:59:06Z","published":"2023-06-15T12:59:04Z","title":"Sound Demixing Challenge 2023 Music Demixing Track Technical Report:\n TFC-TDF-UNet v3","summary":" In this report, we present our award-winning solutions for the Music Demixing\nTrack of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a\ntime-efficient music source separation model that achieves state-of-the-art\nresults on the MUSDB benchmark. We then give full details regarding our\nsolutions for each Leaderboard, including a loss masking approach for\nnoise-robust training. Code for reproducing model training and final\nsubmissions is available at github.com/kuielab/sdx23.\n","authors":["Minseok Kim","Jun Hyung Lee","Soonyoung Jung"],"pdf_url":"https://arxiv.org/pdf/2306.09382v3.pdf","comment":"5 pages, 4 tables"},{"id":"http://arxiv.org/abs/2301.12688v3","updated":"2023-07-21T18:13:10Z","published":"2023-01-30T06:37:35Z","title":"Dynamic Storyboard Generation in an Engine-based Virtual Environment for\n Video Production","summary":" Amateurs working on mini-films and short-form videos usually spend lots of\ntime and effort on the multi-round complicated process of setting and adjusting\nscenes, plots, and cameras to deliver satisfying video shots. We present\nVirtual Dynamic Storyboard (VDS) to allow users storyboarding shots in virtual\nenvironments, where the filming staff can easily test the settings of shots\nbefore the actual filming. VDS runs on a \"propose-simulate-discriminate\" mode:\nGiven a formatted story script and a camera script as input, it generates\nseveral character animation and camera movement proposals following predefined\nstory and cinematic rules to allow an off-the-shelf simulation engine to render\nvideos. To pick up the top-quality dynamic storyboard from the candidates, we\nequip it with a shot ranking discriminator based on shot quality criteria\nlearned from professional manual-created data. 
VDS is comprehensively validated\nvia extensive experiments and user studies, demonstrating its efficiency,\neffectiveness, and great potential in assisting amateur video production.\n","authors":["Anyi Rao","Xuekun Jiang","Yuwei Guo","Linning Xu","Lei Yang","Libiao Jin","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2301.12688v3.pdf","comment":"Project page: https://virtualfilmstudio.github.io/"}]},"2023-07-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2307.12976v1","updated":"2023-07-24T17:52:46Z","published":"2023-07-24T17:52:46Z","title":"Evaluating the Ripple Effects of Knowledge Editing in Language Models","summary":" Modern language models capture a large body of factual knowledge. However,\nsome facts can be incorrectly induced or become obsolete over time, resulting\nin factually incorrect generations. This has led to the development of various\nediting methods that allow updating facts encoded by the model. Evaluation of\nthese methods has primarily focused on testing whether an individual fact has\nbeen successfully injected, and if similar predictions for other subjects have\nnot changed. Here we argue that such evaluation is limited, since injecting one\nfact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple\neffect'' in the form of additional facts that the model needs to update\n(e.g.``Jack Depp is the sibling of Lily-Rose Depp''). 
To address this issue, we\npropose a novel set of evaluation criteria that consider the implications of an\nedit on related facts. Using these criteria, we then construct \\ripple{}, a\ndiagnostic benchmark of 5K factual edits, capturing a variety of types of\nripple effects. We evaluate prominent editing methods on \\ripple{}, showing\nthat current methods fail to introduce consistent changes in the model's\nknowledge. In addition, we find that a simple in-context editing baseline\nobtains the best scores on our benchmark, suggesting a promising research\ndirection for model editing.\n","authors":["Roi Cohen","Eden Biran","Ori Yoran","Amir Globerson","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2307.12976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12973v1","updated":"2023-07-24T17:49:31Z","published":"2023-07-24T17:49:31Z","title":"Leveraging Label Variation in Large Language Models for Zero-Shot Text\n Classification","summary":" The zero-shot learning capabilities of large language models (LLMs) make them\nideal for text classification without annotation or supervised training. Many\nstudies have shown impressive results across multiple tasks. While tasks, data,\nand results differ widely, their similarities to human annotation can aid us in\ntackling new tasks with minimal expenses. We evaluate using 5 state-of-the-art\nLLMs as \"annotators\" on 5 different tasks (age, gender, topic, sentiment\nprediction, and hate speech detection), across 4 languages: English, French,\nGerman, and Spanish. No single model excels at all tasks, across languages, or\nacross all labels within a task. However, aggregation techniques designed for\nhuman annotators perform substantially better than any one individual model.\nOverall, though, LLMs do not rival even simple supervised models, so they do\nnot (yet) replace the need for human annotation. We also discuss the tradeoffs\nbetween speed, accuracy, cost, and bias when it comes to aggregated model\nlabeling versus human annotation.\n","authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2307.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12966v1","updated":"2023-07-24T17:44:58Z","published":"2023-07-24T17:44:58Z","title":"Aligning Large Language Models with Human: A Survey","summary":" Large Language Models (LLMs) trained on extensive textual corpora have\nemerged as leading solutions for a broad array of Natural Language Processing\n(NLP) tasks. Despite their notable performance, these models are prone to\ncertain limitations such as misunderstanding human instructions, generating\npotentially biased content, or factually incorrect (hallucinated) information.\nHence, aligning LLMs with human expectations has become an active area of\ninterest within the research community. This survey presents a comprehensive\noverview of these alignment technologies, including the following aspects. (1)\nData collection: the methods for effectively collecting high-quality\ninstructions for LLM alignment, including the use of NLP benchmarks, human\nannotations, and leveraging strong LLMs. (2) Training methodologies: a detailed\nreview of the prevailing training methods employed for LLM alignment. Our\nexploration encompasses Supervised Fine-tuning, both Online and Offline human\npreference training, along with parameter-efficient training mechanisms. 
(3)\nModel Evaluation: the methods for evaluating the effectiveness of these\nhuman-aligned LLMs, presenting a multifaceted approach towards their\nassessment. In conclusion, we collate and distill our findings, shedding light\non several promising future research avenues in the field. This survey,\ntherefore, serves as a valuable resource for anyone invested in understanding\nand advancing the alignment of LLMs to better suit human-oriented tasks and\nexpectations. An associated GitHub link collecting the latest papers is\navailable at https://github.com/GaryYufei/AlignLLMHumanSurvey.\n","authors":["Yufei Wang","Wanjun Zhong","Liangyou Li","Fei Mi","Xingshan Zeng","Wenyong Huang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12966v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12950v1","updated":"2023-07-24T17:23:22Z","published":"2023-07-24T17:23:22Z","title":"RLCD: Reinforcement Learning from Contrast Distillation for Language\n Model Alignment","summary":" We propose Reinforcement Learning from Contrast Distillation (RLCD), a method\nfor aligning language models to follow natural language principles without\nusing human feedback. RLCD trains a preference model using simulated preference\npairs that contain both a high-quality and low-quality example, generated using\ncontrasting positive and negative prompts. 
The preference model is then used to\nimprove a base unaligned language model via reinforcement learning.\nEmpirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context\ndistillation (Huang et al., 2022) baselines across three diverse alignment\ntasks--harmlessness, helpfulness, and story outline generation--and on both 7B\nand 30B model scales for preference data simulation.\n","authors":["Kevin Yang","Dan Klein","Asli Celikyilmaz","Nanyun Peng","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2307.12950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12949v1","updated":"2023-07-24T17:22:04Z","published":"2023-07-24T17:22:04Z","title":"Boosting Punctuation Restoration with Data Generation and Reinforcement\n Learning","summary":" Punctuation restoration is an important task in automatic speech recognition\n(ASR) which aim to restore the syntactic structure of generated ASR texts to\nimprove readability. While punctuated texts are abundant from written\ndocuments, the discrepancy between written punctuated texts and ASR texts\nlimits the usability of written texts in training punctuation restoration\nsystems for ASR texts. This paper proposes a reinforcement learning method to\nexploit in-topic written texts and recent advances in large pre-trained\ngenerative language models to bridge this gap. The experiments show that our\nmethod achieves state-of-the-art performance on the ASR test set on two\nbenchmark datasets for punctuation restoration.\n","authors":["Viet Dac Lai","Abel Salinas","Hao Tan","Trung Bui","Quan Tran","Seunghyun Yoon","Hanieh Deilamsalehy","Franck Dernoncourt","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.12949v1.pdf","comment":"Accepted at INTERSPEECH 2023, 6 pages"},{"id":"http://arxiv.org/abs/2307.12935v1","updated":"2023-07-24T16:55:37Z","published":"2023-07-24T16:55:37Z","title":"Rule By Example: Harnessing Logical Rules for Explainable Hate Speech\n Detection","summary":" Classic approaches to content moderation typically apply a rule-based\nheuristic approach to flag content. While rules are easily customizable and\nintuitive for humans to interpret, they are inherently fragile and lack the\nflexibility or robustness needed to moderate the vast amount of undesirable\ncontent found online today. Recent advances in deep learning have demonstrated\nthe promise of using highly effective deep neural models to overcome these\nchallenges. However, despite the improved performance, these data-driven models\nlack transparency and explainability, often leading to mistrust from everyday\nusers and a lack of adoption by many platforms. In this paper, we present Rule\nBy Example (RBE): a novel exemplar-based contrastive learning approach for\nlearning from logical rules for the task of textual content moderation. RBE is\ncapable of providing rule-grounded predictions, allowing for more explainable\nand customizable predictions compared to typical deep learning-based\napproaches. We demonstrate that our approach is capable of learning rich rule\nembedding representations using only a few data examples. 
Experimental results\non 3 popular hate speech classification datasets show that RBE is able to\noutperform state-of-the-art deep learning classifiers as well as the use of\nrules in both supervised and unsupervised settings while providing explainable\nmodel predictions via rule-grounding.\n","authors":["Christopher Clarke","Matthew Hall","Gaurav Mittal","Ye Yu","Sandra Sajeev","Jason Mars","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12935v1.pdf","comment":"ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2307.12896v1","updated":"2023-07-24T15:44:23Z","published":"2023-07-24T15:44:23Z","title":"Corrections of Zipf's and Heaps' Laws Derived from Hapax Rate Models","summary":" The article introduces corrections to Zipf's and Heaps' laws based on\nsystematic models of the hapax rate. The derivation rests on two assumptions:\nThe first one is the standard urn model which predicts that marginal frequency\ndistributions for shorter texts look as if word tokens were sampled blindly\nfrom a given longer text. The second assumption posits that the rate of hapaxes\nis a simple function of the text size. Four such functions are discussed: the\nconstant model, the Davis model, the linear model, and the logistic model. It\nis shown that the logistic model yields the best fit.\n","authors":["Łukasz Dębowski"],"pdf_url":"https://arxiv.org/pdf/2307.12896v1.pdf","comment":"41 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. 
When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12835v1","updated":"2023-07-24T14:33:49Z","published":"2023-07-24T14:33:49Z","title":"Joint Dropout: Improving Generalizability in Low-Resource Neural Machine\n Translation through Phrase Pair Variables","summary":" Despite the tremendous success of Neural Machine Translation (NMT), its\nperformance on low-resource language pairs still remains subpar, partly due to\nthe limited ability to handle previously unseen inputs, i.e., generalization.\nIn this paper, we propose a method called Joint Dropout, that addresses the\nchallenge of low-resource neural machine translation by substituting phrases\nwith variables, resulting in significant enhancement of compositionality, which\nis a key aspect of generalization. We observe a substantial improvement in\ntranslation quality for language pairs with minimal resources, as seen in BLEU\nand Direct Assessment scores. 
Furthermore, we conduct an error analysis, and\nfind Joint Dropout to also enhance generalizability of low-resource NMT in\nterms of robustness and adaptability across different domains\n","authors":["Ali Araabi","Vlad Niculae","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2307.12835v1.pdf","comment":"Accepted at MT Summit 2023"},{"id":"http://arxiv.org/abs/2307.12803v1","updated":"2023-07-24T13:54:37Z","published":"2023-07-24T13:54:37Z","title":"Guidance in Radiology Report Summarization: An Empirical Evaluation and\n Error Analysis","summary":" Automatically summarizing radiology reports into a concise impression can\nreduce the manual burden of clinicians and improve the consistency of\nreporting. Previous work aimed to enhance content selection and factuality\nthrough guided abstractive summarization. However, two key issues persist.\nFirst, current methods heavily rely on domain-specific resources to extract the\nguidance signal, limiting their transferability to domains and languages where\nthose resources are unavailable. Second, while automatic metrics like ROUGE\nshow progress, we lack a good understanding of the errors and failure modes in\nthis task. To bridge these gaps, we first propose a domain-agnostic guidance\nsignal in form of variable-length extractive summaries. Our empirical results\non two English benchmarks demonstrate that this guidance signal improves upon\nunguided summarization while being competitive with domain-specific methods.\nAdditionally, we run an expert evaluation of four systems according to a\ntaxonomy of 11 fine-grained errors. We find that the most pressing differences\nbetween automatic summaries and those of radiologists relate to content\nselection including omissions (up to 52%) and additions (up to 57%). We\nhypothesize that latent reporting factors and corpus-level inconsistencies may\nlimit models to reliably learn content selection from the available data,\npresenting promising directions for future work.\n","authors":["Jan Trienes","Paul Youssef","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.12803v1.pdf","comment":"Accepted at INLG2023"},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. 
Additionally we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant, and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.12662v4","updated":"2023-07-24T13:22:58Z","published":"2020-11-25T11:44:12Z","title":"XTQA: Span-Level Explanations of the Textbook Question Answering","summary":" Textbook Question Answering (TQA) is a task that one should answer a\ndiagram/non-diagram question given a large multi-modal context consisting of\nabundant essays and diagrams. We argue that the explainability of this task\nshould place students as a key aspect to be considered. To address this issue,\nwe devise a novel architecture towards span-level eXplanations of the TQA\n(XTQA) based on our proposed coarse-to-fine grained algorithm, which can\nprovide not only the answers but also the span-level evidences to choose them\nfor students. This algorithm first coarsely chooses top $M$ paragraphs relevant\nto questions using the TF-IDF method, and then chooses top $K$ evidence spans\nfinely from all candidate spans within these paragraphs by computing the\ninformation gain of each span to questions. Experimental results shows that\nXTQA significantly improves the state-of-the-art performance compared with\nbaselines. The source code is available at\nhttps://github.com/keep-smile-001/opentqa\n","authors":["Jie Ma","Qi Chai","Jun Liu","Qingyu Yin","Pinghui Wang","Qinghua Zheng"],"pdf_url":"https://arxiv.org/pdf/2011.12662v4.pdf","comment":"Accepted by IEEE TNNLS"},{"id":"http://arxiv.org/abs/2307.12759v1","updated":"2023-07-24T13:04:21Z","published":"2023-07-24T13:04:21Z","title":"Code-Switched Urdu ASR for Noisy Telephonic Environment using Data\n Centric Approach with Hybrid HMM and CNN-TDNN","summary":" Call Centers have huge amount of audio data which can be used for achieving\nvaluable business insights and transcription of phone calls is manually tedious\ntask. An effective Automated Speech Recognition system can accurately\ntranscribe these calls for easy search through call history for specific\ncontext and content allowing automatic call monitoring, improving QoS through\nkeyword search and sentiment analysis. ASR for Call Center requires more\nrobustness as telephonic environment are generally noisy. Moreover, there are\nmany low-resourced languages that are on verge of extinction which can be\npreserved with help of Automatic Speech Recognition Technology. Urdu is the\n$10^{th}$ most widely spoken language in the world, with 231,295,440 worldwide\nstill remains a resource constrained language in ASR. Regional call-center\nconversations operate in local language, with a mix of English numbers and\ntechnical terms generally causing a \"code-switching\" problem. Hence, this paper\ndescribes an implementation framework of a resource efficient Automatic Speech\nRecognition/ Speech to Text System in a noisy call-center environment using\nChain Hybrid HMM and CNN-TDNN for Code-Switched Urdu Language. Using Hybrid\nHMM-DNN approach allowed us to utilize the advantages of Neural Network with\nless labelled data. 
Adding CNN with TDNN has shown to work better in noisy\nenvironment due to CNN's additional frequency dimension which captures extra\ninformation from noisy speech, thus improving accuracy. We collected data from\nvarious open sources and labelled some of the unlabelled data after analysing\nits general context and content from Urdu language as well as from commonly\nused words from other languages, primarily English and were able to achieve WER\nof 5.2% with noisy as well as clean environment in isolated words or numbers as\nwell as in continuous spontaneous speech.\n","authors":["Muhammad Danyal Khan","Raheem Ali","Arshad Aziz"],"pdf_url":"https://arxiv.org/pdf/2307.12759v1.pdf","comment":"32 pages, 19 figures, 2 tables, preprint"},{"id":"http://arxiv.org/abs/2305.16731v3","updated":"2023-07-24T11:20:10Z","published":"2023-05-26T08:33:28Z","title":"Automatic Emotion Experiencer Recognition","summary":" The most prominent subtask in emotion analysis is emotion classification; to\nassign a category to a textual unit, for instance a social media post. Many\nresearch questions from the social sciences do, however, not only require the\ndetection of the emotion of an author of a post but to understand who is\nascribed an emotion in text. This task is tackled by emotion role labeling\nwhich aims at extracting who is described in text to experience an emotion,\nwhy, and towards whom. This could, however, be considered overly sophisticated\nif the main question to answer is who feels which emotion. A targeted approach\nfor such setup is to classify emotion experiencer mentions (aka \"emoters\")\nregarding the emotion they presumably perceive. This task is similar to named\nentity recognition of person names with the difference that not every mentioned\nentity name is an emoter. While, very recently, data with emoter annotations\nhas been made available, no experiments have yet been performed to detect such\nmentions. With this paper, we provide baseline experiments to understand how\nchallenging the task is. We further evaluate the impact on experiencer-specific\nemotion categorization and appraisal detection in a pipeline, when gold\nmentions are not available. We show that experiencer detection in text is a\nchallenging task, with a precision of .82 and a recall of .56 (F1 =.66). These\nresults motivate future work of jointly modeling emoter spans and\nemotion/appraisal predictions.\n","authors":["Maximilian Wegge","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2305.16731v3.pdf","comment":"accepted to the CPSS workshop at KONVENS"},{"id":"http://arxiv.org/abs/2307.12659v1","updated":"2023-07-24T10:03:28Z","published":"2023-07-24T10:03:28Z","title":"A Model for Every User and Budget: Label-Free and Personalized\n Mixed-Precision Quantization","summary":" Recent advancement in Automatic Speech Recognition (ASR) has produced large\nAI models, which become impractical for deployment in mobile devices. Model\nquantization is effective to produce compressed general-purpose models, however\nsuch models may only be deployed to a restricted sub-domain of interest. We\nshow that ASR models can be personalized during quantization while relying on\njust a small set of unlabelled samples from the target domain. To this end, we\npropose myQASR, a mixed-precision quantization method that generates tailored\nquantization schemes for diverse users under any memory requirement with no\nfine-tuning. 
myQASR automatically evaluates the quantization sensitivity of\nnetwork layers by analysing the full-precision activation values. We are then\nable to generate a personalised mixed-precision quantization scheme for any\npre-determined memory budget. Results for large-scale ASR models show how\nmyQASR improves performance for specific genders, languages, and speakers.\n","authors":["Edward Fish","Umberto Michieli","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12659v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2301.09790v3","updated":"2023-07-24T10:03:01Z","published":"2023-01-24T02:44:02Z","title":"The Next Chapter: A Study of Large Language Models in Storytelling","summary":" To enhance the quality of generated stories, recent story generation models\nhave been investigating the utilization of higher-level attributes like plots\nor commonsense knowledge. The application of prompt-based learning with large\nlanguage models (LLMs), exemplified by GPT-3, has exhibited remarkable\nperformance in diverse natural language processing (NLP) tasks. This paper\nconducts a comprehensive investigation, utilizing both automatic and human\nevaluation, to compare the story generation capacity of LLMs with recent models\nacross three datasets with variations in style, register, and length of\nstories. The results demonstrate that LLMs generate stories of significantly\nhigher quality compared to other story generation models. Moreover, they\nexhibit a level of performance that competes with human authors, albeit with\nthe preliminary observation that they tend to replicate real stories in\nsituations involving world knowledge, resembling a form of plagiarism.\n","authors":["Zhuohan Xie","Trevor Cohn","Jey Han Lau"],"pdf_url":"https://arxiv.org/pdf/2301.09790v3.pdf","comment":"Accepted to INLG2023"},{"id":"http://arxiv.org/abs/2304.14721v4","updated":"2023-07-24T09:49:55Z","published":"2023-04-28T09:42:18Z","title":"Towards autonomous system: flexible modular production system enhanced\n with large language model agents","summary":" In this paper, we present a novel framework that combines large language\nmodels (LLMs), digital twins and industrial automation system to enable\nintelligent planning and control of production processes. We retrofit the\nautomation system for a modular production facility and create executable\ncontrol interfaces of fine-granular functionalities and coarse-granular skills.\nLow-level functionalities are executed by automation components, and high-level\nskills are performed by automation modules. Subsequently, a digital twin system\nis developed, registering these interfaces and containing additional\ndescriptive information about the production system. Based on the retrofitted\nautomation system and the created digital twins, LLM-agents are designed to\ninterpret descriptive information in the digital twins and control the physical\nsystem through service interfaces. These LLM-agents serve as intelligent agents\non different levels within an automation system, enabling autonomous planning\nand control of flexible production. Given a task instruction as input, the\nLLM-agents orchestrate a sequence of atomic functionalities and skills to\naccomplish the task. 
We demonstrate how our implemented prototype can handle\nun-predefined tasks, plan a production process, and execute the operations.\nThis research highlights the potential of integrating LLMs into industrial\nautomation systems in the context of smart factory for more agile, flexible,\nand adaptive production processes, while it also underscores the critical\ninsights and limitations for future work. Demos at:\nhttps://github.com/YuchenXia/GPT4IndustrialAutomation\n","authors":["Yuchen Xia","Manthan Shenoy","Nasser Jazdi","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2304.14721v4.pdf","comment":"This is the pre-print draft manuscript. The peer-reviewed version\n will be published exclusively by IEEE after the conference, which is set to\n take place from September 12th to 15th, 2023. We've made several improvements\n to the final version of the paper based on valuable feedback and suggestions\n from other researchers"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2210.04676v2","updated":"2023-07-24T09:00:03Z","published":"2022-10-10T13:26:45Z","title":"Learning \"O\" Helps for Learning More: Handling the Concealed Entity\n Problem for Class-incremental NER","summary":" As the categories of named entities rapidly increase, the deployed NER models\nare required to keep updating toward recognizing more entity types, creating a\ndemand for class-incremental learning for NER. Considering the privacy concerns\nand storage constraints, the standard paradigm for class-incremental NER\nupdates the models with training data only annotated with the new classes, yet\nthe entities from other entity classes are unlabeled, regarded as \"Non-entity\"\n(or \"O\"). In this work, we conduct an empirical study on the \"Unlabeled Entity\nProblem\" and find that it leads to severe confusion between \"O\" and entities,\ndecreasing class discrimination of old classes and declining the model's\nability to learn new classes. 
To solve the Unlabeled Entity Problem, we propose\na novel representation learning method to learn discriminative representations\nfor the entity classes and \"O\". Specifically, we propose an entity-aware\ncontrastive learning method that adaptively detects entity clusters in \"O\".\nFurthermore, we propose two effective distance-based relabeling strategies for\nbetter learning the old classes. We introduce a more realistic and challenging\nbenchmark for class-incremental NER, and the proposed method achieves up to\n10.62\\% improvement over the baseline methods.\n","authors":["Ruotian Ma","Xuanting Chen","Lin Zhang","Xin Zhou","Junzhe Wang","Tao Gui","Qi Zhang","Xiang Gao","Yunwen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.04676v2.pdf","comment":"Accepted by ACL 2023"},{"id":"http://arxiv.org/abs/2306.16108v2","updated":"2023-07-24T08:14:44Z","published":"2023-06-28T11:24:48Z","title":"Is ChatGPT a Biomedical Expert? -- Exploring the Zero-Shot Performance\n of Current GPT Models in Biomedical Tasks","summary":" We assessed the performance of commercial Large Language Models (LLMs)\nGPT-3.5-Turbo and GPT-4 on tasks from the 2023 BioASQ challenge. In Task 11b\nPhase B, which is focused on answer generation, both models demonstrated\ncompetitive abilities with leading systems. Remarkably, they achieved this with\nsimple zero-shot learning, grounded with relevant snippets. Even without\nrelevant snippets, their performance was decent, though not on par with the\nbest systems. Interestingly, the older and cheaper GPT-3.5-Turbo system was\nable to compete with GPT-4 in the grounded Q&A setting on factoid and list\nanswers. In Task 11b Phase A, focusing on retrieval, query expansion through\nzero-shot learning improved performance, but the models fell short compared to\nother systems. The code needed to rerun these experiments is available through\nGitHub.\n","authors":["Samy Ateia","Udo Kruschwitz"],"pdf_url":"https://arxiv.org/pdf/2306.16108v2.pdf","comment":"Preprint accepted at the 11th BioASQ Workshop at the 14th Conference\n and Labs of the Evaluation Forum (CLEF) 2023; Changes: 1. Added related work\n and experimental setup sections. 2. Reworked discussion and future work\n section. 3. Fixed multiple typos and improved style. Changed license"},{"id":"http://arxiv.org/abs/2307.12573v1","updated":"2023-07-24T07:40:59Z","published":"2023-07-24T07:40:59Z","title":"Tachikuma: Understading Complex Interactions with Multi-Character and\n Novel Objects by Large Language Models","summary":" Recent advancements in natural language and Large Language Models (LLMs) have\nenabled AI agents to simulate human-like interactions within virtual worlds.\nHowever, these interactions still face limitations in complexity and\nflexibility, particularly in scenarios involving multiple characters and novel\nobjects. Pre-defining all interactable objects in the agent's world model\npresents challenges, and conveying implicit intentions to multiple characters\nthrough complex interactions remains difficult. To address these issues, we\npropose integrating virtual Game Masters (GMs) into the agent's world model,\ndrawing inspiration from Tabletop Role-Playing Games (TRPGs). GMs play a\ncrucial role in overseeing information, estimating players' intentions,\nproviding environment descriptions, and offering feedback, compensating for\ncurrent world model deficiencies. 
To facilitate future explorations for complex\ninteractions, we introduce a benchmark named Tachikuma, comprising a Multiple\ncharacter and novel Object based interaction Estimation (MOE) task and a\nsupporting dataset. MOE challenges models to understand characters' intentions\nand accurately determine their actions within intricate contexts involving\nmulti-character and novel object interactions. Besides, the dataset captures\nlog data from real-time communications during gameplay, providing diverse,\ngrounded, and complex interactions for further explorations. Finally, we\npresent a simple prompting baseline and evaluate its performance, demonstrating\nits effectiveness in enhancing interaction understanding. We hope that our\ndataset and task will inspire further research in complex interactions with\nnatural language, fostering the development of more advanced AI agents.\n","authors":["Yuanzhi Liang","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12573v1.pdf","comment":"Preliminary version of an ongoing work"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decays. Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.11578v2","updated":"2023-07-24T06:53:10Z","published":"2021-03-22T04:44:43Z","title":"SparseGAN: Sparse Generative Adversarial Network for Text Generation","summary":" It is still a challenging task to learn a neural text generation model under\nthe framework of generative adversarial networks (GANs) since the entire\ntraining process is not differentiable. The existing training strategies either\nsuffer from unreliable gradient estimations or imprecise sentence\nrepresentations. Inspired by the principle of sparse coding, we propose a\nSparseGAN that generates semantic-interpretable, but sparse sentence\nrepresentations as inputs to the discriminator. The key idea is that we treat\nan embedding matrix as an over-complete dictionary, and use a linear\ncombination of very few selected word embeddings to approximate the output\nfeature representation of the generator at each time step. 
With such\nsemantic-rich representations, we not only reduce unnecessary noises for\nefficient adversarial training, but also make the entire training process fully\ndifferentiable. Experiments on multiple text generation datasets yield\nperformance improvements, especially in sequence-level metrics, such as BLEU.\n","authors":["Liping Yuan","Jiehang Zeng","Xiaoqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2103.11578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2009.04639v2","updated":"2023-07-24T03:56:31Z","published":"2020-09-10T02:22:21Z","title":"Improving Coreference Resolution by Leveraging Entity-Centric Features\n with Graph Neural Networks and Second-order Inference","summary":" One of the major challenges in coreference resolution is how to make use of\nentity-level features defined over clusters of mentions rather than mention\npairs. However, coreferent mentions usually spread far apart in an entire text,\nwhich makes it extremely difficult to incorporate entity-level features. We\npropose a graph neural network-based coreference resolution method that can\ncapture the entity-centric information by encouraging the sharing of features\nacross all mentions that probably refer to the same real-world entity. Mentions\nare linked to each other via the edges modeling how likely two linked mentions\npoint to the same entity. Modeling by such graphs, the features between\nmentions can be shared by message passing operations in an entity-centric\nmanner. A global inference algorithm up to second-order features is also\npresented to optimally cluster mentions into consistent groups. Experimental\nresults show our graph neural network-based method combing with the\nsecond-order decoding algorithm (named GNNCR) achieved close to\nstate-of-the-art performance on the English CoNLL-2012 Shared Task dataset.\n","authors":["Lu Liu","Zhenqiao Song","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.04639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12507v1","updated":"2023-07-24T03:44:17Z","published":"2023-07-24T03:44:17Z","title":"Investigating the Existence of \"Secret Language'' in Language Models","summary":" In this paper, we study the problem of secret language in NLP, where current\nlanguage models (LMs) seem to have a hidden vocabulary that allows them to\ninterpret absurd inputs as meaningful concepts. We investigate two research\nquestions: ``Does the secret language phenomenon exist in different language\nmodels?'' and ``Does secret language depend on specific context?'' To answer\nthese questions, we introduce a novel method named \\textit{SecretFinding}, a\ngradient-based approach that can automatically discover secret languages in\nLMs. We conduct experiments on five representative models (Electra, ALBERT,\nRoberta, DistillBERT, and CLIP) finetuned on four NLP benchmarks (SST-2, MRPC,\nSNLI, and SQuAD) and a language-grounding benchmark (MSCOCO). Our experimental\nresults show that even when we replace the most important words with others\nthat are semantically dissimilar to the original words in a sentence, LMs do\nnot consider the new sentence semantically dissimilar to the original, as the\noutput does not change with a high probability. This phenomenon holds true\nacross the five models and five tasks and gives a positive answer to the first\nresearch question. 
As for the second research question, we find that the secret\nlanguage discovered by \\textit{SecretFinding} is quite general and could even\nbe transferred to other models in the black-box settings, such as GPT-3 and\nChatGPT. Finally, we discuss the causes of secret language, how to eliminate\nit, the potential connection to memorization, and ethical implications.\nExamples of secret language found by SecretFinding are available on\nhttps://huggingface.co/spaces/anonymousauthors/ACL23_SecretLanguage.\n","authors":["Yimu Wang","Peng Shi","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13040v3","updated":"2023-07-24T03:31:42Z","published":"2023-05-22T13:47:51Z","title":"SpokenWOZ: A Large-Scale Speech-Text Benchmark for Spoken Task-Oriented\n Dialogue Agents","summary":" Task-oriented dialogue (TOD) models have made significant progress in recent\nyears. However, previous studies primarily focus on datasets written by\nannotators, which has resulted in a gap between academic research and\nreal-world spoken conversation scenarios. While several small-scale spoken TOD\ndatasets are proposed to address robustness issues such as ASR errors, they\nignore the unique challenges in spoken conversation. To tackle the limitations,\nwe introduce SpokenWOZ, a large-scale speech-text dataset for spoken TOD,\ncontaining 8 domains, 203k turns, 5.7k dialogues and 249 hours of audios from\nhuman-to-human spoken conversations. SpokenWOZ further incorporates common\nspoken characteristics such as word-by-word processing and reasoning in spoken\nlanguage. Based on these characteristics, we present cross-turn slot and\nreasoning slot detection as new challenges. We conduct experiments on various\nbaselines, including text-modal models, newly proposed dual-modal models, and\nLLMs, e.g., ChatGPT. The results show that the current models still have\nsubstantial room for improvement in spoken conversation, where the most\nadvanced dialogue state tracker only achieves 25.65% in joint goal accuracy and\nthe SOTA end-to-end model only correctly completes the user request in 52.1% of\ndialogues. The dataset, code, and leaderboard are available:\nhttps://spokenwoz.github.io/SpokenWOZ-github.io/.\n","authors":["Shuzheng Si","Wentao Ma","Haoyu Gao","Yuchuan Wu","Ting-En Lin","Yinpei Dai","Hangyu Li","Rui Yan","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2305.13040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.07481v2","updated":"2023-07-24T03:26:17Z","published":"2020-09-16T05:58:00Z","title":"Unsupervised Summarization by Jointly Extracting Sentences and Keywords","summary":" We present RepRank, an unsupervised graph-based ranking model for extractive\nmulti-document summarization in which the similarity between words, sentences,\nand word-to-sentence can be estimated by the distances between their vector\nrepresentations in a unified vector space. In order to obtain desirable\nrepresentations, we propose a self-attention based learning method that\nrepresent a sentence by the weighted sum of its word embeddings, and the\nweights are concentrated to those words hopefully better reflecting the content\nof a document. We show that salient sentences and keywords can be extracted in\na joint and mutual reinforcement process using our learned representations, and\nprove that this process always converges to a unique solution leading to\nimprovement in performance. 
A variant of absorbing random walk and the\ncorresponding sampling-based algorithm are also described to avoid redundancy\nand increase diversity in the summaries. Experiment results with multiple\nbenchmark datasets show that RepRank achieved the best or comparable\nperformance in ROUGE.\n","authors":["Zongyi Li","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.07481v2.pdf","comment":"10 pages(includes 2 pages references), 1 figure"},{"id":"http://arxiv.org/abs/2307.12498v1","updated":"2023-07-24T03:07:40Z","published":"2023-07-24T03:07:40Z","title":"Robust Automatic Speech Recognition via WavAugment Guided Phoneme\n Adversarial Training","summary":" Developing a practically-robust automatic speech recognition (ASR) is\nchallenging since the model should not only maintain the original performance\non clean samples, but also achieve consistent efficacy under small volume\nperturbations and large domain shifts. To address this problem, we propose a\nnovel WavAugment Guided Phoneme Adversarial Training (wapat). wapat use\nadversarial examples in phoneme space as augmentation to make the model\ninvariant to minor fluctuations in phoneme representation and preserve the\nperformance on clean samples. In addition, wapat utilizes the phoneme\nrepresentation of augmented samples to guide the generation of adversaries,\nwhich helps to find more stable and diverse gradient-directions, resulting in\nimproved generalization. Extensive experiments demonstrate the effectiveness of\nwapat on End-to-end Speech Challenge Benchmark (ESB). Notably, SpeechLM-wapat\noutperforms the original model by 6.28% WER reduction on ESB, achieving the new\nstate-of-the-art.\n","authors":["Gege Qi","Yuefeng Chen","Xiaofeng Mao","Xiaojun Jia","Ranjie Duan","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11610v2","updated":"2023-07-24T01:35:47Z","published":"2023-07-21T14:25:39Z","title":"CausE: Towards Causal Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) focuses on representing the entities and\nrelations of a knowledge graph (KG) into the continuous vector spaces, which\ncan be employed to predict the missing triples to achieve knowledge graph\ncompletion (KGC). However, KGE models often only briefly learn structural\ncorrelations of triple data and embeddings would be misled by the trivial\npatterns and noisy links in real-world KGs. To address this issue, we build the\nnew paradigm of KGE in the context of causality and embedding disentanglement.\nWe further propose a Causality-enhanced knowledge graph Embedding (CausE)\nframework. CausE employs causal intervention to estimate the causal effect of\nthe confounder embeddings and design new training objectives to make stable\npredictions. Experimental results demonstrate that CausE could outperform the\nbaseline models and achieve state-of-the-art KGC performance. 
We release our\ncode in https://github.com/zjukg/CausE.\n","authors":["Yichi Zhang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11610v2.pdf","comment":"Accepted by CCKS 2023 as a research paper"},{"id":"http://arxiv.org/abs/2306.14096v4","updated":"2023-07-24T00:58:11Z","published":"2023-06-25T02:24:30Z","title":"Chinese Fine-Grained Financial Sentiment Analysis with Large Language\n Models","summary":" Entity-level fine-grained sentiment analysis in the financial domain is a\ncrucial subtask of sentiment analysis and currently faces numerous challenges.\nThe primary challenge stems from the lack of high-quality and large-scale\nannotated corpora specifically designed for financial text sentiment analysis,\nwhich in turn limits the availability of data necessary for developing\neffective text processing techniques. Recent advancements in large language\nmodels (LLMs) have yielded remarkable performance in natural language\nprocessing tasks, primarily centered around language pattern matching. In this\npaper, we propose a novel and extensive Chinese fine-grained financial\nsentiment analysis dataset, FinChina SA, for enterprise early warning. We\nthoroughly evaluate and experiment with well-known existing open-source LLMs\nusing our dataset. We firmly believe that our dataset will serve as a valuable\nresource to advance the exploration of real-world financial sentiment analysis\ntasks, which should be the focus of future research. The FinChina SA dataset is\npublicly available at https://github.com/YerayL/FinChina-SA\n","authors":["Yinyu Lan","Yanru Wu","Wang Xu","Weiqiang Feng","Youhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14096v4.pdf","comment":"FinLLM Symposium at IJCAI 2023"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.02591v2","updated":"2023-07-24T00:47:23Z","published":"2023-07-05T18:41:29Z","title":"ODD: A Benchmark Dataset for the NLP-based Opioid Related Aberrant\n Behavior Detection","summary":" Opioid related aberrant behaviors (ORAB) present novel risk factors for\nopioid overdose. 
Previously, ORAB have been mainly assessed by survey results\nand by monitoring drug administrations. Such methods however, cannot scale up\nand do not cover the entire spectrum of aberrant behaviors. On the other hand,\nORAB are widely documented in electronic health record notes. This paper\nintroduces a novel biomedical natural language processing benchmark dataset\nnamed ODD, for ORAB Detection Dataset. ODD is an expert-annotated dataset\ncomprising of more than 750 publicly available EHR notes. ODD has been designed\nto identify ORAB from patients' EHR notes and classify them into nine\ncategories; 1) Confirmed Aberrant Behavior, 2) Suggested Aberrant Behavior, 3)\nOpioids, 4) Indication, 5) Diagnosed opioid dependency, 6) Benzodiapines, 7)\nMedication Changes, 8) Central Nervous System-related, and 9) Social\nDeterminants of Health. We explored two state-of-the-art natural language\nprocessing (NLP) models (finetuning pretrained language models and\nprompt-tuning approaches) to identify ORAB. Experimental results show that the\nprompt-tuning models outperformed the finetuning models in most cateogories and\nthe gains were especially higher among uncommon categories (Suggested aberrant\nbehavior, Diagnosed opioid dependency and Medication change). Although the best\nmodel achieved the highest 83.92% on area under precision recall curve,\nuncommon classes (Suggested Aberrant Behavior, Diagnosed Opioid Dependence, and\nMedication Change) still have a large room for performance improvement.\n","authors":["Sunjae Kwon","Xun Wang","Weisong Liu","Emily Druhl","Minhee L. Sung","Joel I. Reisman","Wenjun Li","Robert D. Kerns","William Becker","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2307.02591v2.pdf","comment":"Under review"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. 
Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12980v1","updated":"2023-07-24T17:58:06Z","published":"2023-07-24T17:58:06Z","title":"A Systematic Survey of Prompt Engineering on Vision-Language Foundation\n Models","summary":" Prompt engineering is a technique that involves augmenting a large\npre-trained model with task-specific hints, known as prompts, to adapt the\nmodel to new tasks. Prompts can be created manually as natural language\ninstructions or generated automatically as either natural language instructions\nor vector representations. Prompt engineering enables the ability to perform\npredictions based solely on prompts without updating model parameters, and the\neasier application of large pre-trained models in real-world tasks. In past\nyears, Prompt engineering has been well-studied in natural language processing.\nRecently, it has also been intensively studied in vision-language modeling.\nHowever, there is currently a lack of a systematic overview of prompt\nengineering on pre-trained vision-language models. 
This paper aims to provide a\ncomprehensive survey of cutting-edge research in prompt engineering on three\ntypes of vision-language models: multimodal-to-text generation models (e.g.\nFlamingo), image-text matching models (e.g. CLIP), and text-to-image generation\nmodels (e.g. Stable Diffusion). For each type of model, a brief model summary,\nprompting methods, prompting-based applications, and the corresponding\nresponsibility and integrity issues are summarized and discussed. Furthermore,\nthe commonalities and differences between prompting on vision-language models,\nlanguage models, and vision models are also discussed. The challenges, future\ndirections, and research opportunities are summarized to foster future research\non this topic.\n","authors":["Jindong Gu","Zhen Han","Shuo Chen","Ahmad Beirami","Bailan He","Gengyuan Zhang","Ruotong Liao","Yao Qin","Volker Tresp","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.12980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12972v1","updated":"2023-07-24T17:49:11Z","published":"2023-07-24T17:49:11Z","title":"DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting","summary":" In this paper, we propose a new operator, called 3D DeFormable Attention\n(DFA3D), for 2D-to-3D feature lifting, which transforms multi-view 2D image\nfeatures into a unified 3D space for 3D object detection. Existing feature\nlifting approaches, such as Lift-Splat-based and 2D attention-based, either use\nestimated depth to get pseudo LiDAR features and then splat them to a 3D space,\nwhich is a one-pass operation without feature refinement, or ignore depth and\nlift features by 2D attention mechanisms, which achieve finer semantics while\nsuffering from a depth ambiguity problem. In contrast, our DFA3D-based method\nfirst leverages the estimated depth to expand each view's 2D feature map to 3D\nand then utilizes DFA3D to aggregate features from the expanded 3D feature\nmaps. With the help of DFA3D, the depth ambiguity problem can be effectively\nalleviated from the root, and the lifted features can be progressively refined\nlayer by layer, thanks to the Transformer-like architecture. In addition, we\npropose a mathematically equivalent implementation of DFA3D which can\nsignificantly improve its memory efficiency and computational speed. We\nintegrate DFA3D into several methods that use 2D attention-based feature\nlifting with only a few modifications in code and evaluate on the nuScenes\ndataset. The experiment results show a consistent improvement of +1.41\\% mAP on\naverage, and up to +15.1\\% mAP improvement when high-quality depth information\nis available, demonstrating the superiority, applicability, and huge potential\nof DFA3D. The code is available at\nhttps://github.com/IDEA-Research/3D-deformable-attention.git.\n","authors":["Hongyang Li","Hao Zhang","Zhaoyang Zeng","Shilong Liu","Feng Li","Tianhe Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12970v1","updated":"2023-07-24T17:49:04Z","published":"2023-07-24T17:49:04Z","title":"Volcanic ash delimitation using Artificial Intelligence based on Pix2Pix","summary":" Volcanic eruptions emit ash that can be harmful to human health and cause\ndamage to infrastructure, economic activities and the environment. The\ndelimitation of ash clouds allows to know their behavior and dispersion, which\nhelps in the prevention and mitigation of this phenomenon. 
Traditional methods\ntake advantage of specialized software programs to process the bands or\nchannels that compose the satellite images. However, their use is limited to\nexperts and demands a lot of time and significant computational resources. In\nrecent years, Artificial Intelligence has been a milestone in the computational\ntreatment of complex problems in different areas. In particular, Deep Learning\ntechniques allow automatic, fast and accurate processing of digital images. The\npresent work proposes the use of the Pix2Pix model, a type of generative\nadversarial network that, once trained, learns the mapping of input images to\noutput images. The architecture of such a network consisting of a generator and\na discriminator provides the versatility needed to produce black and white ash\ncloud images from multispectral satellite images. The evaluation of the model,\nbased on loss and accuracy plots, a confusion matrix, and visual inspection,\nindicates a satisfactory solution for accurate ash cloud delineation,\napplicable in any area of the world and becomes a useful tool in risk\nmanagement.\n","authors":["Christian Carrillo","Gissela Torres","Christian Mejia-Escobar"],"pdf_url":"https://arxiv.org/pdf/2307.12970v1.pdf","comment":"18 pages, in Spanish language, 15 figures"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. 
Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2307.12964v1","updated":"2023-07-24T17:43:13Z","published":"2023-07-24T17:43:13Z","title":"Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature\n Alignment","summary":" Text-to-video retrieval systems have recently made significant progress by\nutilizing pre-trained models trained on large-scale image-text pairs. However,\nmost of the latest methods primarily focus on the video modality while\ndisregarding the audio signal for this task. Nevertheless, a recent advancement\nby ECLIPSE has improved long-range text-to-video retrieval by developing an\naudiovisual video representation. Nonetheless, the objective of the\ntext-to-video retrieval task is to capture the complementary audio and video\ninformation that is pertinent to the text query rather than simply achieving\nbetter audio and video alignment. To address this issue, we introduce TEFAL, a\nTExt-conditioned Feature ALignment method that produces both audio and video\nrepresentations conditioned on the text query. Instead of using only an\naudiovisual attention block, which could suppress the audio information\nrelevant to the text query, our approach employs two independent cross-modal\nattention blocks that enable the text to attend to the audio and video\nrepresentations separately. Our proposed method's efficacy is demonstrated on\nfour benchmark datasets that include audio: MSR-VTT, LSMDC, VATEX, and\nCharades, and achieves better than state-of-the-art performance consistently\nacross the four datasets. This is attributed to the additional\ntext-query-conditioned audio representation and the complementary information\nit adds to the text-query-conditioned video representation.\n","authors":["Sarah Ibrahimi","Xiaohang Sun","Pichao Wang","Amanmeet Garg","Ashutosh Sanan","Mohamed Omar"],"pdf_url":"https://arxiv.org/pdf/2307.12964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. 
Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.12917v1","updated":"2023-07-24T16:18:22Z","published":"2023-07-24T16:18:22Z","title":"Hierarchical Skeleton Meta-Prototype Contrastive Learning with Hard\n Skeleton Mining for Unsupervised Person Re-Identification","summary":" With rapid advancements in depth sensors and deep learning, skeleton-based\nperson re-identification (re-ID) models have recently achieved remarkable\nprogress with many advantages. Most existing solutions learn single-level\nskeleton features from body joints with the assumption of equal skeleton\nimportance, while they typically lack the ability to exploit more informative\nskeleton features from various levels such as limb level with more global body\npatterns. The label dependency of these methods also limits their flexibility\nin learning more general skeleton representations. This paper proposes a\ngeneric unsupervised Hierarchical skeleton Meta-Prototype Contrastive learning\n(Hi-MPC) approach with Hard Skeleton Mining (HSM) for person re-ID with\nunlabeled 3D skeletons. Firstly, we construct hierarchical representations of\nskeletons to model coarse-to-fine body and motion features from the levels of\nbody joints, components, and limbs. Then a hierarchical meta-prototype\ncontrastive learning model is proposed to cluster and contrast the most typical\nskeleton features (\"prototypes\") from different-level skeletons. By converting\noriginal prototypes into meta-prototypes with multiple homogeneous\ntransformations, we induce the model to learn the inherent consistency of\nprototypes to capture more effective skeleton features for person re-ID.\nFurthermore, we devise a hard skeleton mining mechanism to adaptively infer the\ninformative importance of each skeleton, so as to focus on harder skeletons to\nlearn more discriminative skeleton representations. Extensive evaluations on\nfive datasets demonstrate that our approach outperforms a wide variety of\nstate-of-the-art skeleton-based methods. We further show the general\napplicability of our method to cross-view person re-ID and RGB-based scenarios\nwith estimated skeletons.\n","authors":["Haocong Rao","Cyril Leung","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2307.12917v1.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV). Codes\n are available at https://github.com/Kali-Hac/Hi-MPC. Supplemental materials\n will be included in the published version"},{"id":"http://arxiv.org/abs/2307.12914v1","updated":"2023-07-24T16:13:43Z","published":"2023-07-24T16:13:43Z","title":"Towards a Visual-Language Foundation Model for Computational Pathology","summary":" The accelerated adoption of digital pathology and advances in deep learning\nhave enabled the development of powerful models for various pathology tasks\nacross a diverse array of diseases and patient cohorts. However, model training\nis often difficult due to label scarcity in the medical domain and the model's\nusage is limited by the specific task and disease for which it is trained.\nAdditionally, most models in histopathology leverage only image data, a stark\ncontrast to how humans teach each other and reason about histopathologic\nentities. 
We introduce CONtrastive learning from Captions for Histopathology\n(CONCH), a visual-language foundation model developed using diverse sources of\nhistopathology images, biomedical text, and notably over 1.17 million\nimage-caption pairs via task-agnostic pretraining. Evaluated on a suite of 13\ndiverse benchmarks, CONCH can be transferred to a wide range of downstream\ntasks involving either or both histopathology images and text, achieving\nstate-of-the-art performance on histology image classification, segmentation,\ncaptioning, text-to-image and image-to-text retrieval. CONCH represents a\nsubstantial leap over concurrent visual-language pretrained systems for\nhistopathology, with the potential to directly facilitate a wide array of\nmachine learning-based workflows requiring minimal or no further supervised\nfine-tuning.\n","authors":["Ming Y. Lu","Bowen Chen","Drew F. K. Williamson","Richard J. Chen","Ivy Liang","Tong Ding","Guillaume Jaume","Igor Odintsov","Andrew Zhang","Long Phi Le","Georg Gerber","Anil V Parwani","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2307.12914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12909v1","updated":"2023-07-24T16:08:32Z","published":"2023-07-24T16:08:32Z","title":"Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields","summary":" Recently, the editing of neural radiance fields (NeRFs) has gained\nconsiderable attention, but most prior works focus on static scenes while\nresearch on the appearance editing of dynamic scenes is relatively lacking. In\nthis paper, we propose a novel framework to edit the local appearance of\ndynamic NeRFs by manipulating pixels in a single frame of training video.\nSpecifically, to locally edit the appearance of dynamic NeRFs while preserving\nunedited regions, we introduce a local surface representation of the edited\nregion, which can be inserted into and rendered along with the original NeRF\nand warped to arbitrary other frames through a learned invertible motion\nrepresentation network. By employing our method, users without professional\nexpertise can easily add desired content to the appearance of a dynamic scene.\nWe extensively evaluate our approach on various scenes and show that our\napproach achieves spatially and temporally consistent editing results. Notably,\nour approach is versatile and applicable to different variants of dynamic NeRF\nrepresentations.\n","authors":["Shangzhan Zhang","Sida Peng","Yinji ShenTu","Qing Shuai","Tianrun Chen","Kaicheng Yu","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12909v1.pdf","comment":"project page: https://dyn-e.github.io/"},{"id":"http://arxiv.org/abs/2307.12907v1","updated":"2023-07-24T16:02:42Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. 
From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12900v1","updated":"2023-07-24T15:47:21Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Temporal\n Dynamics of Spiking Neurons","summary":" Event-based sensors, with their high temporal resolution (1us) and dynamical\nrange (120dB), have the potential to be deployed in high-speed platforms such\nas vehicles and drones. However, the highly sparse and fluctuating nature of\nevents poses challenges for conventional object detection techniques based on\nArtificial Neural Networks (ANNs). In contrast, Spiking Neural Networks (SNNs)\nare well-suited for representing event-based data due to their inherent\ntemporal dynamics. In particular, we demonstrate that the membrane potential\ndynamics can modulate network activity upon fluctuating events and strengthen\nfeatures of sparse input. In addition, the spike-triggered adaptive threshold\ncan stabilize training which further improves network performance. Based on\nthis, we develop an efficient spiking feature pyramid network for event-based\nobject detection. Our proposed SNN outperforms previous SNNs and sophisticated\nANNs with attention mechanisms, achieving a mean average precision (map50) of\n47.7% on the Gen1 benchmark dataset. This result significantly surpasses the\nprevious best SNN by 9.7% and demonstrates the potential of SNNs for\nevent-based vision. Our model has a concise architecture while maintaining high\naccuracy and much lower computation cost as a result of sparse computation. Our\ncode will be publicly available.\n","authors":["Hu Zhang","Luziwei Leng","Kaiwei Che","Qian Liu","Jie Cheng","Qinghai Guo","Jiangxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. 
Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non diffusion model to improve the efficiency and accuracy of substitute\ntraining. Despite the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. By utilizing this data, it is possible to\ntrain substitute model that closely resemble the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12868v1","updated":"2023-07-24T15:06:42Z","published":"2023-07-24T15:06:42Z","title":"Understanding the Latent Space of Diffusion Models through the Lens of\n Riemannian Geometry","summary":" Despite the success of diffusion models (DMs), we still lack a thorough\nunderstanding of their latent space. To understand the latent space\n$\\mathbf{x}_t \\in \\mathcal{X}$, we analyze them from a geometrical perspective.\nSpecifically, we utilize the pullback metric to find the local latent basis in\n$\\mathcal{X}$ and their corresponding local tangent basis in $\\mathcal{H}$, the\nintermediate feature maps of DMs. The discovered latent basis enables\nunsupervised image editing capability through latent space traversal. We\ninvestigate the discovered structure from two perspectives. First, we examine\nhow geometric structure evolves over diffusion timesteps. Through analysis, we\nshow that 1) the model focuses on low-frequency components early in the\ngenerative process and attunes to high-frequency details later; 2) At early\ntimesteps, different samples share similar tangent spaces; and 3) The simpler\ndatasets that DMs trained on, the more consistent the tangent space for each\ntimestep. 
Second, we investigate how the geometric structure changes based on\ntext conditioning in Stable Diffusion. The results show that 1) similar prompts\nyield comparable tangent spaces; and 2) the model depends less on text\nconditions in later timesteps. To the best of our knowledge, this paper is the\nfirst to present image editing through $\\mathbf{x}$-space traversal and provide\nthorough analyses of the latent structure of DMs.\n","authors":["Yong-Hyun Park","Mingi Kwon","Jaewoong Choi","Junghyo Jo","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2307.12868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09224v2","updated":"2023-07-24T15:05:55Z","published":"2023-06-15T16:03:01Z","title":"Encyclopedic VQA: Visual questions about detailed properties of\n fine-grained categories","summary":" We propose Encyclopedic-VQA, a large scale visual question answering (VQA)\ndataset featuring visual questions about detailed properties of fine-grained\ncategories and instances. It contains 221k unique question+answer pairs each\nmatched with (up to) 5 images, resulting in a total of 1M VQA samples.\nMoreover, our dataset comes with a controlled knowledge base derived from\nWikipedia, marking the evidence to support each answer. Empirically, we show\nthat our dataset poses a hard challenge for large vision+language models as\nthey perform poorly on our dataset: PaLI [14] is state-of-the-art on OK-VQA\n[37], yet it only achieves 13.0% accuracy on our dataset. Moreover, we\nexperimentally show that progress on answering our encyclopedic questions can\nbe achieved by augmenting large models with a mechanism that retrieves relevant\ninformation from the knowledge base. An oracle experiment with perfect\nretrieval achieves 87.0% accuracy on the single-hop portion of our dataset, and\nan automatic retrieval-augmented prototype yields 48.8%. We believe that our\ndataset enables future research on retrieval-augmented vision+language models.\nIt is available at\nhttps://github.com/google-research/google-research/tree/master/encyclopedic_vqa .\n","authors":["Thomas Mensink","Jasper Uijlings","Lluis Castrejon","Arushi Goel","Felipe Cadar","Howard Zhou","Fei Sha","André Araujo","Vittorio Ferrari"],"pdf_url":"https://arxiv.org/pdf/2306.09224v2.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2307.12858v1","updated":"2023-07-24T14:57:40Z","published":"2023-07-24T14:57:40Z","title":"Treatment Outcome Prediction for Intracerebral Hemorrhage via Generative\n Prognostic Model with Imaging and Tabular Data","summary":" Intracerebral hemorrhage (ICH) is the second most common and deadliest form\nof stroke. Despite medical advances, predicting treat ment outcomes for ICH\nremains a challenge. This paper proposes a novel prognostic model that utilizes\nboth imaging and tabular data to predict treatment outcome for ICH. Our model\nis trained on observational data collected from non-randomized controlled\ntrials, providing reliable predictions of treatment success. Specifically, we\npropose to employ a variational autoencoder model to generate a low-dimensional\nprognostic score, which can effectively address the selection bias resulting\nfrom the non-randomized controlled trials. Importantly, we develop a\nvariational distributions combination module that combines the information from\nimaging data, non-imaging clinical data, and treatment assignment to accurately\ngenerate the prognostic score. We conducted extensive experiments on a\nreal-world clinical dataset of intracerebral hemorrhage. 
Our proposed method\ndemonstrates a substantial improvement in treatment outcome prediction compared\nto existing state-of-the-art approaches. Code is available at\nhttps://github.com/med-air/TOP-GPM\n","authors":["Wenao Ma","Cheng Chen","Jill Abrigo","Calvin Hoi-Kwan Mak","Yuqi Gong","Nga Yan Chan","Chu Han","Zaiyi Liu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12854v1","updated":"2023-07-24T14:55:15Z","published":"2023-07-24T14:55:15Z","title":"Multiscale Video Pretraining for Long-Term Activity Forecasting","summary":" Long-term activity forecasting is an especially challenging research problem\nbecause it requires understanding the temporal relationships between observed\nactions, as well as the variability and complexity of human activities. Despite\nrelying on strong supervision via expensive human annotations, state-of-the-art\nforecasting approaches often generalize poorly to unseen data. To alleviate\nthis issue, we propose Multiscale Video Pretraining (MVP), a novel\nself-supervised pretraining approach that learns robust representations for\nforecasting by learning to predict contextualized representations of future\nvideo clips over multiple timescales. MVP is based on our observation that\nactions in videos have a multiscale nature, where atomic actions typically\noccur at a short timescale and more complex actions may span longer timescales.\nWe compare MVP to state-of-the-art self-supervised video learning approaches on\ndownstream long-term forecasting tasks including long-term action anticipation\nand video summary prediction. Our comprehensive experiments across the Ego4D\nand Epic-Kitchens-55/100 datasets demonstrate that MVP out-performs\nstate-of-the-art methods by significant margins. Notably, MVP obtains a\nrelative performance gain of over 20% accuracy in video summary forecasting\nover existing methods.\n","authors":["Reuben Tan","Matthias De Lange","Michael Iuzzolino","Bryan A. Plummer","Kate Saenko","Karl Ridgeway","Lorenzo Torresani"],"pdf_url":"https://arxiv.org/pdf/2307.12854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11630v3","updated":"2023-07-24T14:53:51Z","published":"2023-03-21T06:54:18Z","title":"BoxSnake: Polygonal Instance Segmentation with Box Supervision","summary":" Box-supervised instance segmentation has gained much attention as it requires\nonly simple box annotations instead of costly mask or polygon annotations.\nHowever, existing box-supervised instance segmentation models mainly focus on\nmask-based frameworks. We propose a new end-to-end training technique, termed\nBoxSnake, to achieve effective polygonal instance segmentation using only box\nannotations for the first time. Our method consists of two loss functions: (1)\na point-based unary loss that constrains the bounding box of predicted polygons\nto achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss\nthat encourages the predicted polygons to fit the object boundaries. Compared\nwith the mask-based weakly-supervised methods, BoxSnake further reduces the\nperformance gap between the predicted segmentation and the bounding box, and\nshows significant superiority on the Cityscapes dataset. 
The code has been\navailable publicly.\n","authors":["Rui Yang","Lin Song","Yixiao Ge","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2303.11630v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12853v1","updated":"2023-07-24T14:53:23Z","published":"2023-07-24T14:53:23Z","title":"Spatiotemporal Modeling Encounters 3D Medical Image Analysis:\n Slice-Shift UNet with Multi-View Fusion","summary":" As a fundamental part of computational healthcare, Computer Tomography (CT)\nand Magnetic Resonance Imaging (MRI) provide volumetric data, making the\ndevelopment of algorithms for 3D image analysis a necessity. Despite being\ncomputationally cheap, 2D Convolutional Neural Networks can only extract\nspatial information. In contrast, 3D CNNs can extract three-dimensional\nfeatures, but they have higher computational costs and latency, which is a\nlimitation for clinical practice that requires fast and efficient models.\nInspired by the field of video action recognition we propose a new 2D-based\nmodel dubbed Slice SHift UNet (SSH-UNet) which encodes three-dimensional\nfeatures at 2D CNN's complexity. More precisely multi-view features are\ncollaboratively learned by performing 2D convolutions along the three\northogonal planes of a volume and imposing a weights-sharing mechanism. The\nthird dimension, which is neglected by the 2D convolution, is reincorporated by\nshifting a portion of the feature maps along the slices' axis. The\neffectiveness of our approach is validated in Multi-Modality Abdominal\nMulti-Organ Segmentation (AMOS) and Multi-Atlas Labeling Beyond the Cranial\nVault (BTCV) datasets, showing that SSH-UNet is more efficient while on par in\nperformance with state-of-the-art architectures.\n","authors":["C. I. Ugwu","S. Casarin","O. Lanz"],"pdf_url":"https://arxiv.org/pdf/2307.12853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12845v1","updated":"2023-07-24T14:43:07Z","published":"2023-07-24T14:43:07Z","title":"Multi-View Vertebra Localization and Identification from CT Images","summary":" Accurately localizing and identifying vertebrae from CT images is crucial for\nvarious clinical applications. However, most existing efforts are performed on\n3D with cropping patch operation, suffering from the large computation costs\nand limited global information. In this paper, we propose a multi-view vertebra\nlocalization and identification from CT images, converting the 3D problem into\na 2D localization and identification task on different views. Without the\nlimitation of the 3D cropped patch, our method can learn the multi-view global\ninformation naturally. Moreover, to better capture the anatomical structure\ninformation from different view perspectives, a multi-view contrastive learning\nstrategy is developed to pre-train the backbone. Additionally, we further\npropose a Sequence Loss to maintain the sequential structure embedded along the\nvertebrae. Evaluation results demonstrate that, with only two 2D networks, our\nmethod can localize and identify vertebrae in CT images accurately, and\noutperforms the state-of-the-art methods consistently. 
Our code is available at\nhttps://github.com/ShanghaiTech-IMPACT/Multi-View-Vertebra-Localization-and-Identification-from-CT-Images.\n","authors":["Han Wu","Jiadong Zhang","Yu Fang","Zhentao Liu","Nizhuan Wang","Zhiming Cui","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12845v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.15599v2","updated":"2023-07-24T14:41:40Z","published":"2023-06-27T16:37:37Z","title":"Coupling a Recurrent Neural Network to SPAD TCSPC Systems for Real-time\n Fluorescence Lifetime Imaging","summary":" Fluorescence lifetime imaging (FLI) has been receiving increased attention in\nrecent years as a powerful diagnostic technique in biological and medical\nresearch. However, existing FLI systems often suffer from a tradeoff between\nprocessing speed, accuracy, and robustness. In this paper, we propose a robust\napproach that enables fast FLI with no degradation of accuracy. The approach is\nbased on a SPAD TCSPC system coupled to a recurrent neural network (RNN) that\naccurately estimates the fluorescence lifetime directly from raw timestamps\nwithout building histograms, thereby drastically reducing transfer data volumes\nand hardware resource utilization, thus enabling FLI acquisition at video rate.\nWe train two variants of the RNN on a synthetic dataset and compare the results\nto those obtained using center-of-mass method (CMM) and least squares fitting\n(LS fitting). Results demonstrate that two RNN variants, gated recurrent unit\n(GRU) and long short-term memory (LSTM), are comparable to CMM and LS fitting\nin terms of accuracy, while outperforming them in background noise by a large\nmargin. To explore the ultimate limits of the approach, we derived the\nCramer-Rao lower bound of the measurement, showing that RNN yields lifetime\nestimations with near-optimal precision. Moreover, our FLI model, which is\npurely trained on synthetic datasets, works well with never-seen-before,\nreal-world data. To demonstrate real-time operation, we have built a FLI\nmicroscope based on Piccolo, a 32x32 SPAD sensor developed in our lab. Four\nquantized GRU cores, capable of processing up to 4 million photons per second,\nare deployed on a Xilinx Kintex-7 FPGA. Powered by the GRU, the FLI setup can\nretrieve real-time fluorescence lifetime images at up to 10 frames per second.\nThe proposed FLI system is promising and ideally suited for biomedical\napplications.\n","authors":["Yang Lin","Paul Mos","Andrei Ardelean","Claudio Bruschini","Edoardo Charbon"],"pdf_url":"https://arxiv.org/pdf/2306.15599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09696v2","updated":"2023-07-24T14:36:24Z","published":"2023-07-19T00:41:39Z","title":"Towards Saner Deep Image Registration","summary":" With recent advances in computing hardware and surges of deep-learning\narchitectures, learning-based deep image registration methods have surpassed\ntheir traditional counterparts, in terms of metric performance and inference\ntime. However, these methods focus on improving performance measurements such\nas Dice, resulting in less attention given to model behaviors that are equally\ndesirable for registrations, especially for medical imaging. This paper\ninvestigates these behaviors for popular learning-based deep registrations\nunder a sanity-checking microscope. We find that most existing registrations\nsuffer from low inverse consistency and nondiscrimination of identical pairs\ndue to overly optimized image similarities. 
To rectify these behaviors, we\npropose a novel regularization-based sanity-enforcer method that imposes two\nsanity checks on the deep model to reduce its inverse consistency errors and\nincrease its discriminative power simultaneously. Moreover, we derive a set of\ntheoretical guarantees for our sanity-checked image registration method, with\nexperimental results supporting our theoretical findings and their\neffectiveness in increasing the sanity of models without sacrificing any\nperformance. Our code and models are available at\nhttps://github.com/tuffr5/Saner-deep-registration.\n","authors":["Bin Duan","Ming Zhong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2307.09696v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12837v1","updated":"2023-07-24T14:35:46Z","published":"2023-07-24T14:35:46Z","title":"EPIC-KITCHENS-100 Unsupervised Domain Adaptation Challenge: Mixed\n Sequences Prediction","summary":" This report presents the technical details of our approach for the\nEPIC-Kitchens-100 Unsupervised Domain Adaptation (UDA) Challenge in Action\nRecognition. Our approach is based on the idea that the order in which actions\nare performed is similar between the source and target domains. Based on this,\nwe generate a modified sequence by randomly combining actions from the source\nand target domains. As only unlabelled target data are available under the UDA\nsetting, we use a standard pseudo-labeling strategy for extracting action\nlabels for the target. We then ask the network to predict the resulting action\nsequence. This allows to integrate information from both domains during\ntraining and to achieve better transfer results on target. Additionally, to\nbetter incorporate sequence information, we use a language model to filter\nunlikely sequences. Lastly, we employed a co-occurrence matrix to eliminate\nunseen combinations of verbs and nouns. Our submission, labeled as 'sshayan',\ncan be found on the leaderboard, where it currently holds the 2nd position for\n'verb' and the 4th position for both 'noun' and 'action'.\n","authors":["Amirshayan Nasirimajd","Simone Alberto Peirone","Chiara Plizzari","Barbara Caputo"],"pdf_url":"https://arxiv.org/pdf/2307.12837v1.pdf","comment":"2nd place in the 2023 EPIC-KITCHENS-100 Unsupervised Domain\n Adaptation Challenge for Action Recognition"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. 
In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data which often\ncontains slight noise is somewhat robustness enhancing.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12813v1","updated":"2023-07-24T14:06:54Z","published":"2023-07-24T14:06:54Z","title":"Exposing the Troublemakers in Described Object Detection","summary":" Detecting objects based on language descriptions is a popular task that\nincludes Open-Vocabulary object Detection (OVD) and Referring Expression\nComprehension (REC). In this paper, we advance them to a more practical setting\ncalled Described Object Detection (DOD) by expanding category names to flexible\nlanguage expressions for OVD and overcoming the limitation of REC to only\ngrounding the pre-existing object. We establish the research foundation for DOD\ntasks by constructing a Description Detection Dataset ($D^3$), featuring\nflexible language expressions and annotating all described objects without\nomission. By evaluating previous SOTA methods on $D^3$, we find some\ntroublemakers that fail current REC, OVD, and bi-functional methods. REC\nmethods struggle with confidence scores, rejecting negative instances, and\nmulti-target scenarios, while OVD methods face constraints with long and\ncomplex descriptions. Recent bi-functional methods also do not work well on DOD\ndue to their separated training procedures and inference strategies for REC and\nOVD tasks. Building upon the aforementioned findings, we propose a baseline\nthat largely improves REC methods by reconstructing the training data and\nintroducing a binary classification sub-task, outperforming existing methods.\nData and code is available at https://github.com/shikras/d-cube.\n","authors":["Chi Xie","Zhao Zhang","Yixuan Wu","Feng Zhu","Rui Zhao","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2307.12813v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2307.02148v2","updated":"2023-07-24T13:59:50Z","published":"2023-07-05T09:44:02Z","title":"Compound Attention and Neighbor Matching Network for Multi-contrast MRI\n Super-resolution","summary":" Multi-contrast magnetic resonance imaging (MRI) reflects information about\nhuman tissue from different perspectives and has many clinical applications. By\nutilizing the complementary information among different modalities,\nmulti-contrast super-resolution (SR) of MRI can achieve better results than\nsingle-image super-resolution. However, existing methods of multi-contrast MRI\nSR have the following shortcomings that may limit their performance: First,\nexisting methods either simply concatenate the reference and degraded features\nor exploit global feature-matching between them, which are unsuitable for\nmulti-contrast MRI SR. 
Second, although many recent methods employ transformers\nto capture long-range dependencies in the spatial dimension, they neglect that\nself-attention in the channel dimension is also important for low-level vision\ntasks. To address these shortcomings, we proposed a novel network architecture\nwith compound-attention and neighbor matching (CANM-Net) for multi-contrast MRI\nSR: The compound self-attention mechanism effectively captures the dependencies\nin both spatial and channel dimension; the neighborhood-based feature-matching\nmodules are exploited to match degraded features and adjacent reference\nfeatures and then fuse them to obtain the high-quality images. We conduct\nexperiments of SR tasks on the IXI, fastMRI, and real-world scanning datasets.\nThe CANM-Net outperforms state-of-the-art approaches in both retrospective and\nprospective experiments. Moreover, the robustness study in our work shows that\nthe CANM-Net still achieves good performance when the reference and degraded\nimages are imperfectly registered, proving good potential in clinical\napplications.\n","authors":["Wenxuan Chen","Sirui Wu","Shuai Wang","Zhongsen Li","Jia Yang","Huifeng Yao","Xiaomeng Li","Xiaolei Song"],"pdf_url":"https://arxiv.org/pdf/2307.02148v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2211.16761v3","updated":"2023-07-24T13:53:26Z","published":"2022-11-30T05:59:23Z","title":"Improving Cross-Modal Retrieval with Set of Diverse Embeddings","summary":" Cross-modal retrieval across image and text modalities is a challenging task\ndue to its inherent ambiguity: An image often exhibits various situations, and\na caption can be coupled with diverse images. Set-based embedding has been\nstudied as a solution to this problem. It seeks to encode a sample into a set\nof different embedding vectors that capture different semantics of the sample.\nIn this paper, we present a novel set-based embedding method, which is distinct\nfrom previous work in two aspects. First, we present a new similarity function\ncalled smooth-Chamfer similarity, which is designed to alleviate the side\neffects of existing similarity functions for set-based embedding. Second, we\npropose a novel set prediction module to produce a set of embedding vectors\nthat effectively captures diverse semantics of input by the slot attention\nmechanism. Our method is evaluated on the COCO and Flickr30K datasets across\ndifferent visual backbones, where it outperforms existing methods including\nones that demand substantially larger computation at inference.\n","authors":["Dongwon Kim","Namyup Kim","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2211.16761v3.pdf","comment":"Accepted to CVPR 2023 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. 
In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. 
Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. 
We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12774v1","updated":"2023-07-24T13:24:19Z","published":"2023-07-24T13:24:19Z","title":"Fast Full-frame Video Stabilization with Iterative Optimization","summary":" Video stabilization refers to the problem of transforming a shaky video into\na visually pleasing one. The question of how to strike a good trade-off between\nvisual quality and computational speed has remained one of the open challenges\nin video stabilization. Inspired by the analogy between wobbly frames and\njigsaw puzzles, we propose an iterative optimization-based learning approach\nusing synthetic datasets for video stabilization, which consists of two\ninteracting submodules: motion trajectory smoothing and full-frame outpainting.\nFirst, we develop a two-level (coarse-to-fine) stabilizing algorithm based on\nthe probabilistic flow field. The confidence map associated with the estimated\noptical flow is exploited to guide the search for shared regions through\nbackpropagation. Second, we take a divide-and-conquer approach and propose a\nnovel multiframe fusion strategy to render full-frame stabilized views. An\nimportant new insight brought about by our iterative optimization approach is\nthat the target video can be interpreted as the fixed point of nonlinear\nmapping for video stabilization. We formulate video stabilization as a problem\nof minimizing the amount of jerkiness in motion trajectories, which guarantees\nconvergence with the help of fixed-point theory. Extensive experimental results\nare reported to demonstrate the superiority of the proposed approach in terms\nof computational speed and visual quality. The code will be available on\nGitHub.\n","authors":["Weiyue Zhao","Xin Li","Zhan Peng","Xianrui Luo","Xinyi Ye","Hao Lu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12774v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12761v1","updated":"2023-07-24T13:05:36Z","published":"2023-07-24T13:05:36Z","title":"LiDAR Meta Depth Completion","summary":" Depth estimation is one of the essential tasks to be addressed when creating\nmobile autonomous systems. While monocular depth estimation methods have\nimproved in recent times, depth completion provides more accurate and reliable\ndepth maps by additionally using sparse depth information from other sensors\nsuch as LiDAR. However, current methods are specifically trained for a single\nLiDAR sensor. As the scanning pattern differs between sensors, every new sensor\nwould require re-training a specialized depth completion model, which is\ncomputationally inefficient and not flexible. Therefore, we propose to\ndynamically adapt the depth completion model to the used sensor type enabling\nLiDAR adaptive depth completion. Specifically, we propose a meta depth\ncompletion network that uses data patterns derived from the data to learn a\ntask network to alter weights of the main depth completion network to solve a\ngiven depth completion task effectively. The method demonstrates a strong\ncapability to work on multiple LiDAR scanning patterns and can also generalize\nto scanning patterns that are unseen during training. 
While using a single\nmodel, our method yields significantly better results than a non-adaptive\nbaseline trained on different LiDAR patterns. It outperforms LiDAR-specific\nexpert models for very sparse cases. These advantages allow flexible deployment\nof a single depth completion model on different sensors, which could also prove\nvaluable to process the input of nascent LiDAR technology with adaptive instead\nof fixed scanning patterns.\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2307.12761v1.pdf","comment":"Accepted at IROS 2023"},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. 
Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2307.12751v1","updated":"2023-07-24T12:42:45Z","published":"2023-07-24T12:42:45Z","title":"ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised\n Real-world Single Image Super-Resolution","summary":" Single image super-resolution (SISR) is a challenging ill-posed problem that\naims to up-sample a given low-resolution (LR) image to a high-resolution (HR)\ncounterpart. Due to the difficulty in obtaining real LR-HR training pairs,\nrecent approaches are trained on simulated LR images degraded by simplified\ndown-sampling operators, e.g., bicubic. Such an approach can be problematic in\npractice because of the large gap between the synthesized and real-world LR\nimages. To alleviate the issue, we propose a novel Invertible scale-Conditional\nFunction (ICF), which can scale an input image and then restore the original\ninput with different scale conditions. By leveraging the proposed ICF, we\nconstruct a novel self-supervised SISR framework (ICF-SRSR) to handle the\nreal-world SR task without using any paired/unpaired training data.\nFurthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs,\nwhich can make existing supervised SISR networks more robust. Extensive\nexperiments demonstrate the effectiveness of the proposed method in handling\nSISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior\nperformance compared to the existing methods trained on synthetic paired images\nin real-world scenarios and exhibits comparable performance compared to\nstate-of-the-art supervised/unsupervised methods on public benchmark datasets.\n","authors":["Reyhaneh Neshatavar","Mohsen Yavartanoo","Sanghyun Son","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. 
This paper proposes BiofilmScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile invariant moments simultaneously measure the geometric characteristics of\nthe segmented cells with low errors. The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2307.12732v1","updated":"2023-07-24T12:24:07Z","published":"2023-07-24T12:24:07Z","title":"CLIP-KD: An Empirical Study of Distilling CLIP Models","summary":" CLIP has become a promising language-supervised visual pre-training framework\nand achieves excellent performance over a wide range of tasks. This paper aims\nto distill small CLIP models supervised by a large teacher CLIP model. We\npropose several distillation strategies, including relation, feature, gradient\nand contrastive paradigm, to examine the impact on CLIP distillation. We show\nthat the simplest feature mimicry with MSE loss performs best. Moreover,\ninteractive contrastive learning and relation-based distillation are also\ncritical in performance improvement. We apply the unified method to distill\nseveral student networks trained on 15 million (image, text) pairs.\nDistillation improves the student CLIP models consistently over zero-shot\nImageNet classification and cross-modal retrieval benchmarks. We hope our\nempirical study will become an important baseline for future CLIP distillation\nresearch. The code is available at \url{https://github.com/winycg/CLIP-KD}.\n","authors":["Chuanguang Yang","Zhulin An","Libo Huang","Junyu Bi","Xinqiang Yu","Han Yang","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12730v1","updated":"2023-07-24T12:22:19Z","published":"2023-07-24T12:22:19Z","title":"COCO-O: A Benchmark for Object Detectors under Natural Distribution\n Shifts","summary":" Practical object detection applications can lose their effectiveness on image\ninputs with natural distribution shifts. This problem leads the research\ncommunity to pay more attention to the robustness of detectors under\nOut-Of-Distribution (OOD) inputs. Existing works construct datasets to\nbenchmark the detector's OOD robustness for a specific application scenario,\ne.g., Autonomous Driving. However, these datasets lack universality and are\nhard to use for benchmarking general detectors built on common tasks such as COCO. To give\na more comprehensive robustness assessment, we introduce\nCOCO-O(ut-of-distribution), a test dataset based on COCO with 6 types of\nnatural distribution shifts. COCO-O has a large distribution gap with training\ndata and results in a significant 55.7% relative performance drop on a Faster\nR-CNN detector. We leverage COCO-O to conduct experiments on more than 100\nmodern object detectors to investigate if their improvements are credible or\njust over-fitting to the COCO test set. 
Unfortunately, most classic detectors\nin early years do not exhibit strong OOD generalization. We further study the\nrobustness effect on recent breakthroughs of detector's architecture design,\naugmentation and pre-training techniques. Some empirical findings are revealed:\n1) Compared with detection head or neck, backbone is the most important part\nfor robustness; 2) An end-to-end detection transformer design brings no\nenhancement, and may even reduce robustness; 3) Large-scale foundation models\nhave made a great leap on robust object detection. We hope our COCO-O could\nprovide a rich testbed for robustness study of object detection. The dataset\nwill be available at\n\\url{https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o}.\n","authors":["Xiaofeng Mao","Yuefeng Chen","Yao Zhu","Da Chen","Hang Su","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12730v1.pdf","comment":"To appear in ICCV2023,\n https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o"},{"id":"http://arxiv.org/abs/2307.12729v1","updated":"2023-07-24T12:21:33Z","published":"2023-07-24T12:21:33Z","title":"Persistent-Transient Duality: A Multi-mechanism Approach for Modeling\n Human-Object Interaction","summary":" Humans are highly adaptable, swiftly switching between different modes to\nprogressively handle different tasks, situations and contexts. In Human-object\ninteraction (HOI) activities, these modes can be attributed to two mechanisms:\n(1) the large-scale consistent plan for the whole activity and (2) the\nsmall-scale children interactive actions that start and end along the timeline.\nWhile neuroscience and cognitive science have confirmed this multi-mechanism\nnature of human behavior, machine modeling approaches for human motion are\ntrailing behind. While attempted to use gradually morphing structures (e.g.,\ngraph attention networks) to model the dynamic HOI patterns, they miss the\nexpeditious and discrete mode-switching nature of the human motion. To bridge\nthat gap, this work proposes to model two concurrent mechanisms that jointly\ncontrol human motion: the Persistent process that runs continually on the\nglobal scale, and the Transient sub-processes that operate intermittently on\nthe local context of the human while interacting with objects. These two\nmechanisms form an interactive Persistent-Transient Duality that\nsynergistically governs the activity sequences. We model this conceptual\nduality by a parent-child neural network of Persistent and Transient channels\nwith a dedicated neural module for dynamic mechanism switching. The framework\nis trialed on HOI motion forecasting. On two rich datasets and a wide variety\nof settings, the model consistently delivers superior performances, proving its\nsuitability for the challenge.\n","authors":["Hung Tran","Vuong Le","Svetha Venkatesh","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2307.12729v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. 
NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12721v1","updated":"2023-07-24T12:03:50Z","published":"2023-07-24T12:03:50Z","title":"AMAE: Adaptation of Pre-Trained Masked Autoencoder for Dual-Distribution\n Anomaly Detection in Chest X-Rays","summary":" Unsupervised anomaly detection in medical images such as chest radiographs is\nstepping into the spotlight as it mitigates the scarcity of the labor-intensive\nand costly expert annotation of anomaly data. However, nearly all existing\nmethods are formulated as a one-class classification trained only on\nrepresentations from the normal class and discard a potentially significant\nportion of the unlabeled data. This paper focuses on a more practical setting,\ndual distribution anomaly detection for chest X-rays, using the entire training\ndata, including both normal and unlabeled images. Inspired by a modern\nself-supervised vision transformer model trained using partial image inputs to\nreconstruct missing image regions -- we propose AMAE, a two-stage algorithm for\nadaptation of the pre-trained masked autoencoder (MAE). Starting from MAE\ninitialization, AMAE first creates synthetic anomalies from only normal\ntraining images and trains a lightweight classifier on frozen transformer\nfeatures. Subsequently, we propose an adaptation strategy to leverage unlabeled\nimages containing anomalies. The adaptation scheme is accomplished by assigning\npseudo-labels to unlabeled images and using two separate MAE based modules to\nmodel the normative and anomalous distributions of pseudo-labeled images. The\neffectiveness of the proposed adaptation strategy is evaluated with different\nanomaly ratios in an unlabeled training set. 
AMAE leads to consistent\nperformance gains over competing self-supervised and dual distribution anomaly\ndetection methods, setting the new state-of-the-art on three public chest X-ray\nbenchmarks: RSNA, NIH-CXR, and VinDr-CXR.\n","authors":["Behzad Bozorgtabar","Dwarikanath Mahapatra","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2307.12721v1.pdf","comment":"To be presented at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.12718v1","updated":"2023-07-24T11:59:07Z","published":"2023-07-24T11:59:07Z","title":"CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle\n Components","summary":" Neural Radiance Fields (NeRFs) have gained widespread recognition as a highly\neffective technique for representing 3D reconstructions of objects and scenes\nderived from sets of images. Despite their efficiency, NeRF models can pose\nchallenges in certain scenarios such as vehicle inspection, where the lack of\nsufficient data or the presence of challenging elements (e.g. reflections)\nstrongly impact the accuracy of the reconstruction. To this aim, we introduce\nCarPatch, a novel synthetic benchmark of vehicles. In addition to a set of\nimages annotated with their intrinsic and extrinsic camera parameters, the\ncorresponding depth maps and semantic segmentation masks have been generated\nfor each view. Global and part-based metrics have been defined and used to\nevaluate, compare, and better characterize some state-of-the-art techniques.\nThe dataset is publicly released at\nhttps://aimagelab.ing.unimore.it/go/carpatch and can be used as an evaluation\nguide and as a baseline for future work on this challenging topic.\n","authors":["Davide Di Nucci","Alessandro Simoni","Matteo Tomei","Luca Ciuffreda","Roberto Vezzani","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2307.12718v1.pdf","comment":"Accepted at ICIAP2023"},{"id":"http://arxiv.org/abs/2307.12717v1","updated":"2023-07-24T11:58:58Z","published":"2023-07-24T11:58:58Z","title":"Dense Transformer based Enhanced Coding Network for Unsupervised Metal\n Artifact Reduction","summary":" CT images corrupted by metal artifacts have serious negative effects on\nclinical diagnosis. Considering the difficulty of collecting paired data with\nground truth in clinical settings, unsupervised methods for metal artifact\nreduction are of high interest. However, it is difficult for previous\nunsupervised methods to retain structural information from CT images while\nhandling the non-local characteristics of metal artifacts. To address these\nchallenges, we proposed a novel Dense Transformer based Enhanced Coding Network\n(DTEC-Net) for unsupervised metal artifact reduction. Specifically, we\nintroduce a Hierarchical Disentangling Encoder, supported by the high-order\ndense process, and transformer to obtain densely encoded sequences with\nlong-range correspondence. Then, we present a second-order disentanglement\nmethod to improve the dense sequence's decoding process. Extensive experiments\nand model discussions illustrate DTEC-Net's effectiveness, which outperforms\nthe previous state-of-the-art methods on a benchmark dataset, and greatly\nreduces metal artifacts while restoring richer texture details.\n","authors":["Wangduo Xie","Matthew B. 
Blaschko"],"pdf_url":"https://arxiv.org/pdf/2307.12717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2011.09094v3","updated":"2023-07-24T11:28:46Z","published":"2020-11-18T05:16:11Z","title":"UP-DETR: Unsupervised Pre-training for Object Detection with\n Transformers","summary":" DEtection TRansformer (DETR) for object detection reaches competitive\nperformance compared with Faster R-CNN via a transformer encoder-decoder\narchitecture. However, trained with scratch transformers, DETR needs\nlarge-scale training data and an extreme long training schedule even on COCO\ndataset. Inspired by the great success of pre-training transformers in natural\nlanguage processing, we propose a novel pretext task named random query patch\ndetection in Unsupervised Pre-training DETR (UP-DETR). Specifically, we\nrandomly crop patches from the given image and then feed them as queries to the\ndecoder. The model is pre-trained to detect these query patches from the input\nimage. During the pre-training, we address two critical issues: multi-task\nlearning and multi-query localization. 
(1) To trade off classification and\nlocalization preferences in the pretext task, we find that freezing the CNN\nbackbone is the prerequisite for the success of pre-training transformers. (2)\nTo perform multi-query localization, we develop UP-DETR with multi-query patch\ndetection with attention mask. Besides, UP-DETR also provides a unified\nperspective for fine-tuning object detection and one-shot detection tasks. In\nour experiments, UP-DETR significantly boosts the performance of DETR with\nfaster convergence and higher average precision on object detection, one-shot\ndetection and panoptic segmentation. Code and pre-training models:\nhttps://github.com/dddzg/up-detr.\n","authors":["Zhigang Dai","Bolun Cai","Yugeng Lin","Junying Chen"],"pdf_url":"https://arxiv.org/pdf/2011.09094v3.pdf","comment":"Accepted by TPAMI 2022 and CVPR 2021"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. 
Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2304.02941v2","updated":"2023-07-24T10:57:15Z","published":"2023-04-06T08:56:18Z","title":"Dr. KID: Direct Remeshing and K-set Isometric Decomposition for Scalable\n Physicalization of Organic Shapes","summary":" Dr. KID is an algorithm that uses isometric decomposition for the\nphysicalization of potato-shaped organic models in a puzzle fashion. The\nalgorithm begins with creating a simple, regular triangular surface mesh of\norganic shapes, followed by iterative k-means clustering and remeshing. For\nclustering, we need similarity between triangles (segments), which is defined as\na distance function. The distance function maps each triangle's shape to a\nsingle point in the virtual 3D space. Thus, the distance between the triangles\nindicates their degree of dissimilarity. K-means clustering uses this distance\nand sorts the segments into k classes. After this, remeshing is applied to\nminimize the distance between triangles within the same cluster by making their\nshapes identical. Clustering and remeshing are repeated until the distance\nbetween triangles in the same cluster reaches an acceptable threshold. We adopt\na curvature-aware strategy to determine the surface thickness and finalize\npuzzle pieces for 3D printing. Identical hinges and holes are created for\nassembling the puzzle components. For smoother outcomes, we use triangle\nsubdivision along with curvature-aware clustering, generating curved triangular\npatches for 3D printing. Our algorithm was evaluated using various models, and\nthe 3D-printed results were analyzed. Findings indicate that our algorithm\nperforms reliably on target organic shapes with minimal loss of input geometry.\n","authors":["Dawar Khan","Ciril Bohak","Ivan Viola"],"pdf_url":"https://arxiv.org/pdf/2304.02941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v1","updated":"2023-07-24T10:30:54Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In the past decade, balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognosis attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. In civil infrastructure and living\nenvironments, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and the high-quality status achieved by improved operations. For\nvisual inspection, the deteriorated classes acquired from the surfaces of concrete and\nsteel components are occasionally imbalanced. From numerous related surveys, we\nsummarize that imbalanced data problems can be categorized into four types: 1)\nmissing range of target and label variables, 2) majority-minority class\nimbalance, 3) foreground-background spatial imbalance, 4) long-tailed pixel-wise class\nimbalance. Since 2015, there have been many studies on imbalanced data\nusing deep learning approaches that include regression, image classification,\nobject detection, and semantic segmentation. However, anomaly detection for\nimbalanced data is not yet well known. 
In this study, we highlight the one-class\nanomaly detection application, deciding whether a class is anomalous or not, and demonstrate\nclear examples on imbalanced vision datasets: wooden, concrete deterioration,\nand disaster damage. We provide key results on the advantage of damage vision mining,\nhypothesizing that the more effective the range of the positive ratio, the higher\nthe accuracy gain of the anomaly detection application. Finally, the applicability of\nthe damage learning methods, their limitations, and future work are discussed.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v1.pdf","comment":"12 pages, 14 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.12674v1","updated":"2023-07-24T10:24:13Z","published":"2023-07-24T10:24:13Z","title":"Industrial Segment Anything -- a Case Study in Aircraft Manufacturing,\n Intralogistics, Maintenance, Repair, and Overhaul","summary":" Deploying deep learning-based applications in specialized domains like the\naircraft production industry typically suffers from the training data\navailability problem. Only a few datasets represent non-everyday objects,\nsituations, and tasks. Recent advances in research around Vision Foundation\nModels (VFM) have opened a new area of tasks and models with high generalization\ncapabilities in non-semantic and semantic predictions. As recently demonstrated\nby the Segment Anything Project, exploiting VFM's zero-shot capabilities is a\npromising direction in tackling the boundaries spanned by data, context, and\nsensor variety. However, investigating its application within specific domains\nis subject to ongoing research. This paper contributes here by surveying\napplications of the SAM in aircraft production-specific use cases. We include\nmanufacturing, intralogistics, as well as maintenance, repair, and overhaul\nprocesses, also representing a variety of other neighboring industrial domains.\nBesides presenting the various use cases, we further discuss the injection of\ndomain knowledge.\n","authors":["Keno Moenck","Arne Wendt","Philipp Prünte","Julian Koch","Arne Sahrhage","Johann Gierecker","Ole Schmedemann","Falko Kähler","Dirk Holst","Martin Gomse","Thorsten Schüppstuhl","Daniel Schoepflin"],"pdf_url":"https://arxiv.org/pdf/2307.12674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses them to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the learning of high-frequency components. 
We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07250v2","updated":"2023-07-24T10:10:25Z","published":"2023-04-14T16:58:23Z","title":"Fusing Structure from Motion and Simulation-Augmented Pose Regression\n from Optical Flow for Challenging Indoor Environments","summary":" The localization of objects is a crucial task in various applications such as\nrobotics, virtual and augmented reality, and the transportation of goods in\nwarehouses. Recent advances in deep learning have enabled the localization\nusing monocular visual cameras. While structure from motion (SfM) predicts the\nabsolute pose from a point cloud, absolute pose regression (APR) methods learn\na semantic understanding of the environment through neural networks. However,\nboth fields face challenges caused by the environment such as motion blur,\nlighting changes, repetitive patterns, and feature-less structures. This study\naims to address these challenges by incorporating additional information and\nregularizing the absolute pose using relative pose regression (RPR) methods.\nRPR methods suffer under different challenges, i.e., motion blur. The optical\nflow between consecutive images is computed using the Lucas-Kanade algorithm,\nand the relative pose is predicted using an auxiliary small recurrent\nconvolutional network. The fusion of absolute and relative poses is a complex\ntask due to the mismatch between the global and local coordinate systems.\nState-of-the-art methods fusing absolute and relative poses use pose graph\noptimization (PGO) to regularize the absolute pose predictions using relative\nposes. In this work, we propose recurrent fusion networks to optimally align\nabsolute and relative pose predictions to improve the absolute pose prediction.\nWe evaluate eight different recurrent units and construct a simulation\nenvironment to pre-train the APR and RPR networks for better generalized\ntraining. Additionally, we record a large database of different scenarios in a\nchallenging large-scale indoor environment that mimics a warehouse with\ntransportation robots. 
We conduct hyperparameter searches and experiments to\nshow the effectiveness of our recurrent fusion method compared to PGO.\n","authors":["Felix Ott","Lucas Heublein","David Rügamer","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2304.07250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12656v1","updated":"2023-07-24T09:54:49Z","published":"2023-07-24T09:54:49Z","title":"A Theoretically Guaranteed Quaternion Weighted Schatten p-norm\n Minimization Method for Color Image Restoration","summary":" Inspired by the fact that the matrix formulated by nonlocal similar patches\nin a natural image is of low rank, the rank approximation issue have been\nextensively investigated over the past decades, among which weighted nuclear\nnorm minimization (WNNM) and weighted Schatten $p$-norm minimization (WSNM) are\ntwo prevailing methods have shown great superiority in various image\nrestoration (IR) problems. Due to the physical characteristic of color images,\ncolor image restoration (CIR) is often a much more difficult task than its\ngrayscale image counterpart. However, when applied to CIR, the traditional\nWNNM/WSNM method only processes three color channels individually and fails to\nconsider their cross-channel correlations. Very recently, a quaternion-based\nWNNM approach (QWNNM) has been developed to mitigate this issue, which is\ncapable of representing the color image as a whole in the quaternion domain and\npreserving the inherent correlation among the three color channels. Despite its\nempirical success, unfortunately, the convergence behavior of QWNNM has not\nbeen strictly studied yet. In this paper, on the one side, we extend the WSNM\ninto quaternion domain and correspondingly propose a novel quaternion-based\nWSNM model (QWSNM) for tackling the CIR problems. Extensive experiments on two\nrepresentative CIR tasks, including color image denoising and deblurring,\ndemonstrate that the proposed QWSNM method performs favorably against many\nstate-of-the-art alternatives, in both quantitative and qualitative\nevaluations. On the other side, more importantly, we preliminarily provide a\ntheoretical convergence analysis, that is, by modifying the quaternion\nalternating direction method of multipliers (QADMM) through a simple\ncontinuation strategy, we theoretically prove that both the solution sequences\ngenerated by the QWNNM and QWSNM have fixed-point convergence guarantees.\n","authors":["Qing-Hua Zhang","Liang-Tian He","Yi-Lun Wang","Liang-Jian Deng","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12656v1.pdf","comment":"46 pages, 10 figures; references added"},{"id":"http://arxiv.org/abs/2302.01162v5","updated":"2023-07-24T09:41:07Z","published":"2023-02-02T15:37:46Z","title":"Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model using\n Pixel-aligned Reconstruction Priors","summary":" Fast generation of high-quality 3D digital humans is important to a vast\nnumber of applications ranging from entertainment to professional concerns.\nRecent advances in differentiable rendering have enabled the training of 3D\ngenerative models without requiring 3D ground truths. However, the quality of\nthe generated 3D humans still has much room to improve in terms of both\nfidelity and diversity. In this paper, we present Get3DHuman, a novel 3D human\nframework that can significantly boost the realism and diversity of the\ngenerated outcomes by only using a limited budget of 3D ground-truth data. 
Our\nkey observation is that the 3D generator can profit from human-related priors\nlearned through 2D human generators and 3D reconstructors. Specifically, we\nbridge the latent space of Get3DHuman with that of StyleGAN-Human via a\nspecially-designed prior network, where the input latent code is mapped to the\nshape and texture feature volumes spanned by the pixel-aligned 3D\nreconstructor. The outcomes of the prior network are then leveraged as the\nsupervisory signals for the main generator network. To ensure effective\ntraining, we further propose three tailored losses applied to the generated\nfeature volumes and the intermediate feature maps. Extensive experiments\ndemonstrate that Get3DHuman greatly outperforms the other state-of-the-art\napproaches and can support a wide range of applications including shape\ninterpolation, shape re-texturing, and single-view reconstruction through\nlatent inversion.\n","authors":["Zhangyang Xiong","Di Kang","Derong Jin","Weikai Chen","Linchao Bao","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2302.01162v5.pdf","comment":"ICCV 2023, project page:\n https://x-zhangyang.github.io/2023_Get3DHuman/"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. 
GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12637v1","updated":"2023-07-24T09:22:09Z","published":"2023-07-24T09:22:09Z","title":"PG-RCNN: Semantic Surface Point Generation for 3D Object Detection","summary":" One of the main challenges in LiDAR-based 3D object detection is that the\nsensors often fail to capture the complete spatial information about the\nobjects due to long distance and occlusion. Two-stage detectors with point\ncloud completion approaches tackle this problem by adding more points to the\nregions of interest (RoIs) with a pre-trained network. However, these methods\ngenerate dense point clouds of objects for all region proposals, assuming that\nobjects always exist in the RoIs. This leads to the indiscriminate point\ngeneration for incorrect proposals as well. Motivated by this, we propose Point\nGeneration R-CNN (PG-RCNN), a novel end-to-end detector that generates semantic\nsurface points of foreground objects for accurate detection. Our method uses a\njointly trained RoI point generation module to process the contextual\ninformation of RoIs and estimate the complete shape and displacement of\nforeground objects. For every generated point, PG-RCNN assigns a semantic\nfeature that indicates the estimated foreground probability. Extensive\nexperiments show that the point clouds generated by our method provide\ngeometrically and semantically rich information for refining false positive and\nmisaligned proposals. 
PG-RCNN achieves competitive performance on the KITTI\nbenchmark, with significantly fewer parameters than state-of-the-art models.\nThe code is available at https://github.com/quotation2520/PG-RCNN.\n","authors":["Inyong Koo","Inyoung Lee","Se-Ho Kim","Hee-Seon Kim","Woo-jin Jeon","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12637v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11643v2","updated":"2023-07-24T09:18:52Z","published":"2023-07-21T15:22:32Z","title":"Morphological Image Analysis and Feature Extraction for Reasoning with\n AI-based Defect Detection and Classification Models","summary":" As the use of artificial intelligent (AI) models becomes more prevalent in\nindustries such as engineering and manufacturing, it is essential that these\nmodels provide transparent reasoning behind their predictions. This paper\nproposes the AI-Reasoner, which extracts the morphological characteristics of\ndefects (DefChars) from images and utilises decision trees to reason with the\nDefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.\ncharts) and textual explanations to provide insights into outputs made by\nmasked-based defect detection and classification models. It also provides\neffective mitigation strategies to enhance data pre-processing and overall\nmodel performance. The AI-Reasoner was tested on explaining the outputs of an\nIE Mask R-CNN model using a set of 366 images containing defects. The results\ndemonstrated its effectiveness in explaining the IE Mask R-CNN model's\npredictions. Overall, the proposed AI-Reasoner provides a solution for\nimproving the performance of AI models in industrial applications that require\ndefect analysis.\n","authors":["Jiajun Zhang","Georgina Cosma","Sarah Bugby","Axel Finke","Jason Watkins"],"pdf_url":"https://arxiv.org/pdf/2307.11643v2.pdf","comment":"8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series\n on computational intelligence (SSCI)"},{"id":"http://arxiv.org/abs/2307.12634v1","updated":"2023-07-24T09:16:05Z","published":"2023-07-24T09:16:05Z","title":"Automatic lobe segmentation using attentive cross entropy and end-to-end\n fissure generation","summary":" The automatic lung lobe segmentation algorithm is of great significance for\nthe diagnosis and treatment of lung diseases, however, which has great\nchallenges due to the incompleteness of pulmonary fissures in lung CT images\nand the large variability of pathological features. Therefore, we propose a new\nautomatic lung lobe segmentation framework, in which we urge the model to pay\nattention to the area around the pulmonary fissure during the training process,\nwhich is realized by a task-specific loss function. In addition, we introduce\nan end-to-end pulmonary fissure generation method in the auxiliary pulmonary\nfissure segmentation task, without any additional network branch. Finally, we\npropose a registration-based loss function to alleviate the convergence\ndifficulty of the Dice loss supervised pulmonary fissure segmentation task. 
We\nachieve 97.83% and 94.75% dice scores on our private dataset STLB and public\nLUNA16 dataset respectively.\n","authors":["Qi Su","Na Wang","Jiawen Xie","Yinan Chen","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12634v1.pdf","comment":"5 pages, 3 figures, published to 'IEEE International Symposium on\n Biomedical Imaging (ISBI) 2023'"},{"id":"http://arxiv.org/abs/2307.12630v1","updated":"2023-07-24T09:08:30Z","published":"2023-07-24T09:08:30Z","title":"Semi-Supervised Medical Image Segmentation with Co-Distribution\n Alignment","summary":" Medical image segmentation has made significant progress when a large amount\nof labeled data are available. However, annotating medical image segmentation\ndatasets is expensive due to the requirement of professional skills.\nAdditionally, classes are often unevenly distributed in medical images, which\nseverely affects the classification performance on minority classes. To address\nthese problems, this paper proposes Co-Distribution Alignment (Co-DA) for\nsemi-supervised medical image segmentation. Specifically, Co-DA aligns marginal\npredictions on unlabeled data to marginal predictions on labeled data in a\nclass-wise manner with two differently initialized models before using the\npseudo-labels generated by one model to supervise the other. Besides, we design\nan over-expectation cross-entropy loss for filtering the unlabeled pixels to\nreduce noise in their pseudo-labels. Quantitative and qualitative experiments\non three public datasets demonstrate that the proposed approach outperforms\nexisting state-of-the-art semi-supervised medical image segmentation methods on\nboth the 2D CaDIS dataset and the 3D LGE-MRI and ACDC datasets, achieving an\nmIoU of 0.8515 with only 24% labeled data on CaDIS, and a Dice score of 0.8824\nand 0.8773 with only 20% data on LGE-MRI and ACDC, respectively.\n","authors":["Tao Wang","Zhongzheng Huang","Jiawei Wu","Yuanzheng Cai","Zuoyong Li"],"pdf_url":"https://arxiv.org/pdf/2307.12630v1.pdf","comment":"Paper appears in Bioengineering 2023, 10(7), 869"},{"id":"http://arxiv.org/abs/2307.12622v1","updated":"2023-07-24T08:51:49Z","published":"2023-07-24T08:51:49Z","title":"Phase Match for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous research and\nempirical studies have indicated that the amplitude spectrum plays a decisive\nrole in CNN recognition, but it is susceptible to disturbance caused by\ndistribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for visual\nrepresentation learning. In this paper, we aim to clarify the relationships\nbetween Domain Generalization (DG) and the frequency components by introducing\na Fourier-based structural causal model. Specifically, we interpret the phase\nspectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Building upon these observations, we propose Phase Match (PhaMa) to\naddress DG problems. 
Our method introduces perturbations on the amplitude\nspectrum and establishes spatial relationships to match the phase components.\nThrough experiments on multiple benchmarks, we demonstrate that our proposed\nmethod achieves state-of-the-art performance in domain generalization and\nout-of-distribution robustness tasks.\n","authors":["Chengming Hu","Rui Wang","Hao Chen","Zhouwang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12619v1","updated":"2023-07-24T08:49:20Z","published":"2023-07-24T08:49:20Z","title":"Sparse annotation strategies for segmentation of short axis cardiac MRI","summary":" Short axis cardiac MRI segmentation is a well-researched topic, with\nexcellent results achieved by state-of-the-art models in a supervised setting.\nHowever, annotating MRI volumes is time-consuming and expensive. Many different\napproaches (e.g. transfer learning, data augmentation, few-shot learning, etc.)\nhave emerged in an effort to use fewer annotated data and still achieve similar\nperformance as a fully supervised model. Nevertheless, to the best of our\nknowledge, none of these works focus on which slices of MRI volumes are most\nimportant to annotate for yielding the best segmentation results. In this\npaper, we investigate the effects of training with sparse volumes, i.e.\nreducing the number of cases annotated, and sparse annotations, i.e. reducing\nthe number of slices annotated per case. We evaluate the segmentation\nperformance using the state-of-the-art nnU-Net model on two public datasets to\nidentify which slices are the most important to annotate. We have shown that\ntraining on a significantly reduced dataset (48 annotated volumes) can give a\nDice score greater than 0.85 and results comparable to using the full dataset\n(160 and 240 volumes for each dataset respectively). In general, training on\nmore slice annotations provides more valuable information compared to training\non more volumes. Further, annotating slices from the middle of volumes yields\nthe most beneficial results in terms of segmentation performance, and the\napical region the worst. When evaluating the trade-off between annotating\nvolumes against slices, annotating as many slices as possible instead of\nannotating more volumes is a better strategy.\n","authors":["Josh Stein","Maxime Di Folco","Julia Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12618v1","updated":"2023-07-24T08:47:45Z","published":"2023-07-24T08:47:45Z","title":"Attribute Regularized Soft Introspective VAE: Towards Cardiac Attribute\n Regularization Through MRI Domains","summary":" Deep generative models have emerged as influential instruments for data\ngeneration and manipulation. Enhancing the controllability of these models by\nselectively modifying data attributes has been a recent focus. Variational\nAutoencoders (VAEs) have shown promise in capturing hidden attributes but often\nproduce blurry reconstructions. Controlling these attributes through different\nimaging domains is difficult in medical imaging. Recently, Soft Introspective\nVAE leverage the benefits of both VAEs and Generative Adversarial Networks\n(GANs), which have demonstrated impressive image synthesis capabilities, by\nincorporating an adversarial loss into VAE training. In this work, we propose\nthe Attributed Soft Introspective VAE (Attri-SIVAE) by incorporating an\nattribute regularized loss, into the Soft-Intro VAE framework. 
We evaluate\nexperimentally the proposed method on cardiac MRI data from different domains,\nsuch as various scanner vendors and acquisition centers. The proposed method\nachieves similar performance in terms of reconstruction and regularization\ncompared to the state-of-the-art Attributed regularized VAE but additionally\nalso succeeds in keeping the same regularization level when tested on a\ndifferent dataset, unlike the compared method.\n","authors":["Maxime Di Folco","Cosmin Bercea","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12616v1","updated":"2023-07-24T08:44:25Z","published":"2023-07-24T08:44:25Z","title":"CTVIS: Consistent Training for Online Video Instance Segmentation","summary":" The discrimination of instance embeddings plays a vital role in associating\ninstances across time for online video instance segmentation (VIS). Instance\nembedding learning is directly supervised by the contrastive loss computed upon\nthe contrastive items (CIs), which are sets of anchor/positive/negative\nembeddings. Recent online VIS methods leverage CIs sourced from one reference\nframe only, which we argue is insufficient for learning highly discriminative\nembeddings. Intuitively, a possible strategy to enhance CIs is replicating the\ninference phase during training. To this end, we propose a simple yet effective\ntraining strategy, called Consistent Training for Online VIS (CTVIS), which\ndevotes to aligning the training and inference pipelines in terms of building\nCIs. Specifically, CTVIS constructs CIs by referring inference the\nmomentum-averaged embedding and the memory bank storage mechanisms, and adding\nnoise to the relevant embeddings. Such an extension allows a reliable\ncomparison between embeddings of current instances and the stable\nrepresentations of historical instances, thereby conferring an advantage in\nmodeling VIS challenges such as occlusion, re-identification, and deformation.\nEmpirically, CTVIS outstrips the SOTA VIS models by up to +5.0 points on three\nVIS benchmarks, including YTVIS19 (55.1% AP), YTVIS21 (50.1% AP) and OVIS\n(35.5% AP). Furthermore, we find that pseudo-videos transformed from images can\ntrain robust models surpassing fully-supervised ones.\n","authors":["Kaining Ying","Qing Zhong","Weian Mao","Zhenhua Wang","Hao Chen","Lin Yuanbo Wu","Yifan Liu","Chengxiang Fan","Yunzhi Zhuge","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12616v1.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/KainingYing/CTVIS"},{"id":"http://arxiv.org/abs/2307.12612v1","updated":"2023-07-24T08:39:11Z","published":"2023-07-24T08:39:11Z","title":"Less is More: Focus Attention for Efficient DETR","summary":" DETR-like models have significantly boosted the performance of detectors and\neven outperformed classical convolutional models. However, all tokens are\ntreated equally without discrimination brings a redundant computational burden\nin the traditional encoder structure. The recent sparsification strategies\nexploit a subset of informative tokens to reduce attention complexity\nmaintaining performance through the sparse encoder. But these methods tend to\nrely on unreliable model statistics. Moreover, simply reducing the token\npopulation hinders the detection performance to a large extent, limiting the\napplication of these sparse models. 
We propose Focus-DETR, which focuses\nattention on more informative tokens for a better trade-off between computation\nefficiency and model accuracy. Specifically, we reconstruct the encoder with\ndual attention, which includes a token scoring mechanism that considers both\nlocalization and category semantic information of the objects from multi-scale\nfeature maps. We efficiently abandon the background queries and enhance the\nsemantic interaction of the fine-grained object queries based on the scores.\nCompared with the state-of-the-art sparse DETR-like detectors under the same\nsetting, our Focus-DETR gets comparable complexity while achieving 50.4AP\n(+2.2) on COCO. The code is available at\nhttps://github.com/huawei-noah/noah-research/tree/master/Focus-DETR and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR.\n","authors":["Dehua Zheng","Wenhui Dong","Hailin Hu","Xinghao Chen","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12612v1.pdf","comment":"8 pages, 6 figures, accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12591v1","updated":"2023-07-24T08:06:46Z","published":"2023-07-24T08:06:46Z","title":"SwinMM: Masked Multi-view with Swin Transformers for 3D Medical Image\n Segmentation","summary":" Recent advancements in large-scale Vision Transformers have made significant\nstrides in improving pre-trained models for medical image segmentation.\nHowever, these methods face a notable challenge in acquiring a substantial\namount of pre-training data, particularly within the medical field. To address\nthis limitation, we present Masked Multi-view with Swin Transformers (SwinMM),\na novel multi-view pipeline for enabling accurate and data-efficient\nself-supervised medical image analysis. Our strategy harnesses the potential of\nmulti-view information by incorporating two principal components. In the\npre-training phase, we deploy a masked multi-view encoder devised to\nconcurrently train masked multi-view observations through a range of diverse\nproxy tasks. These tasks span image reconstruction, rotation, contrastive\nlearning, and a novel task that employs a mutual learning paradigm. This new\ntask capitalizes on the consistency between predictions from various\nperspectives, enabling the extraction of hidden multi-view information from 3D\nmedical data. In the fine-tuning stage, a cross-view decoder is developed to\naggregate the multi-view information through a cross-attention block. Compared\nwith the previous state-of-the-art self-supervised learning method Swin UNETR,\nSwinMM demonstrates a notable advantage on several medical image segmentation\ntasks. It allows for a smooth integration of multi-view information,\nsignificantly boosting both the accuracy and data-efficiency of the model. Code\nand models are available at https://github.com/UCSC-VLAA/SwinMM/.\n","authors":["Yiqing Wang","Zihan Li","Jieru Mei","Zihao Wei","Li Liu","Chen Wang","Shengtian Sang","Alan Yuille","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12591v1.pdf","comment":"MICCAI 2023; project page: https://github.com/UCSC-VLAA/SwinMM/"},{"id":"http://arxiv.org/abs/2307.12580v1","updated":"2023-07-24T07:51:40Z","published":"2023-07-24T07:51:40Z","title":"SL: Stable Learning in Source-Free Domain Adaption for Medical Image\n Segmentation","summary":" Deep learning techniques for medical image analysis usually suffer from the\ndomain shift between source and target data. Most existing works focus on\nunsupervised domain adaptation (UDA). However, in practical applications,\nprivacy issues are much more severe. 
For example, the data of different\nhospitals have domain shifts due to equipment problems, and data from the two\ndomains cannot be made available simultaneously because of privacy. In this\nchallenging setting, defined as Source-Free UDA, previous medical UDA methods are\nlimited. Although a variety of medical source-free unsupervised domain adaptation\n(MSFUDA) methods have been proposed, we found they fall into an over-fitting\ndilemma called \"longer training, worse performance.\" Therefore, we propose the\nStable Learning (SL) strategy to address the dilemma. SL is a scalable method\nthat consists of Weight Consolidation and Entropy Increase and can be\nintegrated with other research. First, we apply Weight Consolidation to\nretain domain-invariant knowledge, and then we design Entropy Increase to avoid\nover-learning. Comparative experiments prove the effectiveness of SL, and we have also\nconducted extensive ablation experiments. Besides, we will release code\nincluding a variety of MSFUDA methods.\n","authors":["Yixin Chen","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12577v1","updated":"2023-07-24T07:49:01Z","published":"2023-07-24T07:49:01Z","title":"PRIOR: Prototype Representation Joint Learning from Medical Images and\n Reports","summary":" Contrastive learning-based vision-language joint pre-training has emerged as\na successful representation learning strategy. In this paper, we present a\nprototype representation learning framework incorporating both global and local\nalignment between medical images and reports. In contrast to standard global\nmulti-modality alignment methods, we employ a local alignment module for\nfine-grained representation. Furthermore, a cross-modality conditional\nreconstruction module is designed to interchange information across modalities\nin the training phase by reconstructing masked images and reports. For\nreconstructing long reports, a sentence-wise prototype memory bank is\nconstructed, enabling the network to focus on low-level localized visual and\nhigh-level clinical linguistic features. Additionally, a non-auto-regressive\ngeneration paradigm is proposed for reconstructing non-sequential reports.\nExperimental results on five downstream tasks, including supervised\nclassification, zero-shot classification, image-to-text retrieval, semantic\nsegmentation, and object detection, show that the proposed method outperforms other\nstate-of-the-art methods across multiple datasets and under different dataset\nsize settings. 
The code is available at https://github.com/QtacierP/PRIOR.\n","authors":["Pujin Cheng","Li Lin","Junyan Lyu","Yijin Huang","Wenhan Luo","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2307.12577v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12574v1","updated":"2023-07-24T07:46:06Z","published":"2023-07-24T07:46:06Z","title":"A Good Student is Cooperative and Reliable: CNN-Transformer\n Collaborative Learning for Semantic Segmentation","summary":" In this paper, we strive to answer the question \"how to collaboratively learn\nconvolutional neural network (CNN)-based and vision transformer (ViT)-based\nmodels by selecting and exchanging the reliable knowledge between them for\nsemantic segmentation?\" Accordingly, we propose an online knowledge\ndistillation (KD) framework that can simultaneously learn compact yet effective\nCNN-based and ViT-based models with two key technical breakthroughs to take\nfull advantage of CNNs and ViT while compensating their limitations. Firstly,\nwe propose heterogeneous feature distillation (HFD) to improve students'\nconsistency in low-layer feature space by mimicking heterogeneous features\nbetween CNNs and ViT. Secondly, to facilitate the two students to learn\nreliable knowledge from each other, we propose bidirectional selective\ndistillation (BSD) that can dynamically transfer selective knowledge. This is\nachieved by 1) region-wise BSD determining the directions of knowledge\ntransferred between the corresponding regions in the feature space and 2)\npixel-wise BSD discerning which of the prediction knowledge to be transferred\nin the logit space. Extensive experiments on three benchmark datasets\ndemonstrate that our proposed framework outperforms the state-of-the-art online\ndistillation methods by a large margin, and shows its efficacy in learning\ncollaboratively between ViT-based and CNN-based models.\n","authors":["Jinjing Zhu","Yunhao Luo","Xu Zheng","Hao Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12574v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2210.10495v3","updated":"2023-07-24T07:43:31Z","published":"2022-10-19T12:04:47Z","title":"ADPS: Asymmetric Distillation Post-Segmentation for Image Anomaly\n Detection","summary":" Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the\nteacher-student paradigm to detect and segment anomalous regions by contrasting\nthe unique features extracted by both networks. However, existing KDAD methods\nsuffer from two main limitations: 1) the student network can effortlessly\nreplicate the teacher network's representations, and 2) the features of the\nteacher network serve solely as a ``reference standard\" and are not fully\nleveraged. Toward this end, we depart from the established paradigm and instead\npropose an innovative approach called Asymmetric Distillation Post-Segmentation\n(ADPS). Our ADPS employs an asymmetric distillation paradigm that takes\ndistinct forms of the same image as the input of the teacher-student networks,\ndriving the student network to learn discriminating representations for\nanomalous regions.\n Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a\ncoarse anomaly localization mask that transfers the distilled knowledge\nacquired from the asymmetric paradigm to the teacher network. 
Equipped with\nWMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect\nand segment abnormal regions with fine structures and clear boundaries.\nExperimental results demonstrate that the proposed ADPS outperforms the\nstate-of-the-art methods in detecting and segmenting anomalies. Surprisingly,\nADPS significantly improves Average Precision (AP) metric by 9% and 20% on the\nMVTec AD and KolektorSDD2 datasets, respectively.\n","authors":["Peng Xing","Hao Tang","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2210.10495v3.pdf","comment":"11pages,9 figures"},{"id":"http://arxiv.org/abs/2307.12571v1","updated":"2023-07-24T07:39:22Z","published":"2023-07-24T07:39:22Z","title":"MataDoc: Margin and Text Aware Document Dewarping for Arbitrary Boundary","summary":" Document dewarping from a distorted camera-captured image is of great value\nfor OCR and document understanding. The document boundary plays an important\nrole which is more evident than the inner region in document dewarping. Current\nlearning-based methods mainly focus on complete boundary cases, leading to poor\ndocument correction performance of documents with incomplete boundaries. In\ncontrast to these methods, this paper proposes MataDoc, the first method\nfocusing on arbitrary boundary document dewarping with margin and text aware\nregularizations. Specifically, we design the margin regularization by\nexplicitly considering background consistency to enhance boundary perception.\nMoreover, we introduce word position consistency to keep text lines straight in\nrectified document images. To produce a comprehensive evaluation of MataDoc, we\npropose a novel benchmark ArbDoc, mainly consisting of document images with\narbitrary boundaries in four typical scenarios. Extensive experiments confirm\nthe superiority of MataDoc with consideration for the incomplete boundary on\nArbDoc and also demonstrate the effectiveness of the proposed method on\nDocUNet, DIR300, and WarpDoc datasets.\n","authors":["Beiya Dai","Xing li","Qunyi Xie","Yulin Li","Xiameng Qin","Chengquan Zhang","Kun Yao","Junyu Han"],"pdf_url":"https://arxiv.org/pdf/2307.12571v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2307.12560v1","updated":"2023-07-24T07:03:22Z","published":"2023-07-24T07:03:22Z","title":"Interpolating between Images with Diffusion Models","summary":" One little-explored frontier of image generation and editing is the task of\ninterpolating between two input images, a feature missing from all currently\ndeployed image generation pipelines. We argue that such a feature can expand\nthe creative applications of such models, and propose a method for zero-shot\ninterpolation using latent diffusion models. We apply interpolation in the\nlatent space at a sequence of decreasing noise levels, then perform denoising\nconditioned on interpolated text embeddings derived from textual inversion and\n(optionally) subject poses. For greater consistency, or to specify additional\ncriteria, we can generate several candidates and use CLIP to select the highest\nquality image. We obtain convincing interpolations across diverse subject\nposes, image styles, and image content, and show that standard quantitative\nmetrics such as FID are insufficient to measure the quality of an\ninterpolation. Code and data are available at\nhttps://clintonjwang.github.io/interpolation.\n","authors":["Clinton J. 
Wang","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2307.12560v1.pdf","comment":"Presented at ICML 2023 Workshop on Challenges of Deploying Generative\n AI"},{"id":"http://arxiv.org/abs/2203.01923v4","updated":"2023-07-24T06:59:56Z","published":"2022-03-03T18:56:08Z","title":"Recovering 3D Human Mesh from Monocular Images: A Survey","summary":" Estimating human pose and shape from monocular images is a long-standing\nproblem in computer vision. Since the release of statistical body models, 3D\nhuman mesh recovery has been drawing broader attention. With the same goal of\nobtaining well-aligned and physically plausible mesh results, two paradigms\nhave been developed to overcome challenges in the 2D-to-3D lifting process: i)\nan optimization-based paradigm, where different data terms and regularization\nterms are exploited as optimization objectives; and ii) a regression-based\nparadigm, where deep learning techniques are embraced to solve the problem in\nan end-to-end fashion. Meanwhile, continuous efforts are devoted to improving\nthe quality of 3D mesh labels for a wide range of datasets. Though remarkable\nprogress has been achieved in the past decade, the task is still challenging\ndue to flexible body motions, diverse appearances, complex environments, and\ninsufficient in-the-wild annotations. To the best of our knowledge, this is the\nfirst survey that focuses on the task of monocular 3D human mesh recovery. We\nstart with the introduction of body models and then elaborate recovery\nframeworks and training objectives by providing in-depth analyses of their\nstrengths and weaknesses. We also summarize datasets, evaluation metrics, and\nbenchmark results. Open issues and future directions are discussed in the end,\nhoping to motivate researchers and facilitate their research in this area. A\nregularly updated project page can be found at\nhttps://github.com/tinatiansjz/hmr-survey.\n","authors":["Yating Tian","Hongwen Zhang","Yebin Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2203.01923v4.pdf","comment":"Accepted to IEEE TPAMI, Survey on monocular 3D human mesh recovery,\n Project page: https://github.com/tinatiansjz/hmr-survey"},{"id":"http://arxiv.org/abs/2307.12558v1","updated":"2023-07-24T06:51:07Z","published":"2023-07-24T06:51:07Z","title":"Revisiting Event-based Video Frame Interpolation","summary":" Dynamic vision sensors or event cameras provide rich complementary\ninformation for video frame interpolation. Existing state-of-the-art methods\nfollow the paradigm of combining both synthesis-based and warping networks.\nHowever, few of those methods fully respect the intrinsic characteristics of\nevents streams. Given that event cameras only encode intensity changes and\npolarity rather than color intensities, estimating optical flow from events is\narguably more difficult than from RGB information. We therefore propose to\nincorporate RGB information in an event-guided optical flow refinement\nstrategy. Moreover, in light of the quasi-continuous nature of the time signals\nprovided by event cameras, we propose a divide-and-conquer strategy in which\nevent-based intermediate frame synthesis happens incrementally in multiple\nsimplified stages rather than in a single, long stage. Extensive experiments on\nboth synthetic and real-world datasets show that these modifications lead to\nmore reliable and realistic intermediate frame results than previous video\nframe interpolation methods. 
Our findings underline that a careful\nconsideration of event characteristics such as high temporal density and\nelevated noise benefits interpolation accuracy.\n","authors":["Jiaben Chen","Yichen Zhu","Dongze Lian","Jiaqi Yang","Yifu Wang","Renrui Zhang","Xinhang Liu","Shenhan Qian","Laurent Kneip","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2307.12558v1.pdf","comment":"Accepted by IROS2023 Project Site:\n https://jiabenchen.github.io/revisit_event"},{"id":"http://arxiv.org/abs/2307.12548v1","updated":"2023-07-24T06:33:52Z","published":"2023-07-24T06:33:52Z","title":"MFMAN-YOLO: A Method for Detecting Pole-like Obstacles in Complex\n Environment","summary":" In real-world traffic, there are various uncertainties and complexities in\nroad and weather conditions. To solve the problem that the feature information\nof pole-like obstacles in complex environments is easily lost, resulting in low\ndetection accuracy and low real-time performance, a multi-scale hybrid\nattention mechanism detection algorithm is proposed in this paper. First, the\noptimal transport function Monge-Kantorovich (MK) is incorporated not only to\nsolve the problem of overlapping multiple prediction frames with optimal\nmatching but also the MK function can be regularized to prevent model\nover-fitting; then, the features at different scales are up-sampled separately\naccording to the optimized efficient multi-scale feature pyramid. Finally, the\nextraction of multi-scale feature space channel information is enhanced in\ncomplex environments based on the hybrid attention mechanism, which suppresses\nthe irrelevant complex environment background information and focuses the\nfeature information of pole-like obstacles. Meanwhile, this paper conducts real\nroad test experiments in a variety of complex environments. The experimental\nresults show that the detection precision, recall, and average precision of the\nmethod are 94.7%, 93.1%, and 97.4%, respectively, and the detection frame rate\nis 400 f/s. This research method can detect pole-like obstacles in a complex\nroad environment in real time and accurately, which further promotes innovation\nand progress in the field of automatic driving.\n","authors":["Lei Cai","Hao Wang","Congling Zhou","Yongqiang Wang","Boyu Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12548v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2301.01482v5","updated":"2023-07-24T06:31:58Z","published":"2023-01-04T08:22:34Z","title":"Underwater Object Tracker: UOSTrack for Marine Organism Grasping of\n Underwater Vehicles","summary":" A visual single-object tracker is an indispensable component of underwater\nvehicles (UVs) in marine organism grasping tasks. Its accuracy and stability\nare imperative to guide the UVs to perform grasping behavior. Although\nsingle-object trackers show competitive performance in the challenge of\nunderwater image degradation, there are still issues with sample imbalance and\nexclusion of similar objects that need to be addressed for application in\nmarine organism grasping. This paper proposes Underwater OSTrack (UOSTrack),\nwhich consists of underwater image and open-air sequence hybrid training\n(UOHT), and motion-based post-processing (MBPP). The UOHT training paradigm is\ndesigned to train the sample-imbalanced underwater tracker so that the tracker\nis exposed to a great number of underwater domain training samples and learns\nthe feature expressions. The MBPP paradigm is proposed to exclude similar\nobjects. 
It uses the estimation box predicted with a Kalman filter and the\ncandidate boxes in the response map to relocate the lost tracked object in the\ncandidate area. UOSTrack achieves an average performance improvement of 4.41%,\nand a maximum of 7.98%, compared to state-of-the-art methods on various\nbenchmarks. Field experiments have verified the accuracy and stability of our\nproposed UOSTrack for UVs in marine organism grasping tasks. More details can\nbe found at https://github.com/LiYunfengLYF/UOSTrack.\n","authors":["Yunfeng Li","Bo Wang","Ye Li","Zhuoyan Liu","Wei Huo","Yueming Li","Jian Cao"],"pdf_url":"https://arxiv.org/pdf/2301.01482v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12545v1","updated":"2023-07-24T06:22:37Z","published":"2023-07-24T06:22:37Z","title":"Towards Video Anomaly Retrieval from Video Anomaly Detection: New\n Benchmarks and Model","summary":" Video anomaly detection (VAD) has received increasing attention due to its\npotential applications; its current dominant tasks focus on detecting\nanomalies online at the frame level, which can be roughly interpreted as binary\nor multiple event classification. However, such a setup that builds\nrelationships between complicated anomalous events and single labels, e.g.,\n``vandalism'', is superficial, since single labels are insufficient to\ncharacterize anomalous events. In reality, users tend to search for a specific\nvideo rather than a series of approximate videos. Therefore, retrieving\nanomalous events using detailed descriptions is practical and positive, but little\nresearch has focused on this. In this context, we propose a novel task called Video\nAnomaly Retrieval (VAR), which aims to pragmatically retrieve relevant\nanomalous videos by cross-modalities, e.g., language descriptions and\nsynchronous audio. Unlike current video retrieval, where videos are assumed\nto be temporally well-trimmed with short duration, VAR is devised to retrieve\nlong untrimmed videos which may be partially relevant to the given query. To\nachieve this, we present two large-scale VAR benchmarks, UCFCrime-AR and\nXDViolence-AR, constructed on top of prevalent anomaly datasets. Meanwhile, we\ndesign a model called Anomaly-Led Alignment Network (ALAN) for VAR. In ALAN, we\npropose an anomaly-led sampling to focus on key segments in long untrimmed\nvideos. Then, we introduce an efficient pretext task to enhance semantic\nassociations between video-text fine-grained representations. Besides, we\nleverage two complementary alignments to further match cross-modal contents.\nExperimental results on two benchmarks reveal the challenges of the VAR task and\nalso demonstrate the advantages of our tailored method.\n","authors":["Peng Wu","Jing Liu","Xiangteng He","Yuxin Peng","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12545v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2307.12542v1","updated":"2023-07-24T06:12:37Z","published":"2023-07-24T06:12:37Z","title":"Client-Level Differential Privacy via Adaptive Intermediary in Federated\n Medical Imaging","summary":" Despite recent progress in enhancing the privacy of federated learning (FL)\nvia differential privacy (DP), the trade-off of DP between privacy protection\nand performance is still underexplored for real-world medical scenarios. 
In this\npaper, we propose to optimize the trade-off under the context of client-level\nDP, which focuses on privacy during communications. However, FL for medical\nimaging involves typically much fewer participants (hospitals) than other\ndomains (e.g., mobile devices), thus ensuring clients be differentially private\nis much more challenging. To tackle this problem, we propose an adaptive\nintermediary strategy to improve performance without harming privacy.\nSpecifically, we theoretically find splitting clients into sub-clients, which\nserve as intermediaries between hospitals and the server, can mitigate the\nnoises introduced by DP without harming privacy. Our proposed approach is\nempirically evaluated on both classification and segmentation tasks using two\npublic datasets, and its effectiveness is demonstrated with significant\nperformance improvements and comprehensive analytical studies. Code is\navailable at: https://github.com/med-air/Client-DP-FL.\n","authors":["Meirui Jiang","Yuan Zhong","Anjie Le","Xiaoxiao Li","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12542v1.pdf","comment":"Accepted by 26th International Conference on Medical Image Computing\n and Computer Assisted Intervention (MICCAI'23)"},{"id":"http://arxiv.org/abs/2303.05021v3","updated":"2023-07-24T06:06:27Z","published":"2023-03-09T03:48:24Z","title":"DiffusionDepth: Diffusion Denoising Approach for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a challenging task that predicts the pixel-wise\ndepth from a single 2D image. Current methods typically model this problem as a\nregression or classification task. We propose DiffusionDepth, a new approach\nthat reformulates monocular depth estimation as a denoising diffusion process.\nIt learns an iterative denoising process to `denoise' random depth distribution\ninto a depth map with the guidance of monocular visual conditions. The process\nis performed in the latent space encoded by a dedicated depth encoder and\ndecoder. Instead of diffusing ground truth (GT) depth, the model learns to\nreverse the process of diffusing the refined depth of itself into random depth\ndistribution. This self-diffusion formulation overcomes the difficulty of\napplying generative models to sparse GT depth scenarios. The proposed approach\nbenefits this task by refining depth estimation step by step, which is superior\nfor generating accurate and highly detailed depth maps. Experimental results on\nKITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion\napproach could reach state-of-the-art performance in both indoor and outdoor\nscenarios with acceptable inference time.\n","authors":["Yiqun Duan","Xianda Guo","Zheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05021v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12540v1","updated":"2023-07-24T06:04:12Z","published":"2023-07-24T06:04:12Z","title":"SelFormaly: Towards Task-Agnostic Unified Anomaly Detection","summary":" The core idea of visual anomaly detection is to learn the normality from\nnormal images, but previous works have been developed specifically for certain\ntasks, leading to fragmentation among various tasks: defect detection, semantic\nanomaly detection, multi-class anomaly detection, and anomaly clustering. This\none-task-one-model approach is resource-intensive and incurs high maintenance\ncosts as the number of tasks increases. This paper presents SelFormaly, a\nuniversal and powerful anomaly detection framework. 
We emphasize the necessity\nof our off-the-shelf approach by pointing out a suboptimal issue with\nfluctuating performance in previous online encoder-based methods. In addition,\nwe question the effectiveness of using ConvNets as previously employed in the\nliterature and confirm that self-supervised ViTs are suitable for unified\nanomaly detection. We introduce back-patch masking and discover the new role of\ntop k-ratio feature matching to achieve unified and powerful anomaly detection.\nBack-patch masking eliminates irrelevant regions that possibly hinder\ntarget-centric detection with representations of the scene layout. The top\nk-ratio feature matching unifies various anomaly levels and tasks. Finally,\nSelFormaly achieves state-of-the-art results across various datasets for all\nthe aforementioned tasks.\n","authors":["Yujin Lee","Harin Lim","Hyunsoo Yoon"],"pdf_url":"https://arxiv.org/pdf/2307.12540v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12534v1","updated":"2023-07-24T05:43:34Z","published":"2023-07-24T05:43:34Z","title":"Towards Generalizable Deepfake Detection by Primary Region\n Regularization","summary":" The existing deepfake detection methods have reached a bottleneck in\ngeneralizing to unseen forgeries and manipulation approaches. Based on the\nobservation that the deepfake detectors exhibit a preference for overfitting\nthe specific primary regions in input, this paper enhances the generalization\ncapability from a novel regularization perspective. This can be simply achieved\nby augmenting the images through primary region removal, thereby preventing the\ndetector from over-relying on data bias. Our method consists of two stages,\nnamely the static localization for primary region maps, as well as the dynamic\nexploitation of primary region masks. The proposed method can be seamlessly\nintegrated into different backbones without affecting their inference\nefficiency. We conduct extensive experiments over three widely used deepfake\ndatasets - DFDC, DF-1.0, and Celeb-DF with five backbones. Our method\ndemonstrates an average performance improvement of 6% across different\nbackbones and performs competitively with several state-of-the-art baselines.\n","authors":["Harry Cheng","Yangyang Guo","Tianyi Wang","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2307.12534v1.pdf","comment":"12 pages. Code and Dataset: https://github.com/xaCheng1996/PRLE"},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. 
For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.18246v3","updated":"2023-07-24T05:35:30Z","published":"2023-03-31T17:59:09Z","title":"3D Human Pose Estimation via Intuitive Physics","summary":" Estimating 3D humans from images often produces implausible bodies that lean,\nfloat, or penetrate the floor. Such methods ignore the fact that bodies are\ntypically supported by the scene. A physics engine can be used to enforce\nphysical plausibility, but these are not differentiable, rely on unrealistic\nproxy bodies, and are difficult to integrate into existing optimization and\nlearning frameworks. In contrast, we exploit novel intuitive-physics (IP) terms\nthat can be inferred from a 3D SMPL body interacting with the scene. Inspired\nby biomechanics, we infer the pressure heatmap on the body, the Center of\nPressure (CoP) from the heatmap, and the SMPL body's Center of Mass (CoM). With\nthese, we develop IPMAN, to estimate a 3D body from a color image in a \"stable\"\nconfiguration by encouraging plausible floor contact and overlapping CoP and\nCoM. Our IP terms are intuitive, easy to implement, fast to compute,\ndifferentiable, and can be integrated into existing optimization and regression\nmethods. We evaluate IPMAN on standard datasets and MoYo, a new dataset with\nsynchronized multi-view images, ground-truth 3D bodies with complex poses,\nbody-floor contact, CoM and pressure. IPMAN produces more plausible results\nthan the state of the art, improving accuracy for static poses, while not\nhurting dynamic ones. Code and data are available for research at\nhttps://ipman.is.tue.mpg.de.\n","authors":["Shashank Tripathi","Lea Müller","Chun-Hao P. Huang","Omid Taheri","Michael J. Black","Dimitrios Tzionas"],"pdf_url":"https://arxiv.org/pdf/2303.18246v3.pdf","comment":"Accepted in CVPR'23. Project page: https://ipman.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. 
The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12517v1","updated":"2023-07-24T04:21:51Z","published":"2023-07-24T04:21:51Z","title":"Entropy Transformer Networks: A Learning Approach via Tangent Bundle\n Data Manifold","summary":" This paper focuses on an accurate and fast interpolation approach for image\ntransformation employed in the design of CNN architectures. Standard Spatial\nTransformer Networks (STNs) use bilinear or linear interpolation as their\ninterpolation, with unrealistic assumptions about the underlying data\ndistributions, which leads to poor performance under scale variations.\nMoreover, STNs do not preserve the norm of gradients in propagation due to\ntheir dependency on sparse neighboring pixels. To address this problem, a novel\nEntropy STN (ESTN) is proposed that interpolates on the data manifold\ndistributions. In particular, random samples are generated for each pixel in\nassociation with the tangent space of the data manifold and construct a linear\napproximation of their intensity values with an entropy regularizer to compute\nthe transformer parameters. A simple yet effective technique is also proposed\nto normalize the non-zero values of the convolution operation, to fine-tune the\nlayers for gradients' norm-regularization during training. Experiments on\nchallenging benchmarks show that the proposed ESTN can improve predictive\naccuracy over a range of computer vision tasks, including image reconstruction,\nand classification, while reducing the computational cost.\n","authors":["Pourya Shamsolmoali","Masoumeh Zareapoor"],"pdf_url":"https://arxiv.org/pdf/2307.12517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12539v2","updated":"2023-07-24T04:20:37Z","published":"2023-04-25T03:12:54Z","title":"Text-guided Eyeglasses Manipulation with Spatial Constraints","summary":" Virtual try-on of eyeglasses involves placing eyeglasses of different shapes\nand styles onto a face image without physically trying them on. While existing\nmethods have shown impressive results, the variety of eyeglasses styles is\nlimited and the interactions are not always intuitive or efficient. To address\nthese limitations, we propose a Text-guided Eyeglasses Manipulation method that\nallows for control of the eyeglasses shape and style based on a binary mask and\ntext, respectively. Specifically, we introduce a mask encoder to extract mask\nconditions and a modulation module that enables simultaneous injection of text\nand mask conditions. This design allows for fine-grained control of the\neyeglasses' appearance based on both textual descriptions and spatial\nconstraints. 
Our approach includes a disentangled mapper and a decoupling\nstrategy that preserves irrelevant areas, resulting in better local editing. We\nemploy a two-stage training scheme to handle the different convergence speeds\nof the various modality conditions, successfully controlling both the shape and\nstyle of eyeglasses. Extensive comparison experiments and ablation analyses\ndemonstrate the effectiveness of our approach in achieving diverse eyeglasses\nstyles while preserving irrelevant areas.\n","authors":["Jiacheng Wang","Ping Liu","Jingen Liu","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2304.12539v2.pdf","comment":"Revised version: add some experiments"},{"id":"http://arxiv.org/abs/2307.11466v2","updated":"2023-07-24T03:35:03Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet to segment materials with recovered hyperspectral images from RGB\nimages. The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v2.pdf","comment":"7 pages main paper"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. 
The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12502v1","updated":"2023-07-24T03:27:41Z","published":"2023-07-24T03:27:41Z","title":"Cross Contrastive Feature Perturbation for Domain Generalization","summary":" Domain generalization (DG) aims to learn a robust model from source domains\nthat generalize well on unseen target domains. Recent studies focus on\ngenerating novel domain samples or features to diversify distributions\ncomplementary to source domains. Yet, these approaches can hardly deal with the\nrestriction that the samples synthesized from various domains can cause\nsemantic distortion. In this paper, we propose an online one-stage Cross\nContrasting Feature Perturbation (CCFP) framework to simulate domain shift by\ngenerating perturbed features in the latent space while regularizing the model\nprediction against domain shift. Different from the previous fixed synthesizing\nstrategy, we design modules with learnable feature perturbations and semantic\nconsistency constraints. In contrast to prior work, our method does not use any\ngenerative-based models or domain labels. We conduct extensive experiments on a\nstandard DomainBed benchmark with a strict evaluation protocol for a fair\ncomparison. Comprehensive experiments show that our method outperforms the\nprevious state-of-the-art, and quantitative analyses illustrate that our\napproach can alleviate the domain shift problem in out-of-distribution (OOD)\nscenarios.\n","authors":["Chenming Li","Daoan Zhang","Wenjian Huang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.09186v4","updated":"2023-07-24T03:20:19Z","published":"2022-04-20T02:14:20Z","title":"Reconstruction-Aware Prior Distillation for Semi-supervised Point Cloud\n Completion","summary":" Real-world sensors often produce incomplete, irregular, and noisy point\nclouds, making point cloud completion increasingly important. However, most\nexisting completion methods rely on large paired datasets for training, which\nis labor-intensive. This paper proposes RaPD, a novel semi-supervised point\ncloud completion method that reduces the need for paired datasets. RaPD\nutilizes a two-stage training scheme, where a deep semantic prior is learned in\nstage 1 from unpaired complete and incomplete point clouds, and a\nsemi-supervised prior distillation process is introduced in stage 2 to train a\ncompletion network using only a small number of paired samples. 
Additionally, a\nself-supervised completion module is introduced to improve performance using\nunpaired incomplete point clouds. Experiments on multiple datasets show that\nRaPD outperforms previous methods in both homologous and heterologous\nscenarios.\n","authors":["Zhaoxin Fan","Yulin He","Zhicheng Wang","Kejian Wu","Hongyan Liu","Jun He"],"pdf_url":"https://arxiv.org/pdf/2204.09186v4.pdf","comment":"Accepted to IJCAI 2023"},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable to generate high-quality,\nrealistic adversarial examples by integrating gradients of the target\nclassifier interpretably. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective to generate unrestricted adversarial\nexamples, which outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09417v2","updated":"2023-07-24T03:06:15Z","published":"2022-08-19T16:04:29Z","title":"Target-oriented Sentiment Classification with Sequential Cross-modal\n Semantic Graph","summary":" Multi-modal aspect-based sentiment classification (MABSC) is task of\nclassifying the sentiment of a target entity mentioned in a sentence and an\nimage. However, previous methods failed to account for the fine-grained\nsemantic association between the image and the text, which resulted in limited\nidentification of fine-grained image aspects and opinions. To address these\nlimitations, in this paper we propose a new approach called SeqCSG, which\nenhances the encoder-decoder sentiment classification framework using\nsequential cross-modal semantic graphs. SeqCSG utilizes image captions and\nscene graphs to extract both global and local fine-grained image information\nand considers them as elements of the cross-modal semantic graph along with\ntokens from tweets. The sequential cross-modal semantic graph is represented as\na sequence with a multi-modal adjacency matrix indicating relationships between\nelements. Experimental results show that the approach outperforms existing\nmethods and achieves state-of-the-art performance on two standard datasets.\nFurther analysis has demonstrated that the model can implicitly learn the\ncorrelation between fine-grained information of the image and the text with the\ngiven target. Our code is available at https://github.com/zjukg/SeqCSG.\n","authors":["Yufeng Huang","Zhuo Chen","Jiaoyan Chen","Jeff Z. 
Pan","Zhen Yao","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.09417v2.pdf","comment":"ICANN 2023, https://github.com/zjukg/SeqCSG"},{"id":"http://arxiv.org/abs/2307.11411v2","updated":"2023-07-24T02:57:01Z","published":"2023-07-21T08:10:26Z","title":"Deep Directly-Trained Spiking Neural Networks for Object Detection","summary":" Spiking neural networks (SNNs) are brain-inspired energy-efficient models\nthat encode information in spatiotemporal dynamics. Recently, deep SNNs trained\ndirectly have shown great success in achieving high performance on\nclassification tasks with very few time steps. However, how to design a\ndirectly-trained SNN for the regression task of object detection still remains\na challenging problem. To address this problem, we propose EMS-YOLO, a novel\ndirectly-trained SNN framework for object detection, which is the first trial\nto train a deep SNN with surrogate gradients for object detection rather than\nANN-SNN conversion strategies. Specifically, we design a full-spike residual\nblock, EMS-ResNet, which can effectively extend the depth of the\ndirectly-trained SNN with low power consumption. Furthermore, we theoretically\nanalyze and prove the EMS-ResNet could avoid gradient vanishing or exploding.\nThe results demonstrate that our approach outperforms the state-of-the-art\nANN-SNN conversion methods (at least 500 time steps) in extremely fewer time\nsteps (only 4 time steps). It is shown that our model could achieve comparable\nperformance to the ANN with the same architecture while consuming 5.83 times\nless energy on the frame-based COCO Dataset and the event-based Gen1 Dataset.\n","authors":["Qiaoyi Su","Yuhong Chou","Yifan Hu","Jianing Li","Shijie Mei","Ziyang Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2307.11411v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12493v1","updated":"2023-07-24T02:50:44Z","published":"2023-07-24T02:50:44Z","title":"TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition","summary":" Text-driven diffusion models have exhibited impressive generative\ncapabilities, enabling various image editing tasks. In this paper, we propose\nTF-ICON, a novel Training-Free Image COmpositioN framework that harnesses the\npower of text-driven diffusion models for cross-domain image-guided\ncomposition. This task aims to seamlessly integrate user-provided objects into\na specific visual context. Current diffusion-based methods often involve costly\ninstance-based optimization or finetuning of pretrained models on customized\ndatasets, which can potentially undermine their rich prior. In contrast,\nTF-ICON can leverage off-the-shelf diffusion models to perform cross-domain\nimage-guided composition without requiring additional training, finetuning, or\noptimization. Moreover, we introduce the exceptional prompt, which contains no\ninformation, to facilitate text-driven diffusion models in accurately inverting\nreal images into latent representations, forming the basis for compositing. Our\nexperiments show that equipping Stable Diffusion with the exceptional prompt\noutperforms state-of-the-art inversion methods on various datasets (CelebA-HQ,\nCOCO, and ImageNet), and that TF-ICON surpasses prior baselines in versatile\nvisual domains. 
Code is available at https://github.com/Shilin-LU/TF-ICON\n","authors":["Shilin Lu","Yanzhu Liu","Adams Wai-Kin Kong"],"pdf_url":"https://arxiv.org/pdf/2307.12493v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.00932v2","updated":"2023-07-24T01:57:52Z","published":"2023-07-03T11:13:28Z","title":"A large calcium-imaging dataset reveals a systematic V4 organization for\n natural scenes","summary":" The visual system evolved to process natural scenes, yet most of our\nunderstanding of the topology and function of visual cortex derives from\nstudies using artificial stimuli. To gain deeper insights into visual\nprocessing of natural scenes, we utilized widefield calcium-imaging of primate\nV4 in response to many natural images, generating a large dataset of\ncolumnar-scale responses. We used this dataset to build a digital twin of V4\nvia deep learning, generating a detailed topographical map of natural image\npreferences at each cortical position. The map revealed clustered functional\ndomains for specific classes of natural image features. These ranged from\nsurface-related attributes like color and texture to shape-related features\nsuch as edges, curvature, and facial features. We validated the model-predicted\ndomains with additional widefield calcium-imaging and single-cell resolution\ntwo-photon imaging. Our study illuminates the detailed topological organization\nand neural codes in V4 that represent natural scenes.\n","authors":["Tianye Wang","Haoxuan Yao","Tai Sing Lee","Jiayi Hong","Yang Li","Hongfei Jiang","Ian Max Andolina","Shiming Tang"],"pdf_url":"https://arxiv.org/pdf/2307.00932v2.pdf","comment":"39 pages, 14 figures"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. 
Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT), which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.07916v2","updated":"2023-07-24T00:29:45Z","published":"2023-04-16T23:37:24Z","title":"GaitRef: Gait Recognition with Refined Sequential Skeletons","summary":" Identifying humans with their walking sequences, known as gait recognition,\nis a useful biometric understanding task as it can be observed from a long\ndistance and does not require cooperation from the subject. Two common\nmodalities used for representing the walking sequence of a person are\nsilhouettes and joint skeletons. Silhouette sequences, which record the\nboundary of the walking person in each frame, may suffer from appearance\nvariations caused by carried-on objects and the clothes of the person. Framewise joint\ndetections are noisy and introduce some jitters that are not consistent with\nsequential detections. In this paper, we combine the silhouettes and skeletons\nand refine the framewise joint predictions for gait recognition with temporal\ninformation from the silhouette sequences. We show that the refined skeletons\ncan improve gait recognition performance without extra annotations. We compare\nour method on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show\nstate-of-the-art performance.\n","authors":["Haidong Zhu","Wanrong Zheng","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2304.07916v2.pdf","comment":"IJCB 2023. Code is available at\n https://github.com/haidongz-usc/GaitRef"},{"id":"http://arxiv.org/abs/2307.12459v1","updated":"2023-07-24T00:03:09Z","published":"2023-07-24T00:03:09Z","title":"Robust face anti-spoofing framework with Convolutional Vision\n Transformer","summary":" Owing to the advances in image processing technology and large-scale\ndatasets, companies have implemented facial authentication processes, thereby\nstimulating increased focus on face anti-spoofing (FAS) against realistic\npresentation attacks. Recently, various attempts have been made to improve face\nrecognition performance using both global and local learning on face images;\nhowever, to the best of our knowledge, this is the first study to investigate\nwhether the robustness of FAS against domain shifts is improved by considering\nglobal information and local cues in face images captured using self-attention\nand convolutional layers. This study proposes a convolutional vision\ntransformer-based framework that achieves robust performance for various unseen\ndomain data. Our model resulted in 7.3%$p$ and 12.9%$p$ increases in FAS\nperformance compared to models using only a convolutional neural network or\nvision transformer, respectively. 
It also shows the highest average rank in\nsub-protocols of cross-dataset setting over the other nine benchmark models for\ndomain generalization.\n","authors":["Yunseung Lee","Youngjun Kwak","Jinho Shin"],"pdf_url":"https://arxiv.org/pdf/2307.12459v1.pdf","comment":"ICIP 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09683v2","updated":"2023-07-24T15:41:03Z","published":"2023-07-18T23:35:53Z","title":"PubMed and Beyond: Recent Advances and Best Practices in Biomedical\n Literature Search","summary":" Biomedical research yields a wealth of information, much of which is only\naccessible through the literature. Consequently, literature search is an\nessential tool for building on prior knowledge in clinical and biomedical\nresearch. Although recent improvements in artificial intelligence have expanded\nfunctionality beyond keyword-based search, these advances may be unfamiliar to\nclinicians and researchers. In response, we present a survey of literature\nsearch tools tailored to both general and specific information needs in\nbiomedicine, with the objective of helping readers efficiently fulfill their\ninformation needs. We first examine the widely used PubMed search engine,\ndiscussing recent improvements and continued challenges. We then describe\nliterature search tools catering to five specific information needs: 1.\nIdentifying high-quality clinical research for evidence-based medicine. 2.\nRetrieving gene-related information for precision medicine and genomics. 3.\nSearching by meaning, including natural language questions. 4. Locating related\narticles with literature recommendation. 5. Mining literature to discover\nassociations between concepts such as diseases and genetic variants.\nAdditionally, we cover practical considerations and best practices for choosing\nand using these tools. Finally, we provide a perspective on the future of\nliterature search engines, considering recent breakthroughs in large language\nmodels such as ChatGPT. In summary, our survey provides a comprehensive view of\nbiomedical literature search functionalities with 36 publicly available tools.\n","authors":["Qiao Jin","Robert Leaman","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.09683v2.pdf","comment":"27 pages, 6 figures, 36 tools"},{"id":"http://arxiv.org/abs/2307.12810v1","updated":"2023-07-24T14:00:07Z","published":"2023-07-24T14:00:07Z","title":"HeteFedRec: Federated Recommender Systems with Model Heterogeneity","summary":" Owing to the nature of privacy protection, federated recommender systems\n(FedRecs) have garnered increasing interest in the realm of on-device\nrecommender systems. However, most existing FedRecs only allow participating\nclients to collaboratively train a recommendation model of the same public\nparameter size. Training a model of the same size for all clients can lead to\nsuboptimal performance since clients possess varying resources. For example,\nclients with limited training data may prefer to train a smaller recommendation\nmodel to avoid excessive data consumption, while clients with sufficient data\nwould benefit from a larger model to achieve higher recommendation accuracy. To\naddress the above challenge, this paper introduces HeteFedRec, a novel FedRec\nframework that enables the assignment of personalized model sizes to\nparticipants. 
In HeteFedRec, we present a heterogeneous recommendation model\naggregation strategy, including a unified dual-task learning mechanism and a\ndimensional decorrelation regularization, to allow knowledge aggregation among\nrecommender models of different sizes. Additionally, a relation-based ensemble\nknowledge distillation method is proposed to effectively distil knowledge from\nheterogeneous item embeddings. Extensive experiments conducted on three\nreal-world recommendation datasets demonstrate the effectiveness and efficiency\nof HeteFedRec in training federated recommender systems under heterogeneous\nsettings.\n","authors":["Wei Yuan","Liang Qu","Lizhen Cui","Yongxin Tong","Xiaofang Zhou","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2307.12810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. Additionally, we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12756v1","updated":"2023-07-24T12:58:47Z","published":"2023-07-24T12:58:47Z","title":"Unbiased Delayed Feedback Label Correction for Conversion Rate\n Prediction","summary":" Conversion rate prediction is critical to many online applications such as\ndigital display advertising. To capture dynamic data distribution, industrial\nsystems often require retraining models on recent data daily or weekly.\nHowever, the delay of conversion behavior usually leads to incorrect labeling,\nwhich is called the delayed feedback problem. Existing work may fail to introduce\nthe correct information about false negative samples due to data sparsity and\ndynamic data distribution. To directly introduce the correct feedback label\ninformation, we propose an Unbiased delayed feedback Label Correction framework\n(ULC), which uses an auxiliary model to correct labels for observed negative\nfeedback samples. 
Firstly, we theoretically prove that the label-corrected loss\nis an unbiased estimate of the oracle loss using true labels. Then, as there\nare no ready training data for label correction, counterfactual labeling is\nused to construct artificial training data. Furthermore, since counterfactual\nlabeling utilizes only partial training data, we design an embedding-based\nalternative training method to enhance performance. Comparative experiments on\nboth public and private datasets and detailed analyses show that our proposed\napproach effectively alleviates the delayed feedback problem and consistently\noutperforms the previous state-of-the-art methods.\n","authors":["Yifan Wang","Peijie Sun","Min Zhang","Qinglin Jia","Jingjie Li","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2307.12756v1.pdf","comment":"accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. 
A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks: one is the insufficient number and\npoor quality of training samples; the other is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.12983v1","updated":"2023-07-24T17:59:37Z","published":"2023-07-24T17:59:37Z","title":"Parallel $Q$-Learning: Scaling Off-policy Reinforcement Learning under\n Massively Parallel Simulation","summary":" Reinforcement learning is time-consuming for complex tasks due to the need\nfor large amounts of training data. Recent advances in GPU-based simulation,\nsuch as Isaac Gym, have sped up data collection thousands of times on a\ncommodity GPU. Most prior works used on-policy methods like PPO due to their\nsimplicity and ease of scaling. Off-policy methods are more data efficient but\nchallenging to scale, resulting in a longer wall-clock training time. This\npaper presents a Parallel $Q$-Learning (PQL) scheme that outperforms PPO in\nwall-clock time while maintaining superior sample efficiency of off-policy\nlearning. PQL achieves this by parallelizing data collection, policy learning,\nand value learning. 
Different from prior works on distributed off-policy\nlearning, such as Apex, our scheme is designed specifically for massively\nparallel GPU-based simulation and optimized to work on a single workstation. In\nexperiments, we demonstrate that $Q$-learning can be scaled to \\textit{tens of\nthousands of parallel environments} and investigate important factors affecting\nlearning speed. The code is available at https://github.com/Improbable-AI/pql.\n","authors":["Zechu Li","Tao Chen","Zhang-Wei Hong","Anurag Ajay","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2307.12983v1.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi-view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2303.06147v2","updated":"2023-07-24T17:58:45Z","published":"2023-03-10T18:59:57Z","title":"Exphormer: Sparse Transformers for Graphs","summary":" Graph transformers have emerged as a promising architecture for a variety of\ngraph learning and representation tasks. Despite their successes, though, it\nremains challenging to scale graph transformers to large graphs while\nmaintaining accuracy competitive with message-passing networks. In this paper,\nwe introduce Exphormer, a framework for building powerful and scalable graph\ntransformers. 
Exphormer consists of a sparse attention mechanism based on two\nmechanisms: virtual global nodes and expander graphs, whose mathematical\ncharacteristics, such as spectral expansion, pseudorandomness, and sparsity,\nyield graph transformers with complexity only linear in the size of the graph,\nwhile allowing us to prove desirable theoretical properties of the resulting\ntransformer models. We show that incorporating Exphormer into the\nrecently-proposed GraphGPS framework produces models with competitive empirical\nresults on a wide variety of graph datasets, including state-of-the-art results\non three datasets. We also show that Exphormer can scale to datasets on larger\ngraphs than shown in previous graph transformer architectures. Code can be\nfound at \\url{https://github.com/hamed1375/Exphormer}.\n","authors":["Hamed Shirzad","Ameya Velingker","Balaji Venkatachalam","Danica J. Sutherland","Ali Kemal Sinop"],"pdf_url":"https://arxiv.org/pdf/2303.06147v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12979v1","updated":"2023-07-24T17:56:58Z","published":"2023-07-24T17:56:58Z","title":"An Isometric Stochastic Optimizer","summary":" The Adam optimizer is the standard choice in deep learning applications. I\npropose a simple explanation of Adam's success: it makes each parameter's step\nsize independent of the norms of the other parameters. 
Based on this principle\nI derive Iso, a new optimizer which makes the norm of a parameter's update\ninvariant to the application of any linear transformation to its inputs and\noutputs. I develop a variant of Iso called IsoAdam that allows optimal\nhyperparameters to be transferred from Adam, and demonstrate that IsoAdam\nobtains a speedup over Adam when training a small Transformer.\n","authors":["Jacob Jackson"],"pdf_url":"https://arxiv.org/pdf/2307.12979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12975v1","updated":"2023-07-24T17:50:24Z","published":"2023-07-24T17:50:24Z","title":"Provable Benefits of Policy Learning from Human Preferences in\n Contextual Bandit Problems","summary":" A crucial task in decision-making problems is reward engineering. It is\ncommon in practice that no obvious choice of reward function exists. Thus, a\npopular approach is to introduce human feedback during training and leverage\nsuch feedback to learn a reward function. Among all policy learning methods\nthat use human feedback, preference-based methods have demonstrated substantial\nsuccess in recent empirical applications such as InstructGPT. In this work, we\ndevelop a theory that provably shows the benefits of preference-based methods\nin offline contextual bandits. In particular, we improve the modeling and\nsuboptimality analysis for running policy learning methods on human-scored\nsamples directly. Then, we compare it with the suboptimality guarantees of\npreference-based methods and show that preference-based methods enjoy lower\nsuboptimality.\n","authors":["Xiang Ji","Huazheng Wang","Minshuo Chen","Tuo Zhao","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12971v1","updated":"2023-07-24T17:49:05Z","published":"2023-07-24T17:49:05Z","title":"Big Data - Supply Chain Management Framework for Forecasting: Data\n Preprocessing and Machine Learning Techniques","summary":" This article intends to systematically identify and comparatively analyze\nstate-of-the-art supply chain (SC) forecasting strategies and technologies. A\nnovel framework has been proposed incorporating Big Data Analytics in SC\nManagement (problem identification, data sources, exploratory data analysis,\nmachine-learning model training, hyperparameter tuning, performance evaluation,\nand optimization), forecasting effects on human-workforce, inventory, and\noverall SC. Initially, the need to collect data according to SC strategy and\nhow to collect them has been discussed. The article discusses the need for\ndifferent types of forecasting according to the period or SC objective. The SC\nKPIs and the error-measurement systems have been recommended to optimize the\ntop-performing model. The adverse effects of phantom inventory on forecasting\nand the dependence of managerial decisions on the SC KPIs for determining model\nperformance parameters and improving operations management, transparency, and\nplanning efficiency have been illustrated. The cyclic connection within the\nframework introduces preprocessing optimization based on the post-process KPIs,\noptimizing the overall control process (inventory management, workforce\ndetermination, cost, production and capacity planning). 
The contribution of\nthis research lies in the standard SC process framework proposal, recommended\nforecasting data analysis, forecasting effects on SC performance, machine\nlearning algorithms optimization followed, and in shedding light on future\nresearch.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Jungpil Shin","Istiyaque Ahmed Ridoy","Yoichi Tomioka","M. F. Mridha"],"pdf_url":"https://arxiv.org/pdf/2307.12971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12968v1","updated":"2023-07-24T17:46:32Z","published":"2023-07-24T17:46:32Z","title":"A Connection between One-Step Regularization and Critic Regularization\n in Reinforcement Learning","summary":" As with any machine learning problem with limited data, effective offline RL\nalgorithms require careful regularization to avoid overfitting. One-step\nmethods perform regularization by doing just a single step of policy\nimprovement, while critic regularization methods do many steps of policy\nimprovement with a regularized objective. These methods appear distinct.\nOne-step methods, such as advantage-weighted regression and conditional\nbehavioral cloning, truncate policy iteration after just one step. This ``early\nstopping'' makes one-step RL simple and stable, but can limit its asymptotic\nperformance. Critic regularization typically requires more compute but has\nappealing lower-bound guarantees. In this paper, we draw a close connection\nbetween these methods: applying a multi-step critic regularization method with\na regularization coefficient of 1 yields the same policy as one-step RL. While\npractical implementations violate our assumptions and critic regularization is\ntypically applied with smaller regularization coefficients, our experiments\nnevertheless show that our analysis makes accurate, testable predictions about\npractical offline RL methods (CQL and one-step RL) with commonly-used\nhyperparameters. Our results do not imply that every problem can be solved with a single\nstep of policy improvement, but rather that one-step RL might be competitive\nwith critic regularization on RL problems that demand strong regularization.\n","authors":["Benjamin Eysenbach","Matthieu Geist","Sergey Levine","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2307.12968v1.pdf","comment":"Accepted to ICML 2023. Video\n (https://www.youtube.com/watch?v=1xlixIHZ0R4) and code\n (https://github.com/ben-eysenbach/ac-connection)"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. 
Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12943v1","updated":"2023-07-24T17:15:38Z","published":"2023-07-24T17:15:38Z","title":"Efficiently Sampling the PSD Cone with the Metric Dikin Walk","summary":" Semi-definite programs represent a frontier of efficient computation. While\nthere has been much progress on semi-definite optimization, with moderate-sized\ninstances currently solvable in practice by the interior-point method, the\nbasic problem of sampling semi-definite solutions remains a formidable\nchallenge. The direct application of known polynomial-time algorithms for\nsampling general convex bodies to semi-definite sampling leads to a\nprohibitively high running time. In addition, known general methods require an\nexpensive rounding phase as pre-processing. 
Here we analyze the Dikin walk, by\nfirst adapting it to general metrics, then devising suitable metrics for the\nPSD cone with affine constraints. The resulting mixing time and per-step\ncomplexity are considerably smaller, and by an appropriate choice of the\nmetric, the dependence on the number of constraints can be made\npolylogarithmic. We introduce a refined notion of self-concordant matrix\nfunctions and give rules for combining different metrics. Along the way, we\nfurther develop the theory of interior-point methods for sampling.\n","authors":["Yunbum Kook","Santosh S. Vempala"],"pdf_url":"https://arxiv.org/pdf/2307.12943v1.pdf","comment":"54 pages"},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.08572v3","updated":"2023-07-24T17:01:50Z","published":"2023-07-17T15:38:11Z","title":"Revisiting the Robustness of the Minimum Error Entropy Criterion: A\n Transfer Learning Case Study","summary":" Coping with distributional shifts is an important part of transfer learning\nmethods in order to perform well in real-life tasks. However, most of the\nexisting approaches in this area either focus on an ideal scenario in which the\ndata does not contain noises or employ a complicated training paradigm or model\ndesign to deal with distributional shifts. In this paper, we revisit the\nrobustness of the minimum error entropy (MEE) criterion, a widely used\nobjective in statistical signal processing to deal with non-Gaussian noises,\nand investigate its feasibility and usefulness in real-life transfer learning\nregression tasks, where distributional shifts are common. Specifically, we put\nforward a new theoretical result showing the robustness of MEE against\ncovariate shift. We also show that by simply replacing the mean squared error\n(MSE) loss with the MEE on basic transfer learning algorithms such as\nfine-tuning and linear probing, we can achieve competitive performance with\nrespect to state-of-the-art transfer learning algorithms. 
We justify our\narguments on both synthetic data and 5 real-world time-series data.\n","authors":["Luis Pedro Silvestrin","Shujian Yu","Mark Hoogendoorn"],"pdf_url":"https://arxiv.org/pdf/2307.08572v3.pdf","comment":"Manuscript accepted at ECAI-23. Code available at\n https://github.com/lpsilvestrin/mee-finetune"},{"id":"http://arxiv.org/abs/2307.12926v1","updated":"2023-07-24T16:36:04Z","published":"2023-07-24T16:36:04Z","title":"Contextual Bandits and Imitation Learning via Preference-Based Active\n Queries","summary":" We consider the problem of contextual bandits and imitation learning, where\nthe learner lacks direct knowledge of the executed action's reward. Instead,\nthe learner can actively query an expert at each round to compare two actions\nand receive noisy preference feedback. The learner's objective is two-fold: to\nminimize the regret associated with the executed actions, while simultaneously,\nminimizing the number of comparison queries made to the expert. In this paper,\nwe assume that the learner has access to a function class that can represent\nthe expert's preference model under appropriate link functions, and provide an\nalgorithm that leverages an online regression oracle with respect to this\nfunction class for choosing its actions and deciding when to query. For the\ncontextual bandit setting, our algorithm achieves a regret bound that combines\nthe best of both worlds, scaling as $O(\\min\\{\\sqrt{T}, d/\\Delta\\})$, where $T$\nrepresents the number of interactions, $d$ represents the eluder dimension of\nthe function class, and $\\Delta$ represents the minimum preference of the\noptimal action over any suboptimal action under all contexts. Our algorithm\ndoes not require the knowledge of $\\Delta$, and the obtained regret bound is\ncomparable to what can be achieved in the standard contextual bandits setting\nwhere the learner observes reward signals at each round. Additionally, our\nalgorithm makes only $O(\\min\\{T, d^2/\\Delta^2\\})$ queries to the expert. We\nthen extend our algorithm to the imitation learning setting, where the learning\nagent engages with an unknown environment in episodes of length $H$ each, and\nprovide similar guarantees for regret and query complexity. Interestingly, our\nalgorithm for imitation learning can even learn to outperform the underlying\nexpert, when it is suboptimal, highlighting a practical benefit of\npreference-based feedback in imitation learning.\n","authors":["Ayush Sekhari","Karthik Sridharan","Wen Sun","Runzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2307.12926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12231v2","updated":"2023-07-24T16:00:37Z","published":"2023-04-24T16:18:22Z","title":"An Approximation Theory for Metric Space-Valued Functions With A View\n Towards Deep Learning","summary":" Motivated by the developing mathematics of deep learning, we build universal\nfunctions approximators of continuous maps between arbitrary Polish metric\nspaces $\\mathcal{X}$ and $\\mathcal{Y}$ using elementary functions between\nEuclidean spaces as building blocks. Earlier results assume that the target\nspace $\\mathcal{Y}$ is a topological vector space. We overcome this limitation\nby ``randomization'': our approximators output discrete probability measures\nover $\\mathcal{Y}$. 
When $\\mathcal{X}$ and $\\mathcal{Y}$ are Polish without\nadditional structure, we prove very general qualitative guarantees; when they\nhave suitable combinatorial structure, we prove quantitative guarantees for\nH\\\"{o}lder-like maps, including maps between finite graphs, solution operators\nto rough differential equations between certain Carnot groups, and continuous\nnon-linear operators between Banach spaces arising in inverse problems. In\nparticular, we show that the required number of Dirac measures is determined by\nthe combinatorial structure of $\\mathcal{X}$ and $\\mathcal{Y}$. For barycentric\n$\\mathcal{Y}$, including Banach spaces, $\\mathbb{R}$-trees, Hadamard manifolds,\nor Wasserstein spaces on Polish metric spaces, our approximators reduce to\n$\\mathcal{Y}$-valued functions. When the Euclidean approximators are neural\nnetworks, our constructions generalize transformer networks, providing a new\nprobabilistic viewpoint of geometric deep learning.\n","authors":["Anastasis Kratsios","Chong Liu","Matti Lassas","Maarten V. de Hoop","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2304.12231v2.pdf","comment":"14 Figures, 3 Tables, 78 Pages (Main 40, Proofs 26, Acknowledgments\n and References 12)"},{"id":"http://arxiv.org/abs/2307.12906v1","updated":"2023-07-24T15:59:36Z","published":"2023-07-24T15:59:36Z","title":"QAmplifyNet: Pushing the Boundaries of Supply Chain Backorder Prediction\n Using Interpretable Hybrid Quantum - Classical Neural Network","summary":" Supply chain management relies on accurate backorder prediction for\noptimizing inventory control, reducing costs, and enhancing customer\nsatisfaction. However, traditional machine-learning models struggle with\nlarge-scale datasets and complex relationships, hindering real-world data\ncollection. This research introduces a novel methodological framework for\nsupply chain backorder prediction, addressing the challenge of handling large\ndatasets. Our proposed model, QAmplifyNet, employs quantum-inspired techniques\nwithin a quantum-classical neural network to predict backorders effectively on\nshort and imbalanced datasets. Experimental evaluations on a benchmark dataset\ndemonstrate QAmplifyNet's superiority over classical models, quantum ensembles,\nquantum neural networks, and deep reinforcement learning. Its proficiency in\nhandling short, imbalanced datasets makes it an ideal solution for supply chain\nmanagement. To enhance model interpretability, we use Explainable Artificial\nIntelligence techniques. Practical implications include improved inventory\ncontrol, reduced backorders, and enhanced operational efficiency. QAmplifyNet\nseamlessly integrates into real-world supply chain management systems, enabling\nproactive decision-making and efficient resource allocation. Future work\ninvolves exploring additional quantum-inspired techniques, expanding the\ndataset, and investigating other supply chain applications. This research\nunlocks the potential of quantum computing in supply chain optimization and\npaves the way for further exploration of quantum-inspired machine learning\nmodels in supply chain management. Our framework and QAmplifyNet model offer a\nbreakthrough approach to supply chain backorder prediction, providing superior\nperformance and opening new avenues for leveraging quantum-inspired techniques\nin supply chain management.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Md. Saiful Islam","Jungpil Shin","M. F. 
Mridha","Yuichi Okuyama"],"pdf_url":"https://arxiv.org/pdf/2307.12906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12904v1","updated":"2023-07-24T15:52:33Z","published":"2023-07-24T15:52:33Z","title":"Universal Approximation Theorem and error bounds for quantum neural\n networks and quantum reservoirs","summary":" Universal approximation theorems are the foundations of classical neural\nnetworks, providing theoretical guarantees that the latter are able to\napproximate maps of interest. Recent results have shown that this can also be\nachieved in a quantum setting, whereby classical functions can be approximated\nby parameterised quantum circuits. We provide here precise error bounds for\nspecific classes of functions and extend these results to the interesting new\nsetup of randomised quantum circuits, mimicking classical reservoir neural\nnetworks. Our results show in particular that a quantum neural network with\n$\\mathcal{O}(\\varepsilon^{-2})$ weights and $\\mathcal{O} (\\lceil\n\\log_2(\\varepsilon^{-1}) \\rceil)$ qubits suffices to achieve accuracy\n$\\varepsilon>0$ when approximating functions with integrable Fourier transform.\n","authors":["Lukas Gonon","Antoine Jacquier"],"pdf_url":"https://arxiv.org/pdf/2307.12904v1.pdf","comment":"20 pages, 0 figure"},{"id":"http://arxiv.org/abs/2206.02909v2","updated":"2023-07-24T15:47:59Z","published":"2022-06-06T21:14:01Z","title":"Self-supervised Learning for Human Activity Recognition Using 700,000\n Person-days of Wearable Data","summary":" Advances in deep learning for human activity recognition have been relatively\nlimited due to the lack of large labelled datasets. In this study, we leverage\nself-supervised learning techniques on the UK-Biobank activity tracker\ndataset--the largest of its kind to date--containing more than 700,000\nperson-days of unlabelled wearable sensor data. Our resulting activity\nrecognition model consistently outperformed strong baselines across seven\nbenchmark datasets, with an F1 relative improvement of 2.5%-100% (median\n18.4%), the largest improvements occurring in the smaller datasets. In contrast\nto previous studies, our results generalise across external datasets, devices,\nand environments. Our open-source model will help researchers and developers to\nbuild customisable and generalisable activity classifiers with high\nperformance.\n","authors":["Hang Yuan","Shing Chan","Andrew P. Creagh","Catherine Tong","David A. Clifton","Aiden Doherty"],"pdf_url":"https://arxiv.org/pdf/2206.02909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12897v1","updated":"2023-07-24T15:44:30Z","published":"2023-07-24T15:44:30Z","title":"Anytime Model Selection in Linear Bandits","summary":" Model selection in the context of bandit optimization is a challenging\nproblem, as it requires balancing exploration and exploitation not only for\naction selection, but also for model selection. One natural approach is to rely\non online learning algorithms that treat different models as experts. Existing\nmethods, however, scale poorly ($\\text{poly}M$) with the number of models $M$\nin terms of their regret. Our key insight is that, for model selection in\nlinear bandits, we can emulate full-information feedback to the online learner\nwith a favorable bias-variance trade-off. This allows us to develop ALEXP,\nwhich has an exponentially improved ($\\log M$) dependence on $M$ for its\nregret. 
ALEXP has anytime guarantees on its regret, and neither requires\nknowledge of the horizon $n$, nor relies on an initial purely exploratory\nstage. Our approach utilizes a novel time-uniform analysis of the Lasso,\nestablishing a new connection between online learning and high-dimensional\nstatistics.\n","authors":["Parnian Kassraie","Aldo Pacchiano","Nicolas Emmenegger","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2307.12897v1.pdf","comment":"37 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12892v1","updated":"2023-07-24T15:42:33Z","published":"2023-07-24T15:42:33Z","title":"A Statistical View of Column Subset Selection","summary":" We consider the problem of selecting a small subset of representative\nvariables from a large dataset. In the computer science literature, this\ndimensionality reduction problem is typically formalized as Column Subset\nSelection (CSS). Meanwhile, the typical statistical formalization is to find an\ninformation-maximizing set of Principal Variables. This paper shows that these\ntwo approaches are equivalent, and moreover, both can be viewed as maximum\nlikelihood estimation within a certain semi-parametric model. Using these\nconnections, we show how to efficiently (1) perform CSS using only summary\nstatistics from the original dataset; (2) perform CSS in the presence of\nmissing and/or censored data; and (3) select the subset size for CSS in a\nhypothesis testing framework.\n","authors":["Anav Sood","Trevor Hastie"],"pdf_url":"https://arxiv.org/pdf/2307.12892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.13628v2","updated":"2023-07-24T15:31:05Z","published":"2021-08-31T05:38:36Z","title":"Learning Optimal Prescriptive Trees from Observational Data","summary":" We consider the problem of learning an optimal prescriptive tree (i.e., an\ninterpretable treatment assignment policy in the form of a binary tree) of\nmoderate depth, from observational data. 
This problem arises in numerous\nsocially important domains such as public health and personalized medicine,\nwhere interpretable and data-driven interventions are sought based on data\ngathered in deployment -- through passive collection of data -- rather than\nfrom randomized trials. We propose a method for learning optimal prescriptive\ntrees using mixed-integer optimization (MIO) technology. We show that under\nmild conditions our method is asymptotically exact in the sense that it\nconverges to an optimal out-of-sample treatment assignment policy as the number\nof historical data samples tends to infinity. Contrary to existing literature,\nour approach: 1) does not require data to be randomized, 2) does not impose\nstringent assumptions on the learned trees, and 3) has the ability to model\ndomain specific constraints. Through extensive computational experiments, we\ndemonstrate that our asymptotic guarantees translate to significant performance\nimprovements in finite samples, as well as showcase our uniquely flexible\nmodeling power by incorporating budget and fairness constraints.\n","authors":["Nathanael Jo","Sina Aghaei","Andrés Gómez","Phebe Vayanos"],"pdf_url":"https://arxiv.org/pdf/2108.13628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11389v3","updated":"2023-07-24T15:28:34Z","published":"2022-08-24T09:26:12Z","title":"Approximate blocked Gibbs sampling for Bayesian neural networks","summary":" In this work, minibatch MCMC sampling for feedforward neural networks is made\nmore feasible. To this end, it is proposed to sample subgroups of parameters\nvia a blocked Gibbs sampling scheme. By partitioning the parameter space,\nsampling is possible irrespective of layer width. It is also possible to\nalleviate vanishing acceptance rates for increasing depth by reducing the\nproposal variance in deeper layers. Increasing the length of a non-convergent\nchain increases the predictive accuracy in classification tasks, so avoiding\nvanishing acceptance rates and consequently enabling longer chain runs have\npractical benefits. Moreover, non-convergent chain realizations aid in the\nquantification of predictive uncertainty. An open problem is how to perform\nminibatch MCMC sampling for feedforward neural networks in the presence of\naugmented data.\n","authors":["Theodore Papamarkou"],"pdf_url":"https://arxiv.org/pdf/2208.11389v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). 
In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10891v2","updated":"2023-07-24T15:16:46Z","published":"2023-06-19T12:36:54Z","title":"Transformer Training Strategies for Forecasting Multiple Load Time\n Series","summary":" In the smart grid of the future, accurate load forecasts on the level of\nindividual clients can help to balance supply and demand locally and to prevent\ngrid outages. While the number of monitored clients will increase with the\nongoing smart meter rollout, the amount of data per client will always be\nlimited. We evaluate whether a Transformer load forecasting model benefits from\na transfer learning strategy, where a global univariate model is trained on the\nload time series from multiple clients. In experiments with two datasets\ncontaining load time series from several hundred clients, we find that the\nglobal training strategy is superior to the multivariate and local training\nstrategies used in related work. On average, the global training strategy\nresults in 21.8% and 12.8% lower forecasting errors than the two other\nstrategies, measured across forecasting horizons from one day to one month into\nthe future. A comparison to linear models, multi-layer perceptrons and LSTMs\nshows that Transformers are effective for load forecasting when they are\ntrained with the global training strategy.\n","authors":["Matthias Hertel","Maximilian Beichter","Benedikt Heidrich","Oliver Neumann","Benjamin Schäfer","Ralf Mikut","Veit Hagenmeyer"],"pdf_url":"https://arxiv.org/pdf/2306.10891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. 
However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non diffusion model to improve the efficiency and accuracy of substitute\ntraining. Although the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. By utilizing this data, it is possible to\ntrain a substitute model that closely resembles the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12862v1","updated":"2023-07-24T15:02:03Z","published":"2023-07-24T15:02:03Z","title":"Stochastic Step-wise Feature Selection for Exponential Random Graph\n Models (ERGMs)","summary":" Statistical analysis of social networks provides valuable insights into\ncomplex network interactions across various scientific disciplines. However,\naccurate modeling of networks remains challenging due to the heavy\ncomputational burden and the need to account for observed network dependencies.\nExponential Random Graph Models (ERGMs) have emerged as a promising technique\nused in social network modeling to capture network dependencies by\nincorporating endogenous variables. Nevertheless, using ERGMs poses multiple\nchallenges, including the occurrence of ERGM degeneracy, which generates\nunrealistic and meaningless network structures. To address these challenges and\nenhance the modeling of collaboration networks, we propose and test a novel\napproach that focuses on endogenous variable selection within ERGMs. Our method\naims to overcome the computational burden and improve the accommodation of\nobserved network dependencies, thereby facilitating more accurate and\nmeaningful interpretations of network phenomena in various scientific fields.\nWe conduct empirical testing and rigorous analysis to contribute to the\nadvancement of statistical techniques and offer practical insights for network\nanalysis.\n","authors":["Helal El-Zaatari","Fei Yu","Michael R Kosorok"],"pdf_url":"https://arxiv.org/pdf/2307.12862v1.pdf","comment":"23 pages, 6 tables and 18 figures"},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. 
However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12851v1","updated":"2023-07-24T14:51:54Z","published":"2023-07-24T14:51:54Z","title":"Early Neuron Alignment in Two-layer ReLU Networks with Small\n Initialization","summary":" This paper studies the problem of training a two-layer ReLU network for\nbinary classification using gradient flow with small initialization. We\nconsider a training dataset with well-separated input vectors: Any pair of\ninput data with the same label are positively correlated, and any pair with\ndifferent labels are negatively correlated. Our analysis shows that, during the\nearly phase of training, neurons in the first layer try to align with either\nthe positive data or the negative data, depending on its corresponding weight\non the second layer. A careful analysis of the neurons' directional dynamics\nallows us to provide an $\\mathcal{O}(\\frac{\\log n}{\\sqrt{\\mu}})$ upper bound on\nthe time it takes for all neurons to achieve good alignment with the input\ndata, where $n$ is the number of data points and $\\mu$ measures how well the\ndata are separated. After the early alignment phase, the loss converges to zero\nat a $\\mathcal{O}(\\frac{1}{t})$ rate, and the weight matrix on the first layer\nis approximately low-rank. Numerical experiments on the MNIST dataset\nillustrate our theoretical findings.\n","authors":["Hancheng Min","René Vidal","Enrique Mallada"],"pdf_url":"https://arxiv.org/pdf/2307.12851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12840v1","updated":"2023-07-24T14:37:22Z","published":"2023-07-24T14:37:22Z","title":"Efficiently Learning One-Hidden-Layer ReLU Networks via Schur\n Polynomials","summary":" We study the problem of PAC learning a linear combination of $k$ ReLU\nactivations under the standard Gaussian distribution on $\\mathbb{R}^d$ with\nrespect to the square loss. Our main result is an efficient algorithm for this\nlearning task with sample and computational complexity $(dk/\\epsilon)^{O(k)}$,\nwhere $\\epsilon>0$ is the target accuracy. Prior work had given an algorithm\nfor this problem with complexity $(dk/\\epsilon)^{h(k)}$, where the function\n$h(k)$ scales super-polynomially in $k$. 
Interestingly, the complexity of our\nalgorithm is near-optimal within the class of Correlational Statistical Query\nalgorithms. At a high-level, our algorithm uses tensor decomposition to\nidentify a subspace such that all the $O(k)$-order moments are small in the\northogonal directions. Its analysis makes essential use of the theory of Schur\npolynomials to show that the higher-moment error tensors are small given that\nthe lower-order ones are.\n","authors":["Ilias Diakonikolas","Daniel M. Kane"],"pdf_url":"https://arxiv.org/pdf/2307.12840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08272v3","updated":"2023-07-24T14:28:11Z","published":"2023-03-14T23:26:55Z","title":"Automated patent extraction powers generative modeling in focused\n chemical spaces","summary":" Deep generative models have emerged as an exciting avenue for inverse\nmolecular design, with progress coming from the interplay between training\nalgorithms and molecular representations. One of the key challenges in their\napplicability to materials science and chemistry has been the lack of access to\nsizeable training datasets with property labels. Published patents contain the\nfirst disclosure of new materials prior to their publication in journals, and\nare a vast source of scientific knowledge that has remained relatively untapped\nin the field of data-driven molecular design. Because patents are filed seeking\nto protect specific uses, molecules in patents can be considered to be weakly\nlabeled into application classes. Furthermore, patents published by the US\nPatent and Trademark Office (USPTO) are downloadable and have machine-readable\ntext and molecular structures. In this work, we train domain-specific\ngenerative models using patent data sources by developing an automated pipeline\nto go from USPTO patent digital files to the generation of novel candidates\nwith minimal human intervention. We test the approach on two in-class extracted\ndatasets, one in organic electronics and another in tyrosine kinase inhibitors.\nWe then evaluate the ability of generative models trained on these in-class\ndatasets on two categories of tasks (distribution learning and property\noptimization), identify strengths and limitations, and suggest possible\nexplanations and remedies that could be used to overcome these in practice.\n","authors":["Akshay Subramanian","Kevin P. Greenman","Alexis Gervaix","Tzuhsiung Yang","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2303.08272v3.pdf","comment":"Digital Discovery (2023)"},{"id":"http://arxiv.org/abs/2307.02620v2","updated":"2023-07-24T14:21:09Z","published":"2023-07-05T19:48:03Z","title":"Learning when to observe: A frugal reinforcement learning framework for\n a high-cost world","summary":" Reinforcement learning (RL) has been shown to learn sophisticated control\npolicies for complex tasks including games, robotics, heating and cooling\nsystems and text generation. The action-perception cycle in RL, however,\ngenerally assumes that a measurement of the state of the environment is\navailable at each time step without a cost. In applications such as materials\ndesign, deep-sea and planetary robot exploration and medicine, however, there\ncan be a high cost associated with measuring, or even approximating, the state\nof the environment. In this paper, we survey the recently growing literature\nthat adopts the perspective that an RL agent might not need, or even want, a\ncostly measurement at each time step. 
Within this context, we propose the Deep\nDynamic Multi-Step Observationless Agent (DMSOA), contrast it with the\nliterature and empirically evaluate it on OpenAI Gym and Atari Pong\nenvironments. Our results show that DMSOA learns a better policy with fewer\ndecision steps and measurements than the considered alternative from the\nliterature. The corresponding code is available at:\n\\url{https://github.com/cbellinger27/Learning-when-to-observe-in-RL}\n","authors":["Colin Bellinger","Mark Crowley","Isaac Tamblyn"],"pdf_url":"https://arxiv.org/pdf/2307.02620v2.pdf","comment":"Accepted for presentation at ECML-PKDD 2023 workshop track:\n Simplification, Compression, Efficiency and Frugality for Artificial\n Intelligence (SCEFA)"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data, which often\ncontains slight noise, somewhat enhances robustness.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02813v2","updated":"2023-07-24T14:17:24Z","published":"2023-07-06T07:18:22Z","title":"CPDG: A Contrastive Pre-Training Method for Dynamic Graph Neural\n Networks","summary":" Dynamic graph data mining has gained popularity in recent years due to the\nrich information contained in dynamic graphs and their widespread use in the\nreal world. Despite the advances in dynamic graph neural networks (DGNNs), the\nrich information and diverse downstream tasks have posed significant\ndifficulties for the practical application of DGNNs in industrial scenarios. To\nthis end, in this paper, we propose to address them by pre-training and present\nthe Contrastive Pre-Training Method for Dynamic Graph Neural Networks (CPDG).\nCPDG tackles the challenges of pre-training for DGNNs, including generalization\ncapability and long-short term modeling capability, through a flexible\nstructural-temporal subgraph sampler along with structural-temporal contrastive\npre-training schemes. 
Extensive experiments conducted on both large-scale\nresearch and industrial dynamic graph datasets show that CPDG outperforms\nexisting methods in dynamic graph pre-training for various downstream tasks\nunder three transfer settings.\n","authors":["Yuanchen Bei","Hao Xu","Sheng Zhou","Huixuan Chi","Haishuai Wang","Mengdi Zhang","Zhao Li","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2307.02813v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12797v1","updated":"2023-07-24T13:46:50Z","published":"2023-07-24T13:46:50Z","title":"Causal Fair Machine Learning via Rank-Preserving Interventional\n Distributions","summary":" A decision can be defined as fair if equal individuals are treated equally\nand unequals unequally. Adopting this definition, the task of designing machine\nlearning models that mitigate unfairness in automated decision-making systems\nmust include causal thinking when introducing protected attributes. Following a\nrecent proposal, we define individuals as being normatively equal if they are\nequal in a fictitious, normatively desired (FiND) world, where the protected\nattribute has no (direct or indirect) causal effect on the target. We propose\nrank-preserving interventional distributions to define an estimand of this FiND\nworld and a warping method for estimation. Evaluation criteria for both the\nmethod and resulting model are presented and validated through simulations and\nempirical data. With this, we show that our warping approach effectively\nidentifies the most discriminated individuals and mitigates unfairness.\n","authors":["Ludwig Bothmann","Susanne Dandl","Michael Schomaker"],"pdf_url":"https://arxiv.org/pdf/2307.12797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.05018v3","updated":"2023-07-24T13:46:46Z","published":"2022-07-11T17:13:10Z","title":"Learning Temporally Extended Skills in Continuous Domains as Symbolic\n Actions for Planning","summary":" Problems which require both long-horizon planning and continuous control\ncapabilities pose significant challenges to existing reinforcement learning\nagents. In this paper we introduce a novel hierarchical reinforcement learning\nagent which links temporally extended skills for continuous control with a\nforward model in a symbolic discrete abstraction of the environment's state for\nplanning. We term our agent SEADS for Symbolic Effect-Aware Diverse Skills. We\nformulate an objective and corresponding algorithm which leads to unsupervised\nlearning of a diverse set of skills through intrinsic motivation given a known\nstate abstraction. The skills are jointly learned with the symbolic forward\nmodel which captures the effect of skill execution in the state abstraction.\nAfter training, we can leverage the skills as symbolic actions using the\nforward model for long-horizon planning and subsequently execute the plan using\nthe learned continuous-action control skills. The proposed algorithm learns\nskills and forward models that can be used to solve complex tasks which require\nboth continuous control and long-horizon planning capabilities with high\nsuccess rate. It compares favorably with other flat and hierarchical\nreinforcement learning baseline agents and is successfully demonstrated with a\nreal robot.\n","authors":["Jan Achterhold","Markus Krimmel","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2207.05018v3.pdf","comment":"Project website (including video) is available at\n https://seads.is.tue.mpg.de/. 
(v2) Accepted for publication at the 6th\n Conference on Robot Learning (CoRL) 2022, Auckland, New Zealand. (v3) Added\n details on checkpointing (S.8.1), with references on p.7, p.8, p.21 to\n clarify number of env. steps of reported results"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. 
Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2307.12788v1","updated":"2023-07-24T13:35:18Z","published":"2023-07-24T13:35:18Z","title":"Analyzing the Strategy of Propaganda using Inverse Reinforcement\n Learning: Evidence from the 2022 Russian Invasion of Ukraine","summary":" The 2022 Russian invasion of Ukraine was accompanied by a large-scale,\npro-Russian propaganda campaign on social media. However, the strategy behind\nthe dissemination of propaganda has remained unclear, particularly how the\nonline discourse was strategically shaped by the propagandists' community.\nHere, we analyze the strategy of the Twitter community using an inverse\nreinforcement learning (IRL) approach. Specifically, IRL allows us to model\nonline behavior as a Markov decision process, where the goal is to infer the\nunderlying reward structure that guides propagandists when interacting with\nusers with a supporting or opposing stance toward the invasion. Thereby, we aim\nto understand empirically whether and how between-user interactions are\nstrategically used to promote the proliferation of Russian propaganda. For\nthis, we leverage a large-scale dataset with 349,455 posts with pro-Russian\npropaganda from 132,131 users. We show that bots and humans follow a different\nstrategy: bots respond predominantly to pro-invasion messages, suggesting that\nthey seek to drive virality; while messages indicating opposition primarily\nelicit responses from humans, suggesting that they tend to engage in critical\ndiscussions. To the best of our knowledge, this is the first study analyzing\nthe strategy behind propaganda from the 2022 Russian invasion of Ukraine\nthrough the lens of IRL.\n","authors":["Dominique Geissler","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2307.12788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. 
Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12771v1","updated":"2023-07-24T13:19:15Z","published":"2023-07-24T13:19:15Z","title":"Detecting disturbances in network-coupled dynamical systems with machine\n learning","summary":" Identifying disturbances in network-coupled dynamical systems without\nknowledge of the disturbances or underlying dynamics is a problem with a wide\nrange of applications. For example, one might want to know which nodes in the\nnetwork are being disturbed and identify the type of disturbance. Here we\npresent a model-free method based on machine learning to identify such unknown\ndisturbances based only on prior observations of the system when forced by a\nknown training function. 
We find that this method is able to identify the\nlocations and properties of many different types of unknown disturbances using\na variety of known forcing functions. We illustrate our results both with\nlinear and nonlinear disturbances using food web and neuronal activity models.\nFinally, we discuss how to scale our method to large networks.\n","authors":["Per Sebastian Skardal","Juan G. Restrepo"],"pdf_url":"https://arxiv.org/pdf/2307.12771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05732v6","updated":"2023-07-24T13:15:14Z","published":"2022-09-13T04:58:35Z","title":"Rényi Divergence Deep Mutual Learning","summary":" This paper revisits Deep Mutual Learning (DML), a simple yet effective\ncomputing paradigm. We propose using R\\'{e}nyi divergence instead of the KL\ndivergence, which is more flexible and tunable, to improve vanilla DML. This\nmodification is able to consistently improve performance over vanilla DML with\nlimited additional complexity. The convergence properties of the proposed\nparadigm are analyzed theoretically, and Stochastic Gradient Descent with a\nconstant learning rate is shown to converge with $\\mathcal{O}(1)$-bias in the\nworst case scenario for nonconvex optimization tasks. That is, learning will\nreach nearby local optima but continue searching within a bounded scope, which\nmay help mitigate overfitting. Finally, our extensive empirical results\ndemonstrate the advantage of combining DML and R\\'{e}nyi divergence, leading to\nfurther improvement in model generalization.\n","authors":["Weipeng Huang","Junjie Tao","Changbo Deng","Ming Fan","Wenqiang Wan","Qi Xiong","Guangyuan Piao"],"pdf_url":"https://arxiv.org/pdf/2209.05732v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. 
Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2212.07368v3","updated":"2023-07-24T12:53:23Z","published":"2022-12-14T17:46:17Z","title":"Shuffled Multi-Channel Sparse Signal Recovery","summary":" Mismatches between samples and their respective channel or target commonly\narise in several real-world applications. For instance, whole-brain calcium\nimaging of freely moving organisms, multiple-target tracking or multi-person\ncontactless vital sign monitoring may be severely affected by mismatched\nsample-channel assignments. To systematically address this fundamental problem,\nwe pose it as a signal reconstruction problem where we have lost\ncorrespondences between the samples and their respective channels. Assuming\nthat we have a sensing matrix for the underlying signals, we show that the\nproblem is equivalent to a structured unlabeled sensing problem, and establish\nsufficient conditions for unique recovery. To the best of our knowledge, a\nsampling result for the reconstruction of shuffled multi-channel signals has\nnot been considered in the literature and existing methods for unlabeled\nsensing cannot be directly applied. We extend our results to the case where the\nsignals admit a sparse representation in an overcomplete dictionary (i.e., the\nsensing matrix is not precisely known), and derive sufficient conditions for\nthe reconstruction of shuffled sparse signals. We propose a robust\nreconstruction method that combines sparse signal recovery with robust linear\nregression for the two-channel case. 
The performance and robustness of the\nproposed approach is illustrated in an application related to whole-brain\ncalcium imaging. The proposed methodology can be generalized to sparse signal\nrepresentations other than the ones considered in this work to be applied in a\nvariety of real-world problems with imprecise measurement or channel\nassignment.\n","authors":["Taulant Koka","Manolis C. Tsakiris","Michael Muma","Benjamín Béjar Haro"],"pdf_url":"https://arxiv.org/pdf/2212.07368v3.pdf","comment":"Submitted to TSP"},{"id":"http://arxiv.org/abs/2307.12754v1","updated":"2023-07-24T12:52:55Z","published":"2023-07-24T12:52:55Z","title":"Nonparametric Linear Feature Learning in Regression Through\n Regularisation","summary":" Representation learning plays a crucial role in automated feature selection,\nparticularly in the context of high-dimensional data, where non-parametric\nmethods often struggle. In this study, we focus on supervised learning\nscenarios where the pertinent information resides within a lower-dimensional\nlinear subspace of the data, namely the multi-index model. If this subspace\nwere known, it would greatly enhance prediction, computation, and\ninterpretation. To address this challenge, we propose a novel method for linear\nfeature learning with non-parametric prediction, which simultaneously estimates\nthe prediction function and the linear subspace. Our approach employs empirical\nrisk minimisation, augmented with a penalty on function derivatives, ensuring\nversatility. Leveraging the orthogonality and rotation invariance properties of\nHermite polynomials, we introduce our estimator, named RegFeaL. By utilising\nalternative minimisation, we iteratively rotate the data to improve alignment\nwith leading directions and accurately estimate the relevant dimension in\npractical settings. We establish that our method yields a consistent estimator\nof the prediction function with explicit rates. Additionally, we provide\nempirical results demonstrating the performance of RegFeaL in various\nexperiments.\n","authors":["Bertille Follain","Umut Simsekli","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2307.12754v1.pdf","comment":"43 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.12745v1","updated":"2023-07-24T12:36:05Z","published":"2023-07-24T12:36:05Z","title":"Concept-based explainability for an EEG transformer model","summary":" Deep learning models are complex due to their size, structure, and inherent\nrandomness in training procedures. Additional complexity arises from the\nselection of datasets and inductive biases. Addressing these challenges for\nexplainability, Kim et al. (2018) introduced Concept Activation Vectors (CAVs),\nwhich aim to understand deep models' internal states in terms of human-aligned\nconcepts. These concepts correspond to directions in latent space, identified\nusing linear discriminants. Although this method was first applied to image\nclassification, it was later adapted to other domains, including natural\nlanguage processing. In this work, we attempt to apply the method to\nelectroencephalogram (EEG) data for explainability in Kostas et al.'s BENDR\n(2021), a large-scale transformer model. A crucial part of this endeavor\ninvolves defining the explanatory concepts and selecting relevant datasets to\nground concepts in the latent space. Our focus is on two mechanisms for EEG\nconcept formation: the use of externally labeled EEG datasets, and the\napplication of anatomically defined concepts. 
The former approach is a\nstraightforward generalization of methods used in image classification, while\nthe latter is novel and specific to EEG. We present evidence that both\napproaches to concept formation yield valuable insights into the\nrepresentations learned by deep EEG models.\n","authors":["Anders Gjølbye Madsen","William Theodor Lehn-Schiøler","Áshildur Jónsdóttir","Bergdís Arnardóttir","Lars Kai Hansen"],"pdf_url":"https://arxiv.org/pdf/2307.12745v1.pdf","comment":"To appear in proceedings of 2023 IEEE International workshop on\n Machine Learning for Signal Processing"},{"id":"http://arxiv.org/abs/2207.09657v3","updated":"2023-07-24T12:35:18Z","published":"2022-07-20T05:22:26Z","title":"Reducing Training Time in Cross-Silo Federated Learning using Multigraph\n Topology","summary":" Federated learning is an active research topic since it enables several\nparticipants to jointly train a model without sharing local data. Currently,\ncross-silo federated learning is a popular training setting that utilizes a few\nhundred reliable data silos with high-speed access links to train a model.\nWhile this approach has been widely applied in real-world scenarios, designing\na robust topology to reduce the training time remains an open problem. In this\npaper, we present a new multigraph topology for cross-silo federated learning.\nWe first construct the multigraph using the overlay graph. We then parse this\nmultigraph into different simple graphs with isolated nodes. The existence of\nisolated nodes allows us to perform model aggregation without waiting for other\nnodes, hence effectively reducing the training time. Intensive experiments on\nthree public datasets show that our proposed method significantly reduces the\ntraining time compared with recent state-of-the-art topologies while\nmaintaining the accuracy of the learned model. Our code can be found at\nhttps://github.com/aioz-ai/MultigraphFL\n","authors":["Tuong Do","Binh X. Nguyen","Vuong Pham","Toan Tran","Erman Tjiputra","Quang Tran","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2207.09657v3.pdf","comment":"accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. This paper proposes BiofilmScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile invariant moments simultaneously measure the geometric characteristics of\nthe segmented cells with low errors. 
The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2306.16177v3","updated":"2023-07-24T12:32:58Z","published":"2023-06-28T12:58:42Z","title":"Defining data science: a new field of inquiry","summary":" Data science is not a science. It is a research paradigm. Its power, scope,\nand scale will surpass science, our most powerful research paradigm, to enable\nknowledge discovery and change our world. We have yet to understand and define\nit, vital to realizing its potential and managing its risks. Modern data\nscience is in its infancy. Emerging slowly since 1962 and rapidly since 2000,\nit is a fundamentally new field of inquiry, one of the most active, powerful,\nand rapidly evolving 21st century innovations. Due to its value, power, and\napplicability, it is emerging in over 40 disciplines, hundreds of research\nareas, and thousands of applications. Millions of data science publications\ncontain myriad definitions of data science and data science problem solving.\nDue to its infancy, many definitions are independent, application specific,\nmutually incomplete, redundant, or inconsistent, hence so is data science. This\nresearch addresses this data science multiple definitions challenge by\nproposing the development of coherent, unified definition based on a data\nscience reference framework using a data science journal for the data science\ncommunity to achieve such a definition. This paper provides candidate\ndefinitions for essential data science artifacts that are required to discuss\nsuch a definition. They are based on the classical research paradigm concept\nconsisting of a philosophy of data science, the data science problem solving\nparadigm, and the six component data science reference framework (axiology,\nontology, epistemology, methodology, methods, technology) that is a frequently\ncalled for unifying framework with which to define, unify, and evolve data\nscience. It presents challenges for defining data science, solution approaches,\ni.e., means for defining data science, and their requirements and benefits as\nthe basis of a comprehensive solution.\n","authors":["Michael L Brodie"],"pdf_url":"https://arxiv.org/pdf/2306.16177v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. 
This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12716v1","updated":"2023-07-24T11:55:32Z","published":"2023-07-24T11:55:32Z","title":"Safety Performance of Neural Networks in the Presence of Covariate Shift","summary":" Covariate shift may impact the operational safety performance of neural\nnetworks. A re-evaluation of the safety performance, however, requires\ncollecting new operational data and creating corresponding ground truth labels,\nwhich often is not possible during operation. We are therefore proposing to\nreshape the initial test set, as used for the safety performance evaluation\nprior to deployment, based on an approximation of the operational data. This\napproximation is obtained by observing and learning the distribution of\nactivation patterns of neurons in the network during operation. The reshaped\ntest set reflects the distribution of neuron activation values as observed\nduring operation, and may therefore be used for re-evaluating safety\nperformance in the presence of covariate shift. First, we derive conservative\nbounds on the values of neurons by applying finite binning and static dataflow\nanalysis. Second, we formulate a mixed integer linear programming (MILP)\nconstraint for constructing the minimum set of data points to be removed in the\ntest set, such that the difference between the discretized test and operational\ndistributions is bounded. We discuss potential benefits and limitations of this\nconstraint-based approach based on our initial experience with an implemented\nresearch prototype.\n","authors":["Chih-Hong Cheng","Harald Ruess","Konstantinos Theodorou"],"pdf_url":"https://arxiv.org/pdf/2307.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13871v2","updated":"2023-07-24T11:44:01Z","published":"2023-04-26T23:34:40Z","title":"Typical and atypical solutions in non-convex neural networks with\n discrete and continuous weights","summary":" We study the binary and continuous negative-margin perceptrons as simple\nnon-convex neural network models learning random rules and associations. We\nanalyze the geometry of the landscape of solutions in both models and find\nimportant similarities and differences. Both models exhibit subdominant\nminimizers which are extremely flat and wide. These minimizers coexist with a\nbackground of dominant solutions which are composed by an exponential number of\nalgorithmically inaccessible small clusters for the binary case (the frozen\n1-RSB phase) or a hierarchical structure of clusters of different sizes for the\nspherical case (the full RSB phase). 
In both cases, when a certain threshold in\nconstraint density is crossed, the local entropy of the wide flat minima\nbecomes non-monotonic, indicating a break-up of the space of robust solutions\ninto disconnected components. This has a strong impact on the behavior of\nalgorithms in binary models, which cannot access the remaining isolated\nclusters. For the spherical case the behaviour is different, since even beyond\nthe disappearance of the wide flat minima the remaining solutions are shown to\nalways be surrounded by a large number of other solutions at any distance, up\nto capacity. Indeed, we exhibit numerical evidence that algorithms seem to find\nsolutions up to the SAT/UNSAT transition, which we compute here using a 1RSB\napproximation. For both models, the generalization performance as a learning\ndevice is shown to be greatly improved by the existence of wide flat minimizers\neven when trained in the highly underconstrained regime of very negative\nmargins.\n","authors":["Carlo Baldassi","Enrico M. Malatesta","Gabriele Perugini","Riccardo Zecchina"],"pdf_url":"https://arxiv.org/pdf/2304.13871v2.pdf","comment":"34 pages, 13 figures"},{"id":"http://arxiv.org/abs/2210.17230v3","updated":"2023-07-24T11:43:26Z","published":"2022-10-31T11:15:48Z","title":"Lipschitz-regularized gradient flows and generative particle algorithms\n for high-dimensional scarce data","summary":" We build a new class of generative algorithms capable of efficiently learning\nan arbitrary target distribution from possibly scarce, high-dimensional data\nand subsequently generating new samples. These generative algorithms are\nparticle-based and are constructed as gradient flows of Lipschitz-regularized\nKullback-Leibler or other $f$-divergences, where data from a source\ndistribution can be stably transported as particles towards the vicinity of\nthe target distribution. As a highlighted result in data integration, we\ndemonstrate that the proposed algorithms correctly transport gene expression\ndata points with dimension exceeding 54K, while the sample size is typically\nonly in the hundreds.\n","authors":["Hyemin Gu","Panagiota Birmpa","Yannis Pantazis","Luc Rey-Bellet","Markos A. Katsoulakis"],"pdf_url":"https://arxiv.org/pdf/2210.17230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12703v1","updated":"2023-07-24T11:37:02Z","published":"2023-07-24T11:37:02Z","title":"Policy Gradient Optimal Correlation Search for Variance Reduction in\n Monte Carlo simulation and Maximum Optimal Transport","summary":" We propose a new algorithm for variance reduction when estimating $f(X_T)$\nwhere $X$ is the solution to some stochastic differential equation and $f$ is a\ntest function. The new estimator is $(f(X^1_T) + f(X^2_T))/2$, where $X^1$ and\n$X^2$ have the same marginal law as $X$ but are pathwise correlated so as to\nreduce the variance. The optimal correlation function $\\rho$ is approximated by\na deep neural network and is calibrated along the trajectories of $(X^1, X^2)$\nby policy gradient and reinforcement learning techniques. 
Finding an optimal\ncoupling given marginal laws has links with maximum optimal transport.\n","authors":["Pierre Bras","Gilles Pagès"],"pdf_url":"https://arxiv.org/pdf/2307.12703v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. 
The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2210.12583v2","updated":"2023-07-24T11:13:21Z","published":"2022-10-23T00:45:05Z","title":"Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model\n Predictive Control","summary":" Model-based control requires an accurate model of the system dynamics for\nprecisely and safely controlling the robot in complex and dynamic environments.\nMoreover, in the presence of variations in the operating conditions, the model\nshould be continuously refined to compensate for dynamics changes. In this\npaper, we present a self-supervised learning approach that actively models the\ndynamics of nonlinear robotic systems. We combine offline learning from past\nexperience and online learning from current robot interaction with the unknown\nenvironment. These two ingredients enable a highly sample-efficient and\nadaptive learning process, capable of accurately inferring model dynamics in\nreal-time even in operating regimes that greatly differ from the training\ndistribution. Moreover, we design an uncertainty-aware model predictive\ncontroller that is heuristically conditioned to the aleatoric (data)\nuncertainty of the learned dynamics. This controller actively chooses the\noptimal control actions that (i) optimize the control performance and (ii)\nimprove the efficiency of online learning sample collection. We demonstrate the\neffectiveness of our method through a series of challenging real-world\nexperiments using a quadrotor system. 
Our approach showcases high resilience\nand generalization capabilities by consistently adapting to unseen flight\nconditions, while it significantly outperforms classical and adaptive control\nbaselines.\n","authors":["Alessandro Saviolo","Jonathan Frey","Abhishek Rathod","Moritz Diehl","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2210.12583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12689v1","updated":"2023-07-24T11:04:22Z","published":"2023-07-24T11:04:22Z","title":"Addressing the Impact of Localized Training Data in Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) have achieved notable success in learning from\ngraph-structured data, owing to their ability to capture intricate dependencies\nand relationships between nodes. They excel in various applications, including\nsemi-supervised node classification, link prediction, and graph generation.\nHowever, it is important to acknowledge that the majority of state-of-the-art\nGNN models are built upon the assumption of an in-distribution setting, which\nhinders their performance on real-world graphs with dynamic structures. In this\narticle, we aim to assess the impact of training GNNs on localized subsets of\nthe graph. Such restricted training data may lead to a model that performs well\nin the specific region it was trained on but fails to generalize and make\naccurate predictions for the entire graph. In the context of graph-based\nsemi-supervised learning (SSL), resource constraints often lead to scenarios\nwhere the dataset is large, but only a portion of it can be labeled, affecting\nthe model's performance. This limitation affects tasks like anomaly detection\nor spam detection when labeling processes are biased or influenced by human\nsubjectivity. To tackle the challenges posed by localized training data, we\napproach the problem as an out-of-distribution (OOD) data issue by aligning\nthe distributions between the training data, which represents a small portion\nof labeled data, and the graph inference process that involves making\npredictions for the entire graph. We propose a regularization method to\nminimize distributional discrepancies between localized training data and graph\ninference, improving model performance on OOD data. Extensive tests on popular\nGNN models show significant performance improvement on three citation GNN\nbenchmark datasets. The regularization approach effectively enhances model\nadaptation and generalization, overcoming challenges posed by OOD data.\n","authors":["Singh Akansha"],"pdf_url":"https://arxiv.org/pdf/2307.12689v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.12679v1","updated":"2023-07-24T10:33:32Z","published":"2023-07-24T10:33:32Z","title":"An Estimator for the Sensitivity to Perturbations of Deep Neural\n Networks","summary":" For Deep Neural Networks (DNNs) to become useful in safety-critical\napplications, such as self-driving cars and disease diagnosis, they must be\nstable to perturbations in input and model parameters. Characterizing the\nsensitivity of a DNN to perturbations is necessary to determine the minimal\nbit-width precision that may be used to safely represent the network. However,\nno general result exists that is capable of predicting the sensitivity of a\ngiven DNN to round-off error, noise, or other perturbations in input. This\npaper derives an estimator that can predict such quantities. 
The estimator is\nderived via inequalities and matrix norms, and the resulting quantity is\nroughly analogous to a condition number for the entire neural network. An\napproximation of the estimator is tested on two Convolutional Neural Networks,\nAlexNet and VGG-19, using the ImageNet dataset. For each of these networks, the\ntightness of the estimator is explored via random perturbations and adversarial\nattacks.\n","authors":["Naman Maheshwari","Nicholas Malaya","Scott Moe","Jaydeep P. Kulkarni","Sudhanva Gurumurthi"],"pdf_url":"https://arxiv.org/pdf/2307.12679v1.pdf","comment":"Actual work and paper concluded in January 2019"},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses it to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the high-frequency components learning. We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12667v1","updated":"2023-07-24T10:14:51Z","published":"2023-07-24T10:14:51Z","title":"TransFusion: Generating Long, High Fidelity Time Series using Diffusion\n Models with Transformers","summary":" The generation of high-quality, long-sequenced time-series data is essential\ndue to its wide range of applications. In the past, standalone Recurrent and\nConvolutional Neural Network-based Generative Adversarial Networks (GAN) were\nused to synthesize time-series data. However, they are inadequate for\ngenerating long sequences of time-series data due to limitations in the\narchitecture. Furthermore, GANs are well known for their training instability\nand mode collapse problem. To address this, we propose TransFusion, a\ndiffusion, and transformers-based generative model to generate high-quality\nlong-sequence time-series data. We have stretched the sequence length to 384,\nand generated high-quality synthetic data. To the best of our knowledge, this\nis the first study that has been done with this long-sequence length. 
Also, we\nintroduce two evaluation metrics to evaluate the quality of the synthetic data\nas well as its predictive characteristics. We evaluate TransFusion with a wide\nvariety of visual and empirical metrics, and TransFusion outperforms the\nprevious state-of-the-art by a significant margin.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2307.12667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12660v1","updated":"2023-07-24T10:04:27Z","published":"2023-07-24T10:04:27Z","title":"Online Continual Learning in Keyword Spotting for Low-Resource Devices\n via Pooling High-Order Temporal Statistics","summary":" Keyword Spotting (KWS) models on embedded devices should adapt fast to new\nuser-defined words without forgetting previous ones. Embedded devices have\nlimited storage and computational resources, thus, they cannot save samples or\nupdate large models. We consider the setup of embedded online continual\nlearning (EOCL), where KWS models with frozen backbone are trained to\nincrementally recognize new words from a non-repeated stream of samples, seen\none at a time. To this end, we propose Temporal Aware Pooling (TAP) which\nconstructs an enriched feature space computing high-order moments of speech\nfeatures extracted by a pre-trained backbone. Our method, TAP-SLDA, updates a\nGaussian model for each class on the enriched feature space to effectively use\naudio representations. In experimental analyses, TAP-SLDA outperforms\ncompetitors on several setups, backbones, and baselines, bringing a relative\naverage gain of 11.3% on the GSC dataset.\n","authors":["Umberto Michieli","Pablo Peso Parada","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12660v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2306.12231v2","updated":"2023-07-24T09:36:05Z","published":"2023-06-21T12:44:52Z","title":"Predicting protein variants with equivariant graph neural networks","summary":" Pre-trained models have been successful in many protein engineering tasks.\nMost notably, sequence-based models have achieved state-of-the-art performance\non protein fitness prediction while structure-based models have been used\nexperimentally to develop proteins with enhanced functions. However, there is a\nresearch gap in comparing structure- and sequence-based methods for predicting\nprotein variants that are better than the wildtype protein. This paper aims to\naddress this gap by conducting a comparative study between the abilities of\nequivariant graph neural networks (EGNNs) and sequence-based approaches to\nidentify promising amino-acid mutations. The results show that our proposed\nstructural approach achieves a competitive performance to sequence-based\nmethods while being trained on significantly fewer molecules. 
Additionally, we\nfind that combining assay labelled data with structure pre-trained models\nyields similar trends as with sequence pre-trained models.\n Our code and trained models can be found at:\nhttps://github.com/semiluna/partIII-amino-acid-prediction.\n","authors":["Antonia Boca","Simon Mathis"],"pdf_url":"https://arxiv.org/pdf/2306.12231v2.pdf","comment":"4 pages, 2 figures, accepted to the 2023 ICML Workshop on\n Computational Biology"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. 
In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12636v1","updated":"2023-07-24T09:19:38Z","published":"2023-07-24T09:19:38Z","title":"Identifying drivers and mitigators for congestion and redispatch in the\n German electric power system with explainable AI","summary":" The transition to a sustainable energy supply challenges the operation of\nelectric power systems in manifold ways. Transmission grid loads increase as\nwind and solar power are often installed far away from the consumers. In\nextreme cases, system operators must intervene via countertrading or redispatch\nto ensure grid stability. In this article, we provide a data-driven analysis of\ncongestion in the German transmission grid. We develop an explainable machine\nlearning model to predict the volume of redispatch and countertrade on an\nhourly basis. The model reveals factors that drive or mitigate grid congestion\nand quantifies their impact. We show that, as expected, wind power generation\nis the main driver, but hydropower and cross-border electricity trading also\nplay an essential role. Solar power, on the other hand, has no mitigating\neffect. 
Our results suggest that a change to the market design would alleviate\ncongestion.\n","authors":["Maurizio Titz","Sebastian Pütz","Dirk Witthaut"],"pdf_url":"https://arxiv.org/pdf/2307.12636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14430v3","updated":"2023-07-24T09:15:02Z","published":"2022-09-28T21:31:43Z","title":"Minimax Optimal Kernel Operator Learning via Multilevel Training","summary":" Learning mappings between infinite-dimensional function spaces has achieved\nempirical success in many disciplines of machine learning, including generative\nmodeling, functional data analysis, causal inference, and multi-agent\nreinforcement learning. In this paper, we study the statistical limit of\nlearning a Hilbert-Schmidt operator between two infinite-dimensional Sobolev\nreproducing kernel Hilbert spaces. We establish the information-theoretic lower\nbound in terms of the Sobolev Hilbert-Schmidt norm and show that a\nregularization that learns the spectral components below the bias contour and\nignores the ones that are above the variance contour can achieve the optimal\nlearning rate. At the same time, the spectral components between the bias and\nvariance contours give us flexibility in designing computationally feasible\nmachine learning algorithms. Based on this observation, we develop a multilevel\nkernel operator learning algorithm that is optimal when learning linear\noperators between infinite-dimensional function spaces.\n","authors":["Jikai Jin","Yiping Lu","Jose Blanchet","Lexing Ying"],"pdf_url":"https://arxiv.org/pdf/2209.14430v3.pdf","comment":"ICLR 2023 spotlight"},{"id":"http://arxiv.org/abs/2307.12625v1","updated":"2023-07-24T08:56:25Z","published":"2023-07-24T08:56:25Z","title":"De-confounding Representation Learning for Counterfactual Inference on\n Continuous Treatment via Generative Adversarial Network","summary":" Counterfactual inference for continuous rather than binary treatment\nvariables is more common in real-world causal inference tasks. While there are\nalready some sample reweighting methods based on Marginal Structural Model for\neliminating the confounding bias, they generally focus on removing the\ntreatment's linear dependence on confounders and rely on the accuracy of the\nassumed parametric models, which are usually unverifiable. In this paper, we\npropose a de-confounding representation learning (DRL) framework for\ncounterfactual outcome estimation of continuous treatment by generating the\nrepresentations of covariates disentangled with the treatment variables. The\nDRL is a non-parametric model that eliminates both linear and nonlinear\ndependence between treatment and covariates. Specifically, we train the\ncorrelations between the de-confounded representations and the treatment\nvariables against the correlations between the covariate representations and\nthe treatment variables to eliminate confounding bias. Further, a\ncounterfactual inference network is embedded into the framework to make the\nlearned representations serve both de-confounding and trusted inference.\nExtensive experiments on synthetic datasets show that the DRL model performs\nsuperiorly in learning de-confounding representations and outperforms\nstate-of-the-art counterfactual inference models for continuous treatment\nvariables. 
In addition, we apply the DRL model to a real-world medical dataset\nMIMIC and demonstrate a detailed causal relationship between red cell width\ndistribution and mortality.\n","authors":["Yonghe Zhao","Qiang Huang","Haolong Zeng","Yun Pen","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.12625v1.pdf","comment":"15 pages,4 figures"},{"id":"http://arxiv.org/abs/2307.12617v1","updated":"2023-07-24T08:46:12Z","published":"2023-07-24T08:46:12Z","title":"Predicting Ordinary Differential Equations with Transformers","summary":" We develop a transformer-based sequence-to-sequence model that recovers\nscalar ordinary differential equations (ODEs) in symbolic form from irregularly\nsampled and noisy observations of a single solution trajectory. We demonstrate\nin extensive empirical evaluations that our model performs better or on par\nwith existing methods in terms of accurate recovery across various settings.\nMoreover, our method is efficiently scalable: after one-time pretraining on a\nlarge set of ODEs, we can infer the governing law of a new observed solution in\na few forward passes of the model.\n","authors":["Sören Becker","Michal Klein","Alexander Neitz","Giambattista Parascandolo","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2307.12617v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.09458v3","updated":"2023-07-24T08:32:40Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. 
However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Neel Nanda","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12601v1","updated":"2023-07-24T08:21:13Z","published":"2023-07-24T08:21:13Z","title":"Concept backpropagation: An Explainable AI approach for visualising\n learned concepts in neural network models","summary":" Neural network models are widely used in a variety of domains, often as\nblack-box solutions, since they are not directly interpretable for humans. The\nfield of explainable artificial intelligence aims at developing explanation\nmethods to address this challenge, and several approaches have been developed\nover the recent years, including methods for investigating what type of\nknowledge these models internalise during the training process. Among these,\nthe method of concept detection, investigates which \\emph{concepts} neural\nnetwork models learn to represent in order to complete their tasks. In this\nwork, we present an extension to the method of concept detection, named\n\\emph{concept backpropagation}, which provides a way of analysing how the\ninformation representing a given concept is internalised in a given neural\nnetwork model. 
In this approach, the model input is perturbed in a manner\nguided by a trained concept probe for the described model, such that the\nconcept of interest is maximised. This allows for the visualisation of the\ndetected concept directly in the input space of the model, which in turn makes\nit possible to see what information the model depends on for representing the\ndescribed concept. We present results for this method applied to a various set\nof input modalities, and discuss how our proposed method can be used to\nvisualise what information trained concept probes use, and the degree as to\nwhich the representation of the probed concept is entangled within the neural\nnetwork model itself.\n","authors":["Patrik Hammersborg","Inga Strümke"],"pdf_url":"https://arxiv.org/pdf/2307.12601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12594v1","updated":"2023-07-24T08:11:59Z","published":"2023-07-24T08:11:59Z","title":"Optimized data collection and analysis process for studying\n solar-thermal desalination by machine learning","summary":" An effective interdisciplinary study between machine learning and\nsolar-thermal desalination requires a sufficiently large and well-analyzed\nexperimental datasets. This study develops a modified dataset collection and\nanalysis process for studying solar-thermal desalination by machine learning.\nBased on the optimized water condensation and collection process, the proposed\nexperimental method collects over one thousand datasets, which is ten times\nmore than the average number of datasets in previous works, by accelerating\ndata collection and reducing the time by 83.3%. On the other hand, the effects\nof dataset features are investigated by using three different algorithms,\nincluding artificial neural networks, multiple linear regressions, and random\nforests. The investigation focuses on the effects of dataset size and range on\nprediction accuracy, factor importance ranking, and the model's generalization\nability. The results demonstrate that a larger dataset can significantly\nimprove prediction accuracy when using artificial neural networks and random\nforests. Additionally, the study highlights the significant impact of dataset\nsize and range on ranking the importance of influence factors. Furthermore, the\nstudy reveals that the extrapolation data range significantly affects the\nextrapolation accuracy of artificial neural networks. Based on the results,\nmassive dataset collection and analysis of dataset feature effects are\nimportant steps in an effective and consistent machine learning process flow\nfor solar-thermal desalination, which can promote machine learning as a more\ngeneral tool in the field of solar-thermal desalination.\n","authors":["Guilong Peng","Senshan Sun","Yangjun Qin","Zhenwei Xu","Juxin Du","Swellam W. sharshir","A. W. Kandel","A. E. Kabeel","Nuo Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12586v1","updated":"2023-07-24T07:58:18Z","published":"2023-07-24T07:58:18Z","title":"InVAErt networks: a data-driven framework for emulation, inference and\n identifiability analysis","summary":" Use of generative models and deep learning for physics-based systems is\ncurrently dominated by the task of emulation. However, the remarkable\nflexibility offered by data-driven architectures would suggest to extend this\nrepresentation to other aspects of system synthesis including model inversion\nand identifiability. We introduce inVAErt (pronounced \\emph{invert}) networks,\na comprehensive framework for data-driven analysis and synthesis of parametric\nphysical systems which uses a deterministic encoder and decoder to represent\nthe forward and inverse solution maps, normalizing flow to capture the\nprobabilistic distribution of system outputs, and a variational encoder\ndesigned to learn a compact latent representation for the lack of bijectivity\nbetween inputs and outputs. We formally investigate the selection of penalty\ncoefficients in the loss function and strategies for latent space sampling,\nsince we find that these significantly affect both training and testing\nperformance. We validate our framework through extensive numerical examples,\nincluding simple linear, nonlinear, and periodic maps, dynamical systems, and\nspatio-temporal PDEs.\n","authors":["Guoxiang Grayson Tong","Carlos A. Sing Long","Daniele E. Schiavazzi"],"pdf_url":"https://arxiv.org/pdf/2307.12586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09087v3","updated":"2023-07-24T07:55:19Z","published":"2023-06-15T12:33:39Z","title":"Deep learning based Meta-modeling for Multi-objective Technology\n Optimization of Electrical Machines","summary":" Optimization of rotating electrical machines is both time- and\ncomputationally expensive. Because of the different parametrization, design\noptimization is commonly executed separately for each machine technology. In\nthis paper, we present the application of a variational auto-encoder (VAE) to\noptimize two different machine technologies simultaneously, namely an\nasynchronous machine and a permanent magnet synchronous machine. After\ntraining, we employ a deep neural network and a decoder as meta-models to\npredict global key performance indicators (KPIs) and generate associated new\ndesigns, respectively, through unified latent space in the optimization loop.\nNumerical results demonstrate concurrent parametric multi-objective technology\noptimization in the high-dimensional design space. 
The VAE-based approach is\nquantitatively compared to a classical deep learning-based direct approach for\nKPIs prediction.\n","authors":["Vivek Parekh","Dominik Flore","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2306.09087v3.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2306.16264v2","updated":"2023-07-24T07:30:53Z","published":"2023-06-28T14:46:55Z","title":"Deep Unfolded Simulated Bifurcation for Massive MIMO Signal Detection","summary":" Multiple-input multiple-output (MIMO) is a key ingredient of next-generation\nwireless communications. Recently, various MIMO signal detectors based on deep\nlearning techniques and quantum(-inspired) algorithms have been proposed to\nimprove the detection performance compared with conventional detectors. This\npaper focuses on the simulated bifurcation (SB) algorithm, a quantum-inspired\nalgorithm. This paper proposes two techniques to improve its detection\nperformance. The first is modifying the algorithm inspired by the\nLevenberg-Marquardt algorithm to eliminate local minima of maximum likelihood\ndetection. The second is the use of deep unfolding, a deep learning technique\nto train the internal parameters of an iterative algorithm. We propose a\ndeep-unfolded SB by making the update rule of SB differentiable. The numerical\nresults show that these proposed detectors significantly improve the signal\ndetection performance in massive MIMO systems.\n","authors":["Satoshi Takabe"],"pdf_url":"https://arxiv.org/pdf/2306.16264v2.pdf","comment":"5pages, 4 figures; codes are available at\n https://github.com/s-takabe/unfolded_simbif"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decays. 
Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09251v2","updated":"2023-07-24T07:08:59Z","published":"2022-11-16T22:50:40Z","title":"Learning-Augmented B-Trees","summary":" We study learning-augmented binary search trees (BSTs) and B-Trees via Treaps\nwith composite priorities. The result is a simple search tree where the depth\nof each item is determined by its predicted weight $w_x$. To achieve the\nresult, each item $x$ has its composite priority\n$-\\lfloor\\log\\log(1/w_x)\\rfloor + U(0, 1)$ where $U(0, 1)$ is the uniform\nrandom variable. This generalizes the recent learning-augmented BSTs\n[Lin-Luo-Woodruff ICML`22], which only work for Zipfian distributions, to\narbitrary inputs and predictions. It also gives the first B-Tree data structure\nthat can provably take advantage of localities in the access sequence via\nonline self-reorganization. The data structure is robust to prediction errors\nand handles insertions, deletions, as well as prediction updates.\n","authors":["Xinyuan Cao","Jingbang Chen","Li Chen","Chris Lambert","Richard Peng","Daniel Sleator"],"pdf_url":"https://arxiv.org/pdf/2211.09251v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. 
To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12555v1","updated":"2023-07-24T06:41:59Z","published":"2023-07-24T06:41:59Z","title":"Homophily-Driven Sanitation View for Robust Graph Contrastive Learning","summary":" We investigate adversarial robustness of unsupervised Graph Contrastive\nLearning (GCL) against structural attacks. First, we provide a comprehensive\nempirical and theoretical analysis of existing attacks, revealing how and why\nthey downgrade the performance of GCL. Inspired by our analytic results, we\npresent a robust GCL framework that integrates a homophily-driven sanitation\nview, which can be learned jointly with contrastive learning. A key challenge\nthis poses, however, is the non-differentiable nature of the sanitation\nobjective. To address this challenge, we propose a series of techniques to\nenable gradient-based end-to-end robust GCL. Moreover, we develop a fully\nunsupervised hyperparameter tuning method which, unlike prior approaches, does\nnot require knowledge of node labels. We conduct extensive experiments to\nevaluate the performance of our proposed model, GCHS (Graph Contrastive\nLearning with Homophily-driven Sanitation View), against two state of the art\nstructural attacks on GCL. Our results demonstrate that GCHS consistently\noutperforms all state of the art baselines in terms of the quality of generated\nnode embeddings as well as performance on two important downstream tasks.\n","authors":["Yulin Zhu","Xing Ai","Yevgeniy Vorobeychik","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12551v1","updated":"2023-07-24T06:38:10Z","published":"2023-07-24T06:38:10Z","title":"Continuation Path Learning for Homotopy Optimization","summary":" Homotopy optimization is a traditional method to deal with a complicated\noptimization problem by solving a sequence of easy-to-hard surrogate\nsubproblems. However, this method can be very sensitive to the continuation\nschedule design and might lead to a suboptimal solution to the original\nproblem. In addition, the intermediate solutions, often ignored by classic\nhomotopy optimization, could be useful for many real-world applications. In\nthis work, we propose a novel model-based approach to learn the whole\ncontinuation path for homotopy optimization, which contains infinite\nintermediate solutions for any surrogate subproblems. 
Rather than the classic\nunidirectional easy-to-hard optimization, our method can simultaneously\noptimize the original problem and all surrogate subproblems in a collaborative\nmanner. The proposed model also supports real-time generation of any\nintermediate solution, which could be desirable for many applications.\nExperimental studies on different problems show that our proposed method can\nsignificantly improve the performance of homotopy optimization and provide\nextra helpful information to support better decision-making.\n","authors":["Xi Lin","Zhiyuan Yang","Xiaoyuan Zhang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12551v1.pdf","comment":"Accepted by the 40th International Conference on Machine Learning\n (ICML 2023)"},{"id":"http://arxiv.org/abs/2304.12438v2","updated":"2023-07-24T06:19:17Z","published":"2023-04-24T20:24:07Z","title":"Stochastic MPC for energy hubs using data driven demand forecasting","summary":" Energy hubs convert and distribute energy resources by combining different\nenergy inputs through multiple conversion and storage components. The optimal\noperation of the energy hub exploits its flexibility to increase the energy\nefficiency and reduce the operational costs. However, uncertainties in the\ndemand present challenges to energy hub optimization. In this paper, we propose\na stochastic MPC controller to minimize energy costs using chance constraints\nfor the uncertain electricity and thermal demands. Historical data is used to\nbuild a demand prediction model based on Gaussian processes to generate a\nforecast of the future electricity and heat demands. The stochastic\noptimization problem is solved via the Scenario Approach by sampling multi-step\ndemand trajectories from the derived prediction model. The performance of the\nproposed predictor and of the stochastic controller is verified on a simulated\nenergy hub model and demand data from a real building.\n","authors":["Varsha Behrunani","Francesco Micheli","Jonas Mehr","Philipp Heer","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2304.12438v2.pdf","comment":"6 pages, 5 figures. Submitted to IFAC World Congress 2023"},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. 
Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. 
Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12524v1","updated":"2023-07-24T04:46:22Z","published":"2023-07-24T04:46:22Z","title":"Landslide Surface Displacement Prediction Based on VSXC-LSTM Algorithm","summary":" Landslide is a natural disaster that can easily threaten local ecology,\npeople's lives and property. In this paper, we conduct modelling research on\nreal unidirectional surface displacement data of recent landslides in the\nresearch area and propose a time series prediction framework named\nVMD-SegSigmoid-XGBoost-ClusterLSTM (VSXC-LSTM) based on variational mode\ndecomposition, which can predict the landslide surface displacement more\naccurately. The model performs well on the test set. Except for the random item\nsubsequence that is hard to fit, the root mean square error (RMSE) and the mean\nabsolute percentage error (MAPE) of the trend item subsequence and the periodic\nitem subsequence are both less than 0.1, and the RMSE is as low as 0.006 for\nthe periodic item prediction module based on XGBoost\\footnote{Accepted in\nICANN2023}.\n","authors":["Menglin Kong","Ruichen Li","Fan Liu","Xingquan Li","Juan Cheng","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2307.12519v1","updated":"2023-07-24T04:29:00Z","published":"2023-07-24T04:29:00Z","title":"DEPHN: Different Expression Parallel Heterogeneous Network using virtual\n gradient optimization for Multi-task Learning","summary":" Recommendation system algorithm based on multi-task learning (MTL) is the\nmajor method for Internet operators to understand users and predict their\nbehaviors in the multi-behavior scenario of platform. Task correlation is an\nimportant consideration of MTL goals, traditional models use shared-bottom\nmodels and gating experts to realize shared representation learning and\ninformation differentiation. However, The relationship between real-world tasks\nis often more complex than existing methods do not handle properly sharing\ninformation. In this paper, we propose an Different Expression Parallel\nHeterogeneous Network (DEPHN) to model multiple tasks simultaneously. DEPHN\nconstructs the experts at the bottom of the model by using different feature\ninteraction methods to improve the generalization ability of the shared\ninformation flow. In view of the model's differentiating ability for different\ntask information flows, DEPHN uses feature explicit mapping and virtual\ngradient coefficient for expert gating during the training process, and\nadaptively adjusts the learning intensity of the gated unit by considering the\ndifference of gating values and task correlation. Extensive experiments on\nartificial and real-world datasets demonstrate that our proposed method can\ncapture task correlation in complex situations and achieve better performance\nthan baseline models\\footnote{Accepted in IJCNN2023}.\n","authors":["Menglin Kong","Ri Su","Shaojie Zhao","Muzhou Hou"],"pdf_url":"https://arxiv.org/pdf/2307.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks, one is the insufficient number and\npoor quality of training samples; another one is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. 
On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12510v1","updated":"2023-07-24T03:52:11Z","published":"2023-07-24T03:52:11Z","title":"An Empirical Evaluation of Temporal Graph Benchmark","summary":" In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark\n(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with\nTGB, we include eleven popular dynamic graph learning methods for more\nexhaustive comparisons. Through the experiments, we find that (1) some issues\nneed to be addressed in the current version of TGB, including mismatched data\nstatistics, inaccurate evaluation metric computation, and so on; (2) different\nmodels depict varying performance across various datasets, which is in line\nwith previous observations; (3) the performance of some baselines can be\nsignificantly improved over the reported results in TGB when using DyGLib. This\nwork aims to ease the researchers' efforts in evaluating various dynamic graph\nlearning methods on TGB and attempts to offer results that can be directly\nreferenced in the follow-up research. All the used resources in this project\nare publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is\nin progress, and feedback from the community is welcomed for improvements.\n","authors":["Le Yu"],"pdf_url":"https://arxiv.org/pdf/2307.12510v1.pdf","comment":"preprint, in progress"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. 
Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable to generate high-quality,\nrealistic adversarial examples by integrating gradients of the target\nclassifier interpretably. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective to generate unrestricted adversarial\nexamples, which outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12496v1","updated":"2023-07-24T03:04:10Z","published":"2023-07-24T03:04:10Z","title":"A faster and simpler algorithm for learning shallow networks","summary":" We revisit the well-studied problem of learning a linear combination of $k$\nReLU activations given labeled examples drawn from the standard $d$-dimensional\nGaussian measure. Chen et al. [CDG+23] recently gave the first algorithm for\nthis problem to run in $\\text{poly}(d,1/\\varepsilon)$ time when $k = O(1)$,\nwhere $\\varepsilon$ is the target error. More precisely, their algorithm runs\nin time $(d/\\varepsilon)^{\\mathrm{quasipoly}(k)}$ and learns over multiple\nstages. Here we show that a much simpler one-stage version of their algorithm\nsuffices, and moreover its runtime is only $(d/\\varepsilon)^{O(k^2)}$.\n","authors":["Sitan Chen","Shyam Narayanan"],"pdf_url":"https://arxiv.org/pdf/2307.12496v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2307.12491v1","updated":"2023-07-24T02:50:19Z","published":"2023-07-24T02:50:19Z","title":"Learning Universal and Robust 3D Molecular Representations with Graph\n Convolutional Networks","summary":" To learn accurate representations of molecules, it is essential to consider\nboth chemical and geometric features. To encode geometric information, many\ndescriptors have been proposed in constrained circumstances for specific types\nof molecules and do not have the properties to be ``robust\": 1. Invariant to\nrotations and translations; 2. Injective when embedding molecular structures.\nIn this work, we propose a universal and robust Directional Node Pair (DNP)\ndescriptor based on the graph representations of 3D molecules. 
Our DNP\ndescriptor is robust compared to previous ones and can be applied to multiple\nmolecular types. To combine the DNP descriptor and chemical features in\nmolecules, we construct the Robust Molecular Graph Convolutional Network\n(RoM-GCN) which is capable to take both node and edge features into\nconsideration when generating molecule representations. We evaluate our model\non protein and small molecule datasets. Our results validate the superiority of\nthe DNP descriptor in incorporating 3D geometric information of molecules.\nRoM-GCN outperforms all compared baselines.\n","authors":["Shuo Zhang","Yang Liu","Li Xie","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2307.12491v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2307.01482v2","updated":"2023-07-24T02:40:29Z","published":"2023-07-04T05:19:19Z","title":"Nexus sine qua non: Essentially Connected Networks for Traffic\n Forecasting","summary":" Spatial-temporal graph neural networks (STGNNs) have become the de facto\nmodels for learning spatiotemporal representations of traffic flow. However,\nmodern STGNNs often contain superfluous or obscure components, along with\ncomplex techniques, posing significant challenges in terms of complexity and\nscalability. Such concerns prompt us to rethink the design of neural\narchitectures and to identify the key challenges in traffic forecasting as\nspatial-temporal contextualization. Here, we present an essentially connected\nmodel based on an efficient message-passing backbone, powered by learnable node\nembedding, without any complex sequential techniques such as TCNs, RNNs, and\nTransformers. Intriguingly, empirical results demonstrate how a simple and\nelegant model with contextualization capability compares favorably w.r.t. the\nstate-of-the-art with elaborate structures, while being much more interpretable\nand computationally efficient for traffic forecasting. We anticipate that our\nfindings will open new horizons for further research to explore the possibility\nof creating simple but effective neural forecasting architectures.\n","authors":["Tong Nie","Guoyang Qin","Yunpeng Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04893v2","updated":"2023-07-24T02:38:09Z","published":"2023-07-10T20:31:23Z","title":"Choosing Well Your Opponents: How to Guide the Synthesis of Programmatic\n Strategies","summary":" This paper introduces Local Learner (2L), an algorithm for providing a set of\nreference strategies to guide the search for programmatic strategies in\ntwo-player zero-sum games. Previous learning algorithms, such as Iterated Best\nResponse (IBR), Fictitious Play (FP), and Double-Oracle (DO), can be\ncomputationally expensive or miss important information for guiding search\nalgorithms. 2L actively selects a set of reference strategies to improve the\nsearch signal. We empirically demonstrate the advantages of our approach while\nguiding a local search algorithm for synthesizing strategies in three games,\nincluding MicroRTS, a challenging real-time strategy game. Results show that 2L\nlearns reference strategies that provide a stronger search signal than IBR, FP,\nand DO. We also simulate a tournament of MicroRTS, where a synthesizer using 2L\noutperformed the winners of the two latest MicroRTS competitions, which were\nprogrammatic strategies written by human programmers.\n","authors":["Rubens O. Moraes","David S. Aleixo","Lucas N. Ferreira","Levi H. S. 
Lelis"],"pdf_url":"https://arxiv.org/pdf/2307.04893v2.pdf","comment":"International Joint Conference on Artificial Intelligence (IJCAI)\n 2023"},{"id":"http://arxiv.org/abs/2307.12480v1","updated":"2023-07-24T02:28:50Z","published":"2023-07-24T02:28:50Z","title":"Learning Resource Allocation Policy: Vertex-GNN or Edge-GNN?","summary":" Graph neural networks (GNNs) update the hidden representations of vertices\n(called Vertex-GNNs) or hidden representations of edges (called Edge-GNNs) by\nprocessing and pooling the information of neighboring vertices and edges and\ncombining to incorporate graph topology. When learning resource allocation\npolicies, GNNs cannot perform well if their expressive power are weak, i.e., if\nthey cannot differentiate all input features such as channel matrices. In this\npaper, we analyze the expressive power of the Vertex-GNNs and Edge-GNNs for\nlearning three representative wireless policies: link scheduling, power\ncontrol, and precoding policies. We find that the expressive power of the GNNs\ndepend on the linearity and output dimensions of the processing and combination\nfunctions. When linear processors are used, the Vertex-GNNs cannot\ndifferentiate all channel matrices due to the loss of channel information,\nwhile the Edge-GNNs can. When learning the precoding policy, even the\nVertex-GNNs with non-linear processors may not be with strong expressive\nability due to the dimension compression. We proceed to provide necessary\nconditions for the GNNs to well learn the precoding policy. Simulation results\nvalidate the analyses and show that the Edge-GNNs can achieve the same\nperformance as the Vertex-GNNs with much lower training and inference time.\n","authors":["Yao Peng","Jia Guo","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16392v2","updated":"2023-07-24T02:05:50Z","published":"2022-10-28T20:13:00Z","title":"Physics-aware Graph Neural Network for Accurate RNA 3D Structure\n Prediction","summary":" Biological functions of RNAs are determined by their three-dimensional (3D)\nstructures. Thus, given the limited number of experimentally determined RNA\nstructures, the prediction of RNA structures will facilitate elucidating RNA\nfunctions and RNA-targeted drug discovery, but remains a challenging task. In\nthis work, we propose a Graph Neural Network (GNN)-based scoring function\ntrained only with the atomic types and coordinates on limited solved RNA 3D\nstructures for distinguishing accurate structural models. The proposed\nPhysics-aware Multiplex Graph Neural Network (PaxNet) separately models the\nlocal and non-local interactions inspired by molecular mechanics. Furthermore,\nPaxNet contains an attention-based fusion module that learns the individual\ncontribution of each interaction type for the final prediction. We rigorously\nevaluate the performance of PaxNet on two benchmarks and compare it with\nseveral state-of-the-art baselines. The results show that PaxNet significantly\noutperforms all the baselines overall, and demonstrate the potential of PaxNet\nfor improving the 3D structure modeling of RNA and other macromolecules. 
Our\ncode is available at https://github.com/zetayue/Physics-aware-Multiplex-GNN.\n","authors":["Shuo Zhang","Yang Liu","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2210.16392v2.pdf","comment":"Accepted by the Machine Learning for Structural Biology Workshop\n (MLSB) at the 36th Conference on Neural Information Processing Systems\n (NeurIPS 2022)"},{"id":"http://arxiv.org/abs/2307.12472v1","updated":"2023-07-24T01:58:48Z","published":"2023-07-24T01:58:48Z","title":"Model-free generalized fiducial inference","summary":" Motivated by the need for the development of safe and reliable methods for\nuncertainty quantification in machine learning, I propose and develop ideas for\na model-free statistical framework for imprecise probabilistic prediction\ninference. This framework facilitates uncertainty quantification in the form of\nprediction sets that offer finite sample control of type 1 errors, a property\nshared with conformal prediction sets, but this new approach also offers more\nversatile tools for imprecise probabilistic reasoning. Furthermore, I propose\nand consider the theoretical and empirical properties of a precise\nprobabilistic approximation to the model-free imprecise framework.\nApproximating a belief/plausibility measure pair by an [optimal in some sense]\nprobability measure in the credal set is a critical resolution needed for the\nbroader adoption of imprecise probabilistic approaches to inference in\nstatistical and machine learning communities. It is largely undetermined in the\nstatistical and machine learning literatures, more generally, how to properly\nquantify uncertainty in that there is no generally accepted standard of\naccountability of stated uncertainties. The research I present in this\nmanuscript is aimed at motivating a framework for statistical inference with\nreliability and accountability as the guiding principles.\n","authors":["Jonathan P Williams"],"pdf_url":"https://arxiv.org/pdf/2307.12472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. 
To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT) which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12461v1","updated":"2023-07-24T00:16:50Z","published":"2023-07-24T00:16:50Z","title":"Rates of Approximation by ReLU Shallow Neural Networks","summary":" Neural networks activated by the rectified linear unit (ReLU) play a central\nrole in the recent development of deep learning. The topic of approximating\nfunctions from H\\\"older spaces by these networks is crucial for understanding\nthe efficiency of the induced learning algorithms. Although the topic has been\nwell investigated in the setting of deep neural networks with many layers of\nhidden neurons, it is still open for shallow networks having only one hidden\nlayer. In this paper, we provide rates of uniform approximation by these\nnetworks. We show that ReLU shallow neural networks with $m$ hidden neurons can\nuniformly approximate functions from the H\\\"older space $W_\\infty^r([-1, 1]^d)$\nwith rates $O((\\log m)^{\\frac{1}{2} +d}m^{-\\frac{r}{d}\\frac{d+2}{d+4}})$ when\n$r0.81, accuracy>0.90, and F1 score>0.90 in two of three datasets.\nInterestingly, the few-shot models outperformed the fine-tuned ones by 20% in\nboth Accuracy and F1 score for the YouTube Pseudoscience dataset, highlighting\nthe potential utility of this approach -- especially in the context of limited\ntraining data.\n","authors":["Christos Christodoulou","Nikos Salamanos","Pantelitsa Leonidou","Michail Papadakis","Michael Sirivianos"],"pdf_url":"https://arxiv.org/pdf/2307.12155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12134v1","updated":"2023-07-22T17:47:31Z","published":"2023-07-22T17:47:31Z","title":"Modality Confidence Aware Training for Robust End-to-End Spoken Language\n Understanding","summary":" End-to-end (E2E) spoken language understanding (SLU) systems that generate a\nsemantic parse from speech have become more promising recently. This approach\nuses a single model that utilizes audio and text representations from\npre-trained speech recognition models (ASR), and outperforms traditional\npipeline SLU systems in on-device streaming scenarios. However, E2E SLU systems\nstill show weakness when text representation quality is low due to ASR\ntranscription errors. To overcome this issue, we propose a novel E2E SLU system\nthat enhances robustness to ASR errors by fusing audio and text representations\nbased on the estimated modality confidence of ASR hypotheses. We introduce two\nnovel techniques: 1) an effective method to encode the quality of ASR\nhypotheses and 2) an effective approach to integrate them into E2E SLU models.\nWe show accuracy improvements on STOP dataset and share the analysis to\ndemonstrate the effectiveness of our approach.\n","authors":["Suyoun Kim","Akshat Shrivastava","Duc Le","Ju Lin","Ozlem Kalinli","Michael L. 
Seltzer"],"pdf_url":"https://arxiv.org/pdf/2307.12134v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2307.12131v1","updated":"2023-07-22T17:26:55Z","published":"2023-07-22T17:26:55Z","title":"Explainable Topic-Enhanced Argument Mining from Heterogeneous Sources","summary":" Given a controversial target such as ``nuclear energy'', argument mining aims\nto identify the argumentative text from heterogeneous sources. Current\napproaches focus on exploring better ways of integrating the target-associated\nsemantic information with the argumentative text. Despite their empirical\nsuccesses, two issues remain unsolved: (i) a target is represented by a word or\na phrase, which is insufficient to cover a diverse set of target-related\nsubtopics; (ii) the sentence-level topic information within an argument, which\nwe believe is crucial for argument mining, is ignored. To tackle the above\nissues, we propose a novel explainable topic-enhanced argument mining approach.\nSpecifically, with the use of the neural topic model and the language model,\nthe target information is augmented by explainable topic representations.\nMoreover, the sentence-level topic information within the argument is captured\nby minimizing the distance between its latent topic distribution and its\nsemantic representation through mutual learning. Experiments have been\nconducted on the benchmark dataset in both the in-target setting and the\ncross-target setting. Results demonstrate the superiority of the proposed model\nagainst the state-of-the-art baselines.\n","authors":["Jiasheng Si","Yingjie Zhu","Xingyu Shi","Deyu Zhou","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2307.12131v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12114v1","updated":"2023-07-22T15:58:17Z","published":"2023-07-22T15:58:17Z","title":"A Zero-shot and Few-shot Study of Instruction-Finetuned Large Language\n Models Applied to Clinical and Biomedical Tasks","summary":" We evaluate four state-of-the-art instruction-tuned large language models\n(LLMs) -- ChatGPT, Flan-T5 UL2, Tk-Instruct, and Alpaca -- on a set of 13\nreal-world clinical and biomedical natural language processing (NLP) tasks in\nEnglish, such as named-entity recognition (NER), question-answering (QA),\nrelation extraction (RE), etc. Our overall results demonstrate that the\nevaluated LLMs begin to approach performance of state-of-the-art models in\nzero- and few-shot scenarios for most tasks, and particularly well for the QA\ntask, even though they have never seen examples from these tasks before.\nHowever, we observed that the classification and RE tasks perform below what\ncan be achieved with a specifically trained model for the medical field, such\nas PubMedBERT. Finally, we noted that no LLM outperforms all the others on all\nthe studied tasks, with some models being better suited for certain tasks than\nothers.\n","authors":["Yanis Labrak","Mickael Rouvier","Richard Dufour"],"pdf_url":"https://arxiv.org/pdf/2307.12114v1.pdf","comment":"Under review process"},{"id":"http://arxiv.org/abs/2303.13379v2","updated":"2023-07-22T15:26:28Z","published":"2023-03-17T18:14:46Z","title":"Practical and Ethical Challenges of Large Language Models in Education:\n A Systematic Scoping Review","summary":" Educational technology innovations leveraging large language models (LLMs)\nhave shown the potential to automate the laborious process of generating and\nanalysing textual content. 
While various innovations have been developed to\nautomate a range of educational tasks (e.g., question generation, feedback\nprovision, and essay grading), there are concerns regarding the practicality\nand ethicality of these innovations. Such concerns may hinder future research\nand the adoption of LLMs-based innovations in authentic educational contexts.\nTo address this, we conducted a systematic scoping review of 118 peer-reviewed\npapers published since 2017 to pinpoint the current state of research on using\nLLMs to automate and support educational tasks. The findings revealed 53 use\ncases for LLMs in automating education tasks, categorised into nine main\ncategories: profiling/labelling, detection, grading, teaching support,\nprediction, knowledge representation, feedback, content generation, and\nrecommendation. Additionally, we also identified several practical and ethical\nchallenges, including low technological readiness, lack of replicability and\ntransparency, and insufficient privacy and beneficence considerations. The\nfindings were summarised into three recommendations for future studies,\nincluding updating existing innovations with state-of-the-art models (e.g.,\nGPT-3/4), embracing the initiative of open-sourcing models/systems, and\nadopting a human-centred approach throughout the developmental process. As the\nintersection of AI and education is continuously evolving, the findings of this\nstudy can serve as an essential reference point for researchers, allowing them\nto leverage the strengths, learn from the limitations, and uncover potential\nresearch opportunities enabled by ChatGPT and other generative AI models.\n","authors":["Lixiang Yan","Lele Sha","Linxuan Zhao","Yuheng Li","Roberto Martinez-Maldonado","Guanliang Chen","Xinyu Li","Yueqiao Jin","Dragan Gašević"],"pdf_url":"https://arxiv.org/pdf/2303.13379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10025v2","updated":"2023-07-22T15:07:57Z","published":"2023-07-19T15:09:50Z","title":"An Empirical Study on Fertility Proposals Using Multi-Grained Topic\n Analysis Methods","summary":" Fertility issues are closely related to population security, in 60 years\nChina's population for the first time in a negative growth trend, the change of\nfertility policy is of great concern to the community. 2023 \"two sessions\"\nproposal \"suggests that the country in the form of legislation, the birth of\nthe registration of the cancellation of the marriage restriction\" This topic\nwas once a hot topic on the Internet, and \"unbundling\" the relationship between\nbirth registration and marriage has become the focus of social debate. In this\npaper, we adopt co-occurrence semantic analysis, topic analysis and sentiment\nanalysis to conduct multi-granularity semantic analysis of microblog comments.\nIt is found that the discussion on the proposal of \"removing marriage\nrestrictions from birth registration\" involves the individual, society and the\nstate at three dimensions, and is detailed into social issues such as personal\nbehaviour, social ethics and law, and national policy, with people's sentiment\ninclined to be negative in most of the topics. 
Based on this, eight proposals\nwere made to provide a reference for governmental decision making and to form a\nreference method for researching public opinion on political issues.\n","authors":["Yulin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.10025v2.pdf","comment":"7 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2212.05767v7","updated":"2023-07-22T13:05:21Z","published":"2022-12-12T08:40:04Z","title":"A Survey of Knowledge Graph Reasoning on Graph Types: Static, Dynamic,\n and Multimodal","summary":" Knowledge graph reasoning (KGR), aiming to deduce new facts from existing\nfacts based on mined logic rules underlying knowledge graphs (KGs), has become\na fast-growing research direction. It has been proven to significantly benefit\nthe usage of KGs in many AI applications, such as question answering,\nrecommendation systems, and etc. According to the graph types, existing KGR\nmodels can be roughly divided into three categories, i.e., static models,\ntemporal models, and multi-modal models. Early works in this domain mainly\nfocus on static KGR, and recent works try to leverage the temporal and\nmulti-modal information, which are more practical and closer to real-world.\nHowever, no survey papers and open-source repositories comprehensively\nsummarize and discuss models in this important direction. To fill the gap, we\nconduct a first survey for knowledge graph reasoning tracing from static to\ntemporal and then to multi-modal KGs. Concretely, the models are reviewed based\non bi-level taxonomy, i.e., top-level (graph types) and base-level (techniques\nand scenarios). Besides, the performances, as well as datasets, are summarized\nand presented. Moreover, we point out the challenges and potential\nopportunities to enlighten the readers. The corresponding open-source\nrepository is shared on GitHub\nhttps://github.com/LIANGKE23/Awesome-Knowledge-Graph-Reasoning.\n","authors":["Ke Liang","Lingyuan Meng","Meng Liu","Yue Liu","Wenxuan Tu","Siwei Wang","Sihang Zhou","Xinwang Liu","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2212.05767v7.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2306.17727v2","updated":"2023-07-22T12:41:04Z","published":"2023-06-30T15:16:52Z","title":"Improved NL2SQL based on Multi-layer Expert Network","summary":" The Natural Language to SQL (NL2SQL) technique is used to convert natural\nlanguage queries into executable SQL statements. Typically, slot-filling is\nemployed as a classification method for multi-task cases to achieve this goal.\nHowever, slot-filling can result in inaccurate SQL statement generation due to\nnegative migration issues arising from different classification tasks. To\novercome this limitation, this study introduces a new approach called\nMulti-Layer Expert Generate SQL (MLEG-SQL), which utilizes a dedicated\nmulti-task hierarchical network. The lower layer of the network extracts\nsemantic features of natural language statements, while the upper layer builds\na specialized expert system for handling specific classification tasks. This\nhierarchical approach mitigates performance degradation resulting from\ndifferent task conflicts. 
The proposed method was evaluated on the WiKSQL\ndataset and was found to be effective in generating accurate SQL statements.\n","authors":["Chenduo Hao","Xu Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.17727v2.pdf","comment":"the paper's figure has something wrong"},{"id":"http://arxiv.org/abs/2307.12045v1","updated":"2023-07-22T10:35:25Z","published":"2023-07-22T10:35:25Z","title":"Revisiting Distillation for Continual Learning on Visual Question\n Localized-Answering in Robotic Surgery","summary":" The visual-question localized-answering (VQLA) system can serve as a\nknowledgeable assistant in surgical education. Except for providing text-based\nanswers, the VQLA system can highlight the interested region for better\nsurgical scene understanding. However, deep neural networks (DNNs) suffer from\ncatastrophic forgetting when learning new knowledge. Specifically, when DNNs\nlearn on incremental classes or tasks, their performance on old tasks drops\ndramatically. Furthermore, due to medical data privacy and licensing issues, it\nis often difficult to access old data when updating continual learning (CL)\nmodels. Therefore, we develop a non-exemplar continual surgical VQLA framework,\nto explore and balance the rigidity-plasticity trade-off of DNNs in a\nsequential learning paradigm. We revisit the distillation loss in CL tasks, and\npropose rigidity-plasticity-aware distillation (RP-Dist) and self-calibrated\nheterogeneous distillation (SH-Dist) to preserve the old knowledge. The weight\naligning (WA) technique is also integrated to adjust the weight bias between\nold and new tasks. We further establish a CL framework on three public surgical\ndatasets in the context of surgical settings that consist of overlapping\nclasses between old and new surgical VQLA tasks. With extensive experiments, we\ndemonstrate that our proposed method excellently reconciles learning and\nforgetting on the continual surgical VQLA over conventional CL methods. Our\ncode is publicly accessible.\n","authors":["Long Bai","Mobarakol Islam","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2307.12045v1.pdf","comment":"To appear in MICCAI 2023. Code availability:\n https://github.com/longbai1006/CS-VQLA"},{"id":"http://arxiv.org/abs/2307.10443v2","updated":"2023-07-22T08:36:18Z","published":"2023-07-19T20:17:37Z","title":"Integrating a Heterogeneous Graph with Entity-aware Self-attention using\n Relative Position Labels for Reading Comprehension Model","summary":" Despite the significant progress made by transformer models in machine\nreading comprehension tasks, they still fall short in handling complex\nreasoning tasks due to the absence of explicit knowledge in the input sequence.\nTo address this limitation, many recent works have proposed injecting external\nknowledge into the model. However, selecting relevant external knowledge,\nensuring its availability, and requiring additional processing steps remain\nchallenging. In this paper, we introduce a novel attention pattern that\nintegrates reasoning knowledge derived from a heterogeneous graph into the\ntransformer architecture without relying on external knowledge. The proposed\nattention pattern comprises three key elements: global-local attention for word\ntokens, graph attention for entity tokens that exhibit strong attention towards\ntokens connected in the graph as opposed to those unconnected, and the\nconsideration of the type of relationship between each entity token and word\ntoken. 
This results in optimized attention between the two if a relationship\nexists. The pattern is coupled with special relative position labels, allowing\nit to integrate with LUKE's entity-aware self-attention mechanism. The\nexperimental findings corroborate that our model outperforms both the\ncutting-edge LUKE-Graph and the baseline LUKE model on the ReCoRD dataset that\nfocuses on commonsense reasoning.\n","authors":["Shima Foolad","Kourosh Kiani"],"pdf_url":"https://arxiv.org/pdf/2307.10443v2.pdf","comment":"submitted for Knowledge-Based Systems Journal"},{"id":"http://arxiv.org/abs/2307.07851v2","updated":"2023-07-22T07:39:59Z","published":"2023-07-15T17:01:56Z","title":"AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual\n Similarity using Contrastive Learning and Structured Knowledge","summary":" Generic sentence embeddings provide a coarse-grained approximation of\nsemantic textual similarity but ignore specific aspects that make texts\nsimilar. Conversely, aspect-based sentence embeddings provide similarities\nbetween texts based on certain predefined aspects. Thus, similarity predictions\nof texts are more targeted to specific requirements and more easily\nexplainable. In this paper, we present AspectCSE, an approach for aspect-based\ncontrastive learning of sentence embeddings. Results indicate that AspectCSE\nachieves an average improvement of 3.97% on information retrieval tasks across\nmultiple aspects compared to the previous best results. We also propose using\nWikidata knowledge graph properties to train models of multi-aspect sentence\nembeddings in which multiple specific aspects are simultaneously considered\nduring similarity predictions. We demonstrate that multi-aspect embeddings\noutperform single-aspect embeddings on aspect-specific information retrieval\ntasks. Finally, we examine the aspect-based sentence embedding space and\ndemonstrate that embeddings of semantically similar aspect labels are often\nclose, even without explicit similarity training between different aspect\nlabels.\n","authors":["Tim Schopf","Emanuel Gerber","Malte Ostendorff","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.07851v2.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2307.03104v2","updated":"2023-07-22T07:20:00Z","published":"2023-07-06T16:26:34Z","title":"Efficient Domain Adaptation of Sentence Embeddings using Adapters","summary":" Sentence embeddings enable us to capture the semantic similarity of short\ntexts. Most sentence embedding models are trained for general semantic textual\nsimilarity (STS) tasks. Therefore, to use sentence embeddings in a particular\ndomain, the model must be adapted to it in order to achieve good results.\nUsually, this is done by fine-tuning the entire sentence embedding model for\nthe domain of interest. While this approach yields state-of-the-art results,\nall of the model's weights are updated during fine-tuning, making this method\nresource-intensive. Therefore, instead of fine-tuning entire sentence embedding\nmodels for each target domain individually, we propose to train lightweight\nadapters. These domain-specific adapters do not require fine-tuning all\nunderlying sentence embedding model parameters. Instead, we only train a small\nnumber of additional parameters while keeping the weights of the underlying\nsentence embedding model fixed. 
Training domain-specific adapters allows always\nusing the same base model and only exchanging the domain-specific adapters to\nadapt sentence embeddings to a specific domain. We show that using adapters for\nparameter-efficient domain adaptation of sentence embeddings yields competitive\nperformance within 1% of a domain-adapted, entirely fine-tuned sentence\nembedding model while only training approximately 3.6% of the parameters.\n","authors":["Tim Schopf","Dennis Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.03104v2.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2307.11991v1","updated":"2023-07-22T06:21:41Z","published":"2023-07-22T06:21:41Z","title":"Psy-LLM: Scaling up Global Mental Health Psychological Services with\n AI-based Large Language Models","summary":" The demand for psychological counseling has grown significantly in recent\nyears, particularly with the global outbreak of COVID-19, which has heightened\nthe need for timely and professional mental health support. Online\npsychological counseling has emerged as the predominant mode of providing\nservices in response to this demand. In this study, we propose the Psy-LLM\nframework, an AI-based system leveraging Large Language Models (LLMs) for\nquestion-answering in online psychological consultation. Our framework combines\npre-trained LLMs with real-world professional Q&A from psychologists and\nextensively crawled psychological articles. The Psy-LLM framework serves as a\nfront-end tool for healthcare professionals, allowing them to provide immediate\nresponses and mindfulness activities to alleviate patient stress. Additionally,\nit functions as a screening tool to identify urgent cases requiring further\nassistance. We evaluated the framework using intrinsic metrics, such as\nperplexity, and extrinsic evaluation metrics, with human participant\nassessments of response helpfulness, fluency, relevance, and logic. The results\ndemonstrate the effectiveness of the Psy-LLM framework in generating coherent\nand relevant answers to psychological questions. This article concludes by\ndiscussing the potential of large language models to enhance mental health\nsupport through AI technologies in online psychological consultation.\n","authors":["Tin Lai","Yukun Shi","Zicong Du","Jiajie Wu","Ken Fu","Yichao Dou","Ziqi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11984v1","updated":"2023-07-22T05:26:50Z","published":"2023-07-22T05:26:50Z","title":"Learning Vision-and-Language Navigation from YouTube Videos","summary":" Vision-and-language navigation (VLN) requires an embodied agent to navigate\nin realistic 3D environments using natural language instructions. Existing VLN\nmethods suffer from training on small-scale environments or unreasonable\npath-instruction datasets, limiting the generalization to unseen environments.\nThere are massive house tour videos on YouTube, providing abundant real\nnavigation experiences and layout information. However, these videos have not\nbeen explored for VLN before. 
In this paper, we propose to learn an agent from\nthese videos by creating a large-scale dataset which comprises reasonable\npath-instruction pairs from house tour videos and pre-training the agent on it.\nTo achieve this, we have to tackle the challenges of automatically constructing\npath-instruction pairs and exploiting real layout knowledge from raw and\nunlabeled videos. To address these, we first leverage an entropy-based method\nto construct the nodes of a path trajectory. Then, we propose an action-aware\ngenerator for generating instructions from unlabeled trajectories. Last, we\ndevise a trajectory judgment pretext task to encourage the agent to mine the\nlayout knowledge. Experimental results show that our method achieves\nstate-of-the-art performance on two popular benchmarks (R2R and REVERIE). Code\nis available at https://github.com/JeremyLinky/YouTube-VLN\n","authors":["Kunyang Lin","Peihao Chen","Diwei Huang","Thomas H. Li","Mingkui Tan","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.11984v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09998v2","updated":"2023-07-22T04:03:35Z","published":"2023-07-19T14:13:02Z","title":"Generating Mathematical Derivations with Large Language Models","summary":" The derivation of mathematical results in specialised fields using Large\nLanguage Models (LLMs) is an emerging research direction that can help identify\nmodels' limitations, and potentially support mathematical discovery. In this\npaper, we leverage a symbolic engine to generate derivations of equations at\nscale, and investigate the capabilities of LLMs when deriving goal equations\nfrom premises. Specifically, we employ in-context learning for GPT and\nfine-tune a range of T5 models to compare the robustness and generalisation of\npre-training strategies to specialised models. Empirical results show that\nfine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and\nout-of-distribution test sets in terms of absolute performance. However, an\nin-depth analysis reveals that the fine-tuned models are more sensitive to\nperturbations involving unseen symbols and (to a lesser extent) changes to\nequation structure. In addition, we analyse 1.7K equations and over 200\nderivations to highlight common reasoning errors such as the inclusion of\nincorrect, irrelevant, and redundant equations, along with the tendency to skip\nderivation steps. Finally, we explore the suitability of existing metrics for\nevaluating mathematical derivations finding evidence that, while they capture\ngeneral properties such as sensitivity to perturbations, they fail to highlight\nfine-grained reasoning errors and essential differences between models.\nOverall, this work demonstrates that training models on synthetic data can\nimprove their mathematical capabilities beyond larger architectures.\n","authors":["Jordan Meadows","Marco Valentino","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2307.09998v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2302.00102v2","updated":"2023-07-22T02:21:10Z","published":"2023-01-31T21:08:58Z","title":"Detecting Harmful Agendas in News Articles","summary":" Manipulated news online is a growing problem which necessitates the use of\nautomated systems to curtail its spread. 
We argue that while misinformation and\ndisinformation detection have been studied, there has been a lack of investment\nin the important open challenge of detecting harmful agendas in news articles;\nidentifying harmful agendas is critical to flag news campaigns with the\ngreatest potential for real world harm. Moreover, due to real concerns around\ncensorship, harmful agenda detectors must be interpretable to be effective. In\nthis work, we propose this new task and release a dataset, NewsAgendas, of\nannotated news articles for agenda identification. We show how interpretable\nsystems can be effective on this task and demonstrate that they can perform\ncomparably to black-box models.\n","authors":["Melanie Subbiah","Amrita Bhattacharjee","Yilun Hua","Tharindu Kumarage","Huan Liu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2302.00102v2.pdf","comment":"Camera-ready for ACL-WASSA 2023"},{"id":"http://arxiv.org/abs/2307.08074v2","updated":"2023-07-22T00:11:24Z","published":"2023-07-16T15:18:25Z","title":"Disco-Bench: A Discourse-Aware Evaluation Benchmark for Language\n Modelling","summary":" Modeling discourse -- the linguistic phenomena that go beyond individual\nsentences, is a fundamental yet challenging aspect of natural language\nprocessing (NLP). However, existing evaluation benchmarks primarily focus on\nthe evaluation of inter-sentence properties and overlook critical discourse\nphenomena that cross sentences. To bridge the gap, we propose Disco-Bench, a\nbenchmark that can evaluate intra-sentence discourse properties across a\ndiverse set of NLP tasks, covering understanding, translation, and generation.\nDisco-Bench consists of 9 document-level testsets in the literature domain,\nwhich contain rich discourse phenomena (e.g. cohesion and coherence) in Chinese\nand/or English. For linguistic analysis, we also design a diagnostic test suite\nthat can examine whether the target models learn discourse knowledge. We\ntotally evaluate 20 general-, in-domain and commercial models based on\nTransformer, advanced pretraining architectures and large language models\n(LLMs). Our results show (1) the challenge and necessity of our evaluation\nbenchmark; (2) fine-grained pretraining based on literary document-level\ntraining data consistently improves the modeling of discourse information. We\nwill release the datasets, pretrained models, and leaderboard, which we hope\ncan significantly facilitate research in this field:\nhttps://github.com/longyuewangdcu/Disco-Bench.\n","authors":["Longyue Wang","Zefeng Du","Donghuai Liu","Deng Cai","Dian Yu","Haiyun Jiang","Yan Wang","Leyang Cui","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2307.08074v2.pdf","comment":"Zhaopeng Tu is the corresponding author"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2212.05767v7","updated":"2023-07-22T13:05:21Z","published":"2022-12-12T08:40:04Z","title":"A Survey of Knowledge Graph Reasoning on Graph Types: Static, Dynamic,\n and Multimodal","summary":" Knowledge graph reasoning (KGR), aiming to deduce new facts from existing\nfacts based on mined logic rules underlying knowledge graphs (KGs), has become\na fast-growing research direction. It has been proven to significantly benefit\nthe usage of KGs in many AI applications, such as question answering,\nrecommendation systems, and etc. According to the graph types, existing KGR\nmodels can be roughly divided into three categories, i.e., static models,\ntemporal models, and multi-modal models. 
Early works in this domain mainly\nfocus on static KGR, and recent works try to leverage the temporal and\nmulti-modal information, which are more practical and closer to real-world.\nHowever, no survey papers and open-source repositories comprehensively\nsummarize and discuss models in this important direction. To fill the gap, we\nconduct a first survey for knowledge graph reasoning tracing from static to\ntemporal and then to multi-modal KGs. Concretely, the models are reviewed based\non bi-level taxonomy, i.e., top-level (graph types) and base-level (techniques\nand scenarios). Besides, the performances, as well as datasets, are summarized\nand presented. Moreover, we point out the challenges and potential\nopportunities to enlighten the readers. The corresponding open-source\nrepository is shared on GitHub\nhttps://github.com/LIANGKE23/Awesome-Knowledge-Graph-Reasoning.\n","authors":["Ke Liang","Lingyuan Meng","Meng Liu","Yue Liu","Wenxuan Tu","Siwei Wang","Sihang Zhou","Xinwang Liu","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2212.05767v7.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2307.12034v1","updated":"2023-07-22T10:03:32Z","published":"2023-07-22T10:03:32Z","title":"Conformal Group Recommender System","summary":" Group recommender systems (GRS) are critical in discovering relevant items\nfrom a near-infinite inventory based on group preferences rather than\nindividual preferences, like recommending a movie, restaurant, or tourist\ndestination to a group of individuals. The traditional models of group\nrecommendation are designed to act like a black box with a strict focus on\nimproving recommendation accuracy, and most often, they place the onus on the\nusers to interpret recommendations. In recent years, the focus of Recommender\nSystems (RS) research has shifted away from merely improving recommendation\naccuracy towards value additions such as confidence and explanation. In this\nwork, we propose a conformal prediction framework that provides a measure of\nconfidence with prediction in conjunction with a group recommender system to\naugment the system-generated plain recommendations. In the context of group\nrecommender systems, we propose various nonconformity measures that play a\nvital role in the efficiency of the conformal framework. We also show that\ndefined nonconformity satisfies the exchangeability property. Experimental\nresults demonstrate the effectiveness of the proposed approach over several\nbenchmark datasets. Furthermore, our proposed approach also satisfies validity\nand efficiency properties.\n","authors":["Venkateswara Rao Kagita","Anshuman Singh","Vikas Kumar","Pavan Kalyan Reddy Neerudu","Arun K Pujari","Rohit Kumar Bondugula"],"pdf_url":"https://arxiv.org/pdf/2307.12034v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2307.12019v1","updated":"2023-07-22T08:38:14Z","published":"2023-07-22T08:38:14Z","title":"XWalk: Random Walk Based Candidate Retrieval for Product Search","summary":" In e-commerce, head queries account for the vast majority of gross\nmerchandise sales and improvements to head queries are highly impactful to the\nbusiness. While most supervised approaches to search perform better in head\nqueries vs. tail queries, we propose a method that further improves head query\nperformance dramatically. 
We propose XWalk, a random-walk based graph approach\nto candidate retrieval for product search that borrows from recommendation\nsystem techniques. XWalk is highly efficient to train and inference in a\nlarge-scale high traffic e-commerce setting, and shows substantial improvements\nin head query performance over state-of-the-art neural retreivers. Ensembling\nXWalk with a neural and/or lexical retriever combines the best of both worlds\nand the resulting retrieval system outperforms all other methods in both\noffline relevance-based evaluation and in online A/B tests.\n","authors":["Jon Eskreis-Winkler","Yubin Kim","Andrew Stanton"],"pdf_url":"https://arxiv.org/pdf/2307.12019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07145v4","updated":"2023-07-22T07:54:02Z","published":"2023-04-14T14:13:37Z","title":"EvalRS 2023. Well-Rounded Recommender Systems For Real-World Deployments","summary":" EvalRS aims to bring together practitioners from industry and academia to\nfoster a debate on rounded evaluation of recommender systems, with a focus on\nreal-world impact across a multitude of deployment scenarios. Recommender\nsystems are often evaluated only through accuracy metrics, which fall short of\nfully characterizing their generalization capabilities and miss important\naspects, such as fairness, bias, usefulness, informativeness. This workshop\nbuilds on the success of last year's workshop at CIKM, but with a broader scope\nand an interactive format.\n","authors":["Federico Bianchi","Patrick John Chia","Ciro Greco","Claudio Pomo","Gabriel Moreira","Davide Eynard","Fahd Husain","Jacopo Tagliabue"],"pdf_url":"https://arxiv.org/pdf/2304.07145v4.pdf","comment":"EvalRS 2023 is a workshop at KDD23. Code and hackathon materials:\n https://github.com/RecList/evalRS-KDD-2023"},{"id":"http://arxiv.org/abs/2307.11994v1","updated":"2023-07-22T06:44:28Z","published":"2023-07-22T06:44:28Z","title":"HTP: Exploiting Holistic Temporal Patterns for Sequential Recommendation","summary":" Sequential recommender systems have demonstrated a huge success for next-item\nrecommendation by explicitly exploiting the temporal order of users' historical\ninteractions. In practice, user interactions contain more useful temporal\ninformation beyond order, as shown by some pioneering studies. In this paper,\nwe systematically investigate various temporal information for sequential\nrecommendation and identify three types of advantageous temporal patterns\nbeyond order, including absolute time information, relative item time intervals\nand relative recommendation time intervals. We are the first to explore\nitem-oriented absolute time patterns. While existing models consider only one\nor two of these three patterns, we propose a novel holistic temporal pattern\nbased neural network, named HTP, to fully leverage all these three patterns. In\nparticular, we introduce novel components to address the subtle correlations\nbetween relative item time intervals and relative recommendation time\nintervals, which render a major technical challenge. Extensive experiments on\nthree real-world benchmark datasets show that our HTP model consistently and\nsubstantially outperforms many state-of-the-art models. 
Our code is publically\navailable at https://github.com/623851394/HTP/tree/main/HTP-main\n","authors":["Chen Rui","Liang Guotao","Ma Chenrui","Han Qilong","Li Li","Huang Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11981v1","updated":"2023-07-22T04:52:27Z","published":"2023-07-22T04:52:27Z","title":"Collaborative Graph Neural Networks for Attributed Network Embedding","summary":" Graph neural networks (GNNs) have shown prominent performance on attributed\nnetwork embedding. However, existing efforts mainly focus on exploiting network\nstructures, while the exploitation of node attributes is rather limited as they\nonly serve as node features at the initial layer. This simple strategy impedes\nthe potential of node attributes in augmenting node connections, leading to\nlimited receptive field for inactive nodes with few or even no neighbors.\nFurthermore, the training objectives (i.e., reconstructing network structures)\nof most GNNs also do not include node attributes, although studies have shown\nthat reconstructing node attributes is beneficial. Thus, it is encouraging to\ndeeply involve node attributes in the key components of GNNs, including graph\nconvolution operations and training objectives. However, this is a nontrivial\ntask since an appropriate way of integration is required to maintain the merits\nof GNNs. To bridge the gap, in this paper, we propose COllaborative graph\nNeural Networks--CONN, a tailored GNN architecture for attribute network\nembedding. It improves model capacity by 1) selectively diffusing messages from\nneighboring nodes and involved attribute categories, and 2) jointly\nreconstructing node-to-node and node-to-attribute-category interactions via\ncross-correlation. Experiments on real-world networks demonstrate that CONN\nexcels state-of-the-art embedding algorithms with a great margin.\n","authors":["Qiaoyu Tan","Xin Zhang","Xiao Huang","Hao Chen","Jundong Li","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2307.11981v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.12152v1","updated":"2023-07-22T19:52:04Z","published":"2023-07-22T19:52:04Z","title":"Real-Time Neural Video Recovery and Enhancement on Mobile Devices","summary":" As mobile devices become increasingly popular for video streaming, it's\ncrucial to optimize the streaming experience for these devices. Although deep\nlearning-based video enhancement techniques are gaining attention, most of them\ncannot support real-time enhancement on mobile devices. Additionally, many of\nthese techniques are focused solely on super-resolution and cannot handle\npartial or complete loss or corruption of video frames, which is common on the\nInternet and wireless networks.\n To overcome these challenges, we present a novel approach in this paper. Our\napproach consists of (i) a novel video frame recovery scheme, (ii) a new\nsuper-resolution algorithm, and (iii) a receiver enhancement-aware video bit\nrate adaptation algorithm. We have implemented our approach on an iPhone 12,\nand it can support 30 frames per second (FPS). We have evaluated our approach\nin various networks such as WiFi, 3G, 4G, and 5G networks. 
Our evaluation shows\nthat our approach enables real-time enhancement and results in a significant\nincrease in video QoE (Quality of Experience) of 24\\% - 82\\% in our video\nstreaming system.\n","authors":["Zhaoyuan He","Yifan Yang","Lili Qiu","Kyoungjun Park"],"pdf_url":"https://arxiv.org/pdf/2307.12152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07847v2","updated":"2023-07-22T19:51:27Z","published":"2023-07-15T16:45:01Z","title":"Neural Video Recovery for Cloud Gaming","summary":" Cloud gaming is a multi-billion dollar industry. A client in cloud gaming\nsends its movement to the game server on the Internet, which renders and\ntransmits the resulting video back. In order to provide a good gaming\nexperience, a latency below 80 ms is required. This means that video rendering,\nencoding, transmission, decoding, and display have to finish within that time\nframe, which is especially challenging to achieve due to server overload,\nnetwork congestion, and losses. In this paper, we propose a new method for\nrecovering lost or corrupted video frames in cloud gaming. Unlike traditional\nvideo frame recovery, our approach uses game states to significantly enhance\nrecovery accuracy and utilizes partially decoded frames to recover lost\nportions. We develop a holistic system that consists of (i) efficiently\nextracting game states, (ii) modifying H.264 video decoder to generate a mask\nto indicate which portions of video frames need recovery, and (iii) designing a\nnovel neural network to recover either complete or partial video frames. Our\napproach is extensively evaluated using iPhone 12 and laptop implementations,\nand we demonstrate the utility of game states in the game video recovery and\nthe effectiveness of our overall design.\n","authors":["Zhaoyuan He","Yifan Yang","Shuozhe Li","Diyuan Dai","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.07847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16181v2","updated":"2023-07-22T13:59:13Z","published":"2023-06-28T13:03:43Z","title":"Learning to Pan-sharpening with Memories of Spatial Details","summary":" Pan-sharpening, as one of the most commonly used techniques in remote sensing\nsystems, aims to inject spatial details from panchromatic images into\nmultispectral images (MS) to obtain high-resolution multispectral images. Since\ndeep learning has received widespread attention because of its powerful fitting\nability and efficient feature extraction, a variety of pan-sharpening methods\nhave been proposed to achieve remarkable performance. However, current\npan-sharpening methods usually require the paired panchromatic (PAN) and MS\nimages as input, which limits their usage in some scenarios. To address this\nissue, in this paper we observe that the spatial details from PAN images are\nmainly high-frequency cues, i.e., the edges reflect the contour of input PAN\nimages. This motivates us to develop a PAN-agnostic representation to store\nsome base edges, so as to compose the contour for the corresponding PAN image\nvia them. As a result, we can perform the pan-sharpening task with only the MS\nimage when inference. 
To this end, a memory-based network is adapted to extract\nand memorize the spatial details during the training phase and is used to\nreplace the process of obtaining spatial information from PAN images when\ninference, which is called Memory-based Spatial Details Network (MSDN).\nFinally, we integrate the proposed MSDN module into the existing deep\nlearning-based pan-sharpening methods to achieve an end-to-end pan-sharpening\nnetwork. With extensive experiments on the Gaofen1 and WorldView-4 satellites,\nwe verify that our method constructs good spatial details without PAN images\nand achieves the best performance. The code is available at\nhttps://github.com/Zhao-Tian-yi/Learning-to-Pan-sharpening-with-Memories-of-Spatial-Details.git.\n","authors":["Maoxun Yuan","Tianyi Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2306.16181v2.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + 
--color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + 
+.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..1033a251 --- /dev/null +++ b/index.html @@ -0,0 +1,77876 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 38 + +
+
+
+ + ☆ 3D-LLM: Injecting the 3D World into Large Language Models + + +
+ Large language models (LLMs) and Vision-Language Models (VLMs) have been +proven to excel at multiple tasks, such as commonsense reasoning. Powerful as +these models can be, they are not grounded in the 3D physical world, which +involves richer concepts such as spatial relationships, affordances, physics, +layout, and so on. In this work, we propose to inject the 3D world into large +language models and introduce a whole new family of 3D-LLMs. Specifically, +3D-LLMs can take 3D point clouds and their features as input and perform a +diverse set of 3D-related tasks, including captioning, dense captioning, 3D +question answering, task decomposition, 3D grounding, 3D-assisted dialog, +navigation, and so on. Using three types of prompting mechanisms that we +design, we are able to collect over 300k 3D-language data covering these tasks. +To efficiently train 3D-LLMs, we first utilize a 3D feature extractor that +obtains 3D features from rendered multi- view images. Then, we use 2D VLMs as +our backbones to train our 3D-LLMs. By introducing a 3D localization mechanism, +3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show +that our model outperforms state-of-the-art baselines by a large margin (e.g., +the BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore, +experiments on our held-in datasets for 3D captioning, task composition, and +3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative +examples also show that our model could perform more tasks beyond the scope of +existing LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/. + +
+
+ comment: Project Page: https://vis-www.cs.umass.edu/3dllm/ +
+
+
+
+
+ + ☆ Evaluating the Ripple Effects of Knowledge Editing in Language Models + + +
+ Modern language models capture a large body of factual knowledge. However, +some facts can be incorrectly induced or become obsolete over time, resulting +in factually incorrect generations. This has led to the development of various +editing methods that allow updating facts encoded by the model. Evaluation of +these methods has primarily focused on testing whether an individual fact has +been successfully injected, and if similar predictions for other subjects have +not changed. Here we argue that such evaluation is limited, since injecting one +fact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple +effect'' in the form of additional facts that the model needs to update +(e.g.``Jack Depp is the sibling of Lily-Rose Depp''). To address this issue, we +propose a novel set of evaluation criteria that consider the implications of an +edit on related facts. Using these criteria, we then construct \ripple{}, a +diagnostic benchmark of 5K factual edits, capturing a variety of types of +ripple effects. We evaluate prominent editing methods on \ripple{}, showing +that current methods fail to introduce consistent changes in the model's +knowledge. In addition, we find that a simple in-context editing baseline +obtains the best scores on our benchmark, suggesting a promising research +direction for model editing. + +
+
+
+
+
+ + ☆ Leveraging Label Variation in Large Language Models for Zero-Shot Text + Classification + + +
+ The zero-shot learning capabilities of large language models (LLMs) make them +ideal for text classification without annotation or supervised training. Many +studies have shown impressive results across multiple tasks. While tasks, data, +and results differ widely, their similarities to human annotation can aid us in +tackling new tasks with minimal expenses. We evaluate using 5 state-of-the-art +LLMs as "annotators" on 5 different tasks (age, gender, topic, sentiment +prediction, and hate speech detection), across 4 languages: English, French, +German, and Spanish. No single model excels at all tasks, across languages, or +across all labels within a task. However, aggregation techniques designed for +human annotators perform substantially better than any one individual model. +Overall, though, LLMs do not rival even simple supervised models, so they do +not (yet) replace the need for human annotation. We also discuss the tradeoffs +between speed, accuracy, cost, and bias when it comes to aggregated model +labeling versus human annotation. + +
+
+
+
+
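The entry above reports that aggregation techniques designed for human annotators outperform any single LLM "annotator". As a minimal, hedged sketch of one such scheme (plain majority voting; the model outputs and label set below are invented placeholders, not the paper's data):

```python
from collections import Counter

def majority_vote(labels_per_model):
    """Aggregate one label per LLM 'annotator' into a single label by majority vote.

    Ties are broken by the order in which labels first appear in the input.
    """
    return Counter(labels_per_model).most_common(1)[0][0]

# Hypothetical sentiment predictions from five LLM annotators for one item.
predictions = ["positive", "negative", "positive", "positive", "neutral"]
print(majority_vote(predictions))  # -> "positive"
```

More elaborate annotator-aggregation models (e.g. weighting annotators by estimated reliability) follow the same interface: several noisy labels in, one consolidated label out.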
+ + ☆ Aligning Large Language Models with Human: A Survey + + +
+ Large Language Models (LLMs) trained on extensive textual corpora have +emerged as leading solutions for a broad array of Natural Language Processing +(NLP) tasks. Despite their notable performance, these models are prone to +certain limitations such as misunderstanding human instructions, generating +potentially biased content, or factually incorrect (hallucinated) information. +Hence, aligning LLMs with human expectations has become an active area of +interest within the research community. This survey presents a comprehensive +overview of these alignment technologies, including the following aspects. (1) +Data collection: the methods for effectively collecting high-quality +instructions for LLM alignment, including the use of NLP benchmarks, human +annotations, and leveraging strong LLMs. (2) Training methodologies: a detailed +review of the prevailing training methods employed for LLM alignment. Our +exploration encompasses Supervised Fine-tuning, both Online and Offline human +preference training, along with parameter-efficient training mechanisms. (3) +Model Evaluation: the methods for evaluating the effectiveness of these +human-aligned LLMs, presenting a multifaceted approach towards their +assessment. In conclusion, we collate and distill our findings, shedding light +on several promising future research avenues in the field. This survey, +therefore, serves as a valuable resource for anyone invested in understanding +and advancing the alignment of LLMs to better suit human-oriented tasks and +expectations. An associated GitHub link collecting the latest papers is +available at https://github.com/GaryYufei/AlignLLMHumanSurvey. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ RLCD: Reinforcement Learning from Contrast Distillation for Language + Model Alignment + + +
+ We propose Reinforcement Learning from Contrast Distillation (RLCD), a method +for aligning language models to follow natural language principles without +using human feedback. RLCD trains a preference model using simulated preference +pairs that contain both a high-quality and low-quality example, generated using +contrasting positive and negative prompts. The preference model is then used to +improve a base unaligned language model via reinforcement learning. +Empirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context +distillation (Huang et al., 2022) baselines across three diverse alignment +tasks--harmlessness, helpfulness, and story outline generation--and on both 7B +and 30B model scales for preference data simulation. + +
+
+
+
+
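To make the data-simulation step described above more concrete, the sketch below builds one (chosen, rejected) preference pair by sampling completions under contrasting positive and negative prompts. The `generate` function and the prompt wording are stand-in assumptions for illustration, not RLCD's actual implementation.

```python
def generate(prompt: str) -> str:
    """Placeholder for a language-model sampling call (assumed, not RLCD's code)."""
    return f"[completion conditioned on: {prompt}]"

def simulate_preference_pair(instruction: str) -> dict:
    """Create one simulated preference pair from contrasting prompts.

    The positively framed prompt is assumed to yield the preferred response,
    the negatively framed prompt the dispreferred one.
    """
    positive_prompt = f"(give a helpful, harmless response) {instruction}"
    negative_prompt = f"(give an unhelpful, harmful response) {instruction}"
    return {
        "prompt": instruction,
        "chosen": generate(positive_prompt),
        "rejected": generate(negative_prompt),
    }

pair = simulate_preference_pair("How do I politely decline a meeting?")
print(pair["chosen"])
print(pair["rejected"])
```

A preference model trained on many such pairs can then score new responses and drive the reinforcement-learning step.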
+ + ☆ Boosting Punctuation Restoration with Data Generation and Reinforcement + Learning INTERSPEECH 2023 + + +
+ Punctuation restoration is an important task in automatic speech recognition +(ASR) which aims to restore the syntactic structure of generated ASR texts to +improve readability. While punctuated texts are abundant from written +documents, the discrepancy between written punctuated texts and ASR texts +limits the usability of written texts in training punctuation restoration +systems for ASR texts. This paper proposes a reinforcement learning method to +exploit in-topic written texts and recent advances in large pre-trained +generative language models to bridge this gap. The experiments show that our +method achieves state-of-the-art performance on the ASR test set on two +benchmark datasets for punctuation restoration. +
+
+ comment: Accepted at INTERSPEECH 2023, 6 pages +
+
+
+
+
+ + ☆ Rule By Example: Harnessing Logical Rules for Explainable Hate Speech + Detection ACL 2023 + + +
+ Classic approaches to content moderation typically apply a rule-based +heuristic approach to flag content. While rules are easily customizable and +intuitive for humans to interpret, they are inherently fragile and lack the +flexibility or robustness needed to moderate the vast amount of undesirable +content found online today. Recent advances in deep learning have demonstrated +the promise of using highly effective deep neural models to overcome these +challenges. However, despite the improved performance, these data-driven models +lack transparency and explainability, often leading to mistrust from everyday +users and a lack of adoption by many platforms. In this paper, we present Rule +By Example (RBE): a novel exemplar-based contrastive learning approach for +learning from logical rules for the task of textual content moderation. RBE is +capable of providing rule-grounded predictions, allowing for more explainable +and customizable predictions compared to typical deep learning-based +approaches. We demonstrate that our approach is capable of learning rich rule +embedding representations using only a few data examples. Experimental results +on 3 popular hate speech classification datasets show that RBE is able to +outperform state-of-the-art deep learning classifiers as well as the use of +rules in both supervised and unsupervised settings while providing explainable +model predictions via rule-grounding. + +
+
+ comment: ACL 2023 Main Conference +
+
+
+
+
+ + ☆ Corrections of Zipf's and Heaps' Laws Derived from Hapax Rate Models + + +
+ The article introduces corrections to Zipf's and Heaps' laws based on +systematic models of the hapax rate. The derivation rests on two assumptions: +The first one is the standard urn model which predicts that marginal frequency +distributions for shorter texts look as if word tokens were sampled blindly +from a given longer text. The second assumption posits that the rate of hapaxes +is a simple function of the text size. Four such functions are discussed: the +constant model, the Davis model, the linear model, and the logistic model. It +is shown that the logistic model yields the best fit. + +
+
+ comment: 41 pages, 7 figures, 3 tables +
+
+
+
+
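The abstract above names four candidate hapax-rate models (constant, Davis, linear, logistic) and reports that the logistic one fits best. As a small illustrative sketch, assuming scipy is available, one can fit a generic logistic curve of hapax rate against text size; the token counts below are invented and the exact parameterisation in the article may differ.

```python
import numpy as np
from scipy.optimize import curve_fit

def logistic_hapax_rate(n, a, b, c):
    """Generic logistic model of the hapax rate as a function of text size n."""
    return a / (1.0 + np.exp(b * (np.log(n) - c)))

# Invented (text size, observed hapax rate) pairs for illustration only.
sizes = np.array([1e3, 1e4, 1e5, 1e6, 1e7])
rates = np.array([0.62, 0.55, 0.47, 0.38, 0.30])

params, _ = curve_fit(logistic_hapax_rate, sizes, rates, p0=[0.65, 0.3, 15.0])
print("fitted parameters:", params)
print("predicted hapax rate at n = 1e8:", logistic_hapax_rate(1e8, *params))
```

The constant and linear variants drop into the same `curve_fit` call with a different model function, which is what makes the comparison of the four models straightforward.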
+ + ☆ A Real-World WebAgent with Planning, Long Context Understanding, and + Program Synthesis + + +
+ Pre-trained large language models (LLMs) have recently achieved better +generalization and sample efficiency in autonomous web navigation. However, the +performance on real-world websites has still suffered from (1) open domainness, +(2) limited context length, and (3) lack of inductive bias on HTML. We +introduce WebAgent, an LLM-driven agent that can complete the tasks on real +websites following natural language instructions. WebAgent plans ahead by +decomposing instructions into canonical sub-instructions, summarizes long HTML +documents into task-relevant snippets, and acts on websites via generated +Python programs from those. We design WebAgent with Flan-U-PaLM, for grounded +code generation, and HTML-T5, new pre-trained LLMs for long HTML documents +using local and global attention mechanisms and a mixture of long-span +denoising objectives, for planning and summarization. We empirically +demonstrate that our recipe improves the success on a real website by over 50%, +and that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9% +higher success rate than prior SoTA on the MiniWoB web navigation benchmark and +better accuracy on offline task planning evaluation. + +
+
+
+
+
+ + ☆ Joint Dropout: Improving Generalizability in Low-Resource Neural Machine + Translation through Phrase Pair Variables + + +
+ Despite the tremendous success of Neural Machine Translation (NMT), its +performance on low-resource language pairs still remains subpar, partly due to +the limited ability to handle previously unseen inputs, i.e., generalization. +In this paper, we propose a method called Joint Dropout that addresses the +challenge of low-resource neural machine translation by substituting phrases +with variables, resulting in significant enhancement of compositionality, which +is a key aspect of generalization. We observe a substantial improvement in +translation quality for language pairs with minimal resources, as seen in BLEU +and Direct Assessment scores. Furthermore, we conduct an error analysis, and +find Joint Dropout to also enhance the generalizability of low-resource NMT in +terms of robustness and adaptability across different domains. +
+
+ comment: Accepted at MT Summit 2023 +
+
+
+
+
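The core idea above, replacing aligned source/target phrases with shared variable tokens, can be sketched as a toy substitution step. The phrase table and variable naming below are made-up illustrations and do not reproduce the paper's actual alignment or selection procedure.

```python
def substitute_phrase_pairs(src: str, tgt: str, phrase_pairs):
    """Replace aligned source/target phrases with shared variable tokens.

    phrase_pairs: list of (source_phrase, target_phrase) assumed to be aligned.
    """
    for i, (sp, tp) in enumerate(phrase_pairs):
        var = f"<X{i}>"
        if sp in src and tp in tgt:
            src = src.replace(sp, var)
            tgt = tgt.replace(tp, var)
    return src, tgt

# Hypothetical German-English pair with one aligned phrase.
src = "ich habe den roten Apfel gegessen"
tgt = "i ate the red apple"
print(substitute_phrase_pairs(src, tgt, [("den roten Apfel", "the red apple")]))
# -> ('ich habe <X0> gegessen', 'i ate <X0>')
```

Training on such variable-abstracted pairs alongside the originals is what encourages the compositional behaviour the abstract describes.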
+ + ☆ Guidance in Radiology Report Summarization: An Empirical Evaluation and + Error Analysis + + +
+ Automatically summarizing radiology reports into a concise impression can +reduce the manual burden of clinicians and improve the consistency of +reporting. Previous work aimed to enhance content selection and factuality +through guided abstractive summarization. However, two key issues persist. +First, current methods heavily rely on domain-specific resources to extract the +guidance signal, limiting their transferability to domains and languages where +those resources are unavailable. Second, while automatic metrics like ROUGE +show progress, we lack a good understanding of the errors and failure modes in +this task. To bridge these gaps, we first propose a domain-agnostic guidance +signal in the form of variable-length extractive summaries. Our empirical results +on two English benchmarks demonstrate that this guidance signal improves upon +unguided summarization while being competitive with domain-specific methods. +Additionally, we run an expert evaluation of four systems according to a +taxonomy of 11 fine-grained errors. We find that the most pressing differences +between automatic summaries and those of radiologists relate to content +selection, including omissions (up to 52%) and additions (up to 57%). We +hypothesize that latent reporting factors and corpus-level inconsistencies may +prevent models from reliably learning content selection from the available data, +presenting promising directions for future work. +
+
+ comment: Accepted at INLG2023 +
+
+
+
+
+ + ☆ RRAML: Reinforced Retrieval Augmented Machine Learning + + +
+ The emergence of large language models (LLMs) has revolutionized machine +learning and related fields, showcasing remarkable abilities in comprehending, +generating, and manipulating human language. However, their conventional usage +through API-based text prompt submissions imposes certain limitations in terms +of context constraints and external source availability. To address these +challenges, we propose a novel framework called Reinforced Retrieval Augmented +Machine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs +with supporting information retrieved by a purpose-built retriever from a vast +user-provided database. By leveraging recent advancements in reinforcement +learning, our method effectively addresses several critical challenges. +Firstly, it circumvents the need for accessing LLM gradients. Secondly, our +method alleviates the burden of retraining LLMs for specific tasks, as it is +often impractical or impossible due to restricted access to the model and the +computational intensity involved. Additionally we seamlessly link the +retriever's task with the reasoner, mitigating hallucinations and reducing +irrelevant, and potentially damaging retrieved documents. We believe that the +research agenda outlined in this paper has the potential to profoundly impact +the field of AI, democratizing access to and utilization of LLMs for a wide +range of entities. + +
+
+
+
+
+ + ☆ Code-Switched Urdu ASR for Noisy Telephonic Environment using Data + Centric Approach with Hybrid HMM and CNN-TDNN + + +
+ Call centers have a huge amount of audio data which can be used for achieving +valuable business insights, and transcription of phone calls is a manually tedious +task. An effective Automated Speech Recognition system can accurately +transcribe these calls for easy search through call history for specific +context and content, allowing automatic call monitoring and improving QoS through +keyword search and sentiment analysis. ASR for call centers requires more +robustness, as telephonic environments are generally noisy. Moreover, there are +many low-resourced languages on the verge of extinction which can be +preserved with the help of Automatic Speech Recognition technology. Urdu is the +$10^{th}$ most widely spoken language in the world, with 231,295,440 speakers worldwide, +yet it remains a resource-constrained language in ASR. Regional call-center +conversations operate in the local language, with a mix of English numbers and +technical terms generally causing a "code-switching" problem. Hence, this paper +describes an implementation framework for a resource-efficient Automatic Speech +Recognition/Speech-to-Text system in a noisy call-center environment using +Chain Hybrid HMM and CNN-TDNN for code-switched Urdu. Using a hybrid +HMM-DNN approach allowed us to utilize the advantages of neural networks with +less labelled data. Adding a CNN to the TDNN has been shown to work better in noisy +environments due to the CNN's additional frequency dimension, which captures extra +information from noisy speech, thus improving accuracy. We collected data from +various open sources and labelled some of the unlabelled data after analysing +its general context and content, drawing on the Urdu language as well as commonly +used words from other languages, primarily English, and were able to achieve a WER +of 5.2% in both noisy and clean environments on isolated words or numbers as +well as on continuous spontaneous speech. +
+
+ comment: 32 pages, 19 figures, 2 tables, preprint +
+
+
+
+
+ + ☆ A Model for Every User and Budget: Label-Free and Personalized + Mixed-Precision Quantization INTERSPEECH 2023 + + +
+ Recent advances in Automatic Speech Recognition (ASR) have produced large +AI models, which become impractical for deployment on mobile devices. Model +quantization is effective for producing compressed general-purpose models; however, +such models may only be deployed to a restricted sub-domain of interest. We +show that ASR models can be personalized during quantization while relying on +just a small set of unlabelled samples from the target domain. To this end, we +propose myQASR, a mixed-precision quantization method that generates tailored +quantization schemes for diverse users under any memory requirement with no +fine-tuning. myQASR automatically evaluates the quantization sensitivity of +network layers by analysing the full-precision activation values. We are then +able to generate a personalised mixed-precision quantization scheme for any +pre-determined memory budget. Results for large-scale ASR models show how +myQASR improves performance for specific genders, languages, and speakers. +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
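The sensitivity-then-budget idea described above can be roughly sketched as follows: layers whose full-precision activations show more spread are kept at higher precision, and bit-widths are lowered greedily until a memory budget is met. The sensitivity proxy (activation standard deviation), the greedy allocation, and all sizes below are illustrative assumptions, not the method's actual criteria.

```python
import numpy as np

def assign_bitwidths(activations, layer_params, budget_bits, widths=(8, 6, 4)):
    """Greedy mixed-precision assignment under a total parameter-memory budget.

    activations: dict layer -> sample of full-precision activation values.
    layer_params: dict layer -> number of parameters in that layer.
    budget_bits: total parameter-memory budget in bits.
    """
    # Crude sensitivity proxy: spread of the observed activation values.
    sensitivity = {name: float(np.std(act)) for name, act in activations.items()}
    bits = {name: widths[0] for name in activations}  # start at highest precision

    def total_bits():
        return sum(layer_params[n] * bits[n] for n in bits)

    # Lower the precision of the least sensitive layers first until we fit.
    for name in sorted(sensitivity, key=sensitivity.get):
        for w in widths[1:]:
            if total_bits() <= budget_bits:
                return bits
            bits[name] = w
    return bits

rng = np.random.default_rng(0)
acts = {"enc.0": rng.normal(0, 1.5, 1000), "enc.1": rng.normal(0, 0.3, 1000)}
params = {"enc.0": 1_000_000, "enc.1": 1_000_000}
print(assign_bitwidths(acts, params, budget_bits=12_000_000))
```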
+ + ☆ Fake News Detection Through Graph-based Neural Networks: A Survey + + +
+ The popularity of online social networks has enabled rapid dissemination of +information. People now can share and consume information much more rapidly +than ever before. However, low-quality and/or accidentally/deliberately fake +information can also spread rapidly. This can lead to considerable and negative +impacts on society. Identifying, labelling and debunking online misinformation +as early as possible has become an increasingly urgent problem. Many methods +have been proposed to detect fake news including many deep learning and +graph-based approaches. In recent years, graph-based methods have yielded +strong results, as they can closely model the social context and propagation +process of online news. In this paper, we present a systematic review of fake +news detection studies based on graph-based and deep learning-based techniques. +We classify existing graph-based methods into knowledge-driven methods, +propagation-based methods, and heterogeneous social context-based methods, +depending on how a graph structure is constructed to model news related +information flows. We further discuss the challenges and open problems in +graph-based fake news detection and identify future research directions. + +
+
+ comment: 18 pages, 3 tables, 7 figures +
+
+
+
+
+ + ☆ Tachikuma: Understanding Complex Interactions with Multi-Character and + Novel Objects by Large Language Models + + +
+ Recent advancements in natural language and Large Language Models (LLMs) have +enabled AI agents to simulate human-like interactions within virtual worlds. +However, these interactions still face limitations in complexity and +flexibility, particularly in scenarios involving multiple characters and novel +objects. Pre-defining all interactable objects in the agent's world model +presents challenges, and conveying implicit intentions to multiple characters +through complex interactions remains difficult. To address these issues, we +propose integrating virtual Game Masters (GMs) into the agent's world model, +drawing inspiration from Tabletop Role-Playing Games (TRPGs). GMs play a +crucial role in overseeing information, estimating players' intentions, +providing environment descriptions, and offering feedback, compensating for +current world model deficiencies. To facilitate future explorations for complex +interactions, we introduce a benchmark named Tachikuma, comprising a Multiple +character and novel Object based interaction Estimation (MOE) task and a +supporting dataset. MOE challenges models to understand characters' intentions +and accurately determine their actions within intricate contexts involving +multi-character and novel object interactions. Besides, the dataset captures +log data from real-time communications during gameplay, providing diverse, +grounded, and complex interactions for further explorations. Finally, we +present a simple prompting baseline and evaluate its performance, demonstrating +its effectiveness in enhancing interaction understanding. We hope that our +dataset and task will inspire further research in complex interactions with +natural language, fostering the development of more advanced AI agents. + +
+
+ comment: Preliminary version of an ongoing work +
+
+
+
+
+ + ☆ Towards Generalising Neural Topical Representations + + +
+ Topic models have evolved from conventional Bayesian probabilistic models to +Neural Topic Models (NTMs) over the last two decades. Although NTMs have +achieved promising performance when trained and tested on a specific corpus, +their generalisation ability across corpora is rarely studied. In practice, we +often expect that an NTM trained on a source corpus can still produce quality +topical representation for documents in a different target corpus without +retraining. In this work, we aim to improve NTMs further so that their benefits +generalise reliably across corpora and tasks. To do so, we propose to model +similar documents by minimising their semantic distance when training NTMs. +Specifically, similar documents are created by data augmentation during +training; the semantic distance between documents is measured by the +Hierarchical Topic Transport Distance (HOTT), which computes the Optimal +Transport (OT) distance between the topical representations. Our framework can +be readily applied to most NTMs as a plug-and-play module. Extensive +experiments show that our framework significantly improves the generalisation +ability regarding neural topical representation across corpora. +
+
+
+
+
+ + ☆ Lost In Translation: Generating Adversarial Examples Robust to + Round-Trip Translation ICASSP + + +
+ Language Models today provide a high accuracy across a large number of +downstream tasks. However, they remain susceptible to adversarial attacks, +particularly against those where the adversarial examples maintain considerable +similarity to the original text. Given the multilingual nature of text, the +effectiveness of adversarial examples across translations and how machine +translations can improve the robustness of adversarial examples remain largely +unexplored. In this paper, we present a comprehensive study on the robustness +of current text adversarial attacks to round-trip translation. We demonstrate +that 6 state-of-the-art text-based adversarial attacks do not maintain their +efficacy after round-trip translation. Furthermore, we introduce an +intervention-based solution to this problem, by integrating Machine Translation +into the process of adversarial example generation and demonstrating increased +robustness to round-trip translation. Our results indicate that finding +adversarial examples robust to translation can help identify the insufficiency +of language models that is common across languages, and motivate further +research into multilingual adversarial attacks. + +
+
+ comment: Published at International Conference on Acoustics, Speech, and + Signal Processing (ICASSP) 2023 +
+
+
+
+
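The evaluation idea above, checking whether an adversarial example still flips the classifier after being translated out of and back into English, can be sketched as below. `translate` and `classify` are stand-in functions with assumed interfaces, not the MT systems or victim models used in the paper.

```python
def translate(text: str, src: str, tgt: str) -> str:
    """Placeholder for any machine translation system (assumed interface)."""
    return text  # identity here so the sketch runs end to end

def classify(text: str) -> str:
    """Placeholder victim sentiment classifier."""
    return "negative" if "terrible" in text else "positive"

def survives_round_trip(adv_text: str, original_label: str, pivot: str = "fr") -> bool:
    """An attack 'survives' if the round-tripped text still changes the predicted label."""
    round_tripped = translate(translate(adv_text, "en", pivot), pivot, "en")
    return classify(round_tripped) != original_label

adv = "the film was terrible in the most delightful way"
print(survives_round_trip(adv, original_label="positive"))
```

Swapping the identity `translate` for a real MT call and measuring how often `survives_round_trip` returns True is essentially the robustness statistic the abstract reports.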
+ + ☆ Investigating the Existence of "Secret Language'' in Language Models + + +
+ In this paper, we study the problem of secret language in NLP, where current +language models (LMs) seem to have a hidden vocabulary that allows them to +interpret absurd inputs as meaningful concepts. We investigate two research +questions: ``Does the secret language phenomenon exist in different language +models?'' and ``Does secret language depend on specific context?'' To answer +these questions, we introduce a novel method named \textit{SecretFinding}, a +gradient-based approach that can automatically discover secret languages in +LMs. We conduct experiments on five representative models (Electra, ALBERT, +Roberta, DistillBERT, and CLIP) finetuned on four NLP benchmarks (SST-2, MRPC, +SNLI, and SQuAD) and a language-grounding benchmark (MSCOCO). Our experimental +results show that even when we replace the most important words with others +that are semantically dissimilar to the original words in a sentence, LMs do +not consider the new sentence semantically dissimilar to the original, as the +output does not change with a high probability. This phenomenon holds true +across the five models and five tasks and gives a positive answer to the first +research question. As for the second research question, we find that the secret +language discovered by \textit{SecretFinding} is quite general and could even +be transferred to other models in the black-box settings, such as GPT-3 and +ChatGPT. Finally, we discuss the causes of secret language, how to eliminate +it, the potential connection to memorization, and ethical implications. +Examples of secret language found by SecretFinding are available on +https://huggingface.co/spaces/anonymousauthors/ACL23_SecretLanguage. + +
+
+
+
+
+ + ☆ Robust Automatic Speech Recognition via WavAugment Guided Phoneme + Adversarial Training + + +
+ Developing a practically robust automatic speech recognition (ASR) system is +challenging since the model should not only maintain the original performance +on clean samples, but also achieve consistent efficacy under small volume +perturbations and large domain shifts. To address this problem, we propose a +novel WavAugment Guided Phoneme Adversarial Training (wapat). wapat uses +adversarial examples in phoneme space as augmentation to make the model +invariant to minor fluctuations in phoneme representation and preserve the +performance on clean samples. In addition, wapat utilizes the phoneme +representation of augmented samples to guide the generation of adversaries, +which helps to find more stable and diverse gradient-directions, resulting in +improved generalization. Extensive experiments demonstrate the effectiveness of +wapat on the End-to-end Speech Challenge Benchmark (ESB). Notably, SpeechLM-wapat +outperforms the original model by a 6.28% WER reduction on ESB, achieving the new +state-of-the-art. +
+
+
+
+
+ + ♻ ☆ How Do Transformers Learn Topic Structure: Towards a Mechanistic + Understanding + + +
+ While the successes of transformers across many domains are indisputable, +accurate understanding of the learning mechanics is still largely lacking. +Their capabilities have been probed on benchmarks which include a variety of +structured and reasoning tasks -- but mathematical understanding is lagging +substantially behind. Recent lines of work have begun studying representational +aspects of this question: that is, the size/depth/complexity of attention-based +networks to perform certain tasks. However, there is no guarantee the learning +dynamics will converge to the constructions proposed. In our paper, we provide +fine-grained mechanistic understanding of how transformers learn "semantic +structure", understood as capturing co-occurrence structure of words. +Precisely, we show, through a combination of mathematical analysis and +experiments on Wikipedia data and synthetic data modeled by Latent Dirichlet +Allocation (LDA), that the embedding layer and the self-attention layer encode +the topical structure. In the former case, this manifests as higher average +inner product of embeddings between same-topic words. In the latter, it +manifests as higher average pairwise attention between same-topic words. The +mathematical results involve several assumptions to make the analysis +tractable, which we verify on data, and might be of independent interest as +well. + +
+
+
+
+
+ + ♻ ☆ Classification of US Supreme Court Cases using BERT-Based Techniques + + +
+ Models based on bidirectional encoder representations from transformers +(BERT) produce state of the art (SOTA) results on many natural language +processing (NLP) tasks such as named entity recognition (NER), part-of-speech +(POS) tagging etc. An interesting phenomenon occurs when classifying long +documents such as those from the US supreme court where BERT-based models can +be considered difficult to use on a first-pass or out-of-the-box basis. In this +paper, we experiment with several BERT-based classification techniques for US +supreme court decisions or supreme court database (SCDB) and compare them with +the previous SOTA results. We then compare our results specifically with SOTA +models for long documents. We compare our results for two classification tasks: +(1) a broad classification task with 15 categories and (2) a fine-grained +classification task with 279 categories. Our best result produces an accuracy +of 80\% on the 15 broad categories and 60\% on the fine-grained 279 categories +which marks an improvement of 8\% and 28\% respectively from previously +reported SOTA results. + +
+
+
+
+
+ + ♻ ☆ (Ab)using Images and Sounds for Indirect Instruction Injection in + Multi-Modal LLMs + + +
+ We demonstrate how images and sounds can be used for indirect prompt and +instruction injection in multi-modal LLMs. An attacker generates an adversarial +perturbation corresponding to the prompt and blends it into an image or audio +recording. When the user asks the (unmodified, benign) model about the +perturbed image or audio, the perturbation steers the model to output the +attacker-chosen text and/or make the subsequent dialog follow the attacker's +instruction. We illustrate this attack with several proof-of-concept examples +targeting LLaVa and PandaGPT. + +
+
+
+
+
+ + ♻ ☆ XTQA: Span-Level Explanations of the Textbook Question Answering + + +
+ Textbook Question Answering (TQA) is a task in which one should answer a +diagram/non-diagram question given a large multi-modal context consisting of +abundant essays and diagrams. We argue that the explainability of this task +should place students as a key aspect to be considered. To address this issue, +we devise a novel architecture towards span-level eXplanations of the TQA +(XTQA) based on our proposed coarse-to-fine grained algorithm, which can +provide not only the answers but also the span-level evidence to choose them +for students. This algorithm first coarsely chooses the top $M$ paragraphs relevant +to questions using the TF-IDF method, and then finely chooses the top $K$ evidence spans +from all candidate spans within these paragraphs by computing the +information gain of each span to questions. Experimental results show that +XTQA significantly improves the state-of-the-art performance compared with +baselines. The source code is available at +https://github.com/keep-smile-001/opentqa +
+
+ comment: Accepted by IEEE TNNLS +
+
+
+
+
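The coarse step of the algorithm above (TF-IDF ranking of context paragraphs against the question) can be sketched with scikit-learn as below; the fine span-scoring step is only hinted at via a comment, since the paper's information-gain computation is not reproduced here and the example paragraphs are invented.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def top_m_paragraphs(question, paragraphs, m=2):
    """Coarse step: rank context paragraphs by TF-IDF similarity to the question."""
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([question] + paragraphs)
    sims = cosine_similarity(matrix[0:1], matrix[1:]).ravel()
    ranked = sims.argsort()[::-1][:m]
    return [paragraphs[i] for i in ranked]

paragraphs = [
    "Photosynthesis converts light energy into chemical energy in plants.",
    "The water cycle describes evaporation, condensation and precipitation.",
    "Chlorophyll absorbs light, mainly in the blue and red wavelengths.",
]
question = "Which pigment absorbs light during photosynthesis?"
print(top_m_paragraphs(question, paragraphs, m=2))
# The fine step would then score candidate spans inside these paragraphs
# (e.g. by an information-gain criterion) and keep the top K as evidence.
```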
+ + ♻ ☆ Automatic Emotion Experiencer Recognition + + +
+ The most prominent subtask in emotion analysis is emotion classification; to +assign a category to a textual unit, for instance a social media post. Many +research questions from the social sciences do, however, not only require the +detection of the emotion of an author of a post but to understand who is +ascribed an emotion in text. This task is tackled by emotion role labeling +which aims at extracting who is described in text to experience an emotion, +why, and towards whom. This could, however, be considered overly sophisticated +if the main question to answer is who feels which emotion. A targeted approach +for such setup is to classify emotion experiencer mentions (aka "emoters") +regarding the emotion they presumably perceive. This task is similar to named +entity recognition of person names with the difference that not every mentioned +entity name is an emoter. While, very recently, data with emoter annotations +has been made available, no experiments have yet been performed to detect such +mentions. With this paper, we provide baseline experiments to understand how +challenging the task is. We further evaluate the impact on experiencer-specific +emotion categorization and appraisal detection in a pipeline, when gold +mentions are not available. We show that experiencer detection in text is a +challenging task, with a precision of .82 and a recall of .56 (F1 =.66). These +results motivate future work of jointly modeling emoter spans and +emotion/appraisal predictions. + +
+
+ comment: accepted to the CPSS workshop at KONVENS +
+
+
+
+
+ + ♻ ☆ The Next Chapter: A Study of Large Language Models in Storytelling + + +
+ To enhance the quality of generated stories, recent story generation models +have been investigating the utilization of higher-level attributes like plots +or commonsense knowledge. The application of prompt-based learning with large +language models (LLMs), exemplified by GPT-3, has exhibited remarkable +performance in diverse natural language processing (NLP) tasks. This paper +conducts a comprehensive investigation, utilizing both automatic and human +evaluation, to compare the story generation capacity of LLMs with recent models +across three datasets with variations in style, register, and length of +stories. The results demonstrate that LLMs generate stories of significantly +higher quality compared to other story generation models. Moreover, they +exhibit a level of performance that competes with human authors, albeit with +the preliminary observation that they tend to replicate real stories in +situations involving world knowledge, resembling a form of plagiarism. + +
+
+ comment: Accepted to INLG2023 +
+
+
+
+
+ + ♻ ☆ Towards autonomous system: flexible modular production system enhanced + with large language model agents + + +
+ In this paper, we present a novel framework that combines large language +models (LLMs), digital twins and industrial automation system to enable +intelligent planning and control of production processes. We retrofit the +automation system for a modular production facility and create executable +control interfaces of fine-granular functionalities and coarse-granular skills. +Low-level functionalities are executed by automation components, and high-level +skills are performed by automation modules. Subsequently, a digital twin system +is developed, registering these interfaces and containing additional +descriptive information about the production system. Based on the retrofitted +automation system and the created digital twins, LLM-agents are designed to +interpret descriptive information in the digital twins and control the physical +system through service interfaces. These LLM-agents serve as intelligent agents +on different levels within an automation system, enabling autonomous planning +and control of flexible production. Given a task instruction as input, the +LLM-agents orchestrate a sequence of atomic functionalities and skills to +accomplish the task. We demonstrate how our implemented prototype can handle +un-predefined tasks, plan a production process, and execute the operations. +This research highlights the potential of integrating LLMs into industrial +automation systems in the context of smart factory for more agile, flexible, +and adaptive production processes, while it also underscores the critical +insights and limitations for future work. Demos at: +https://github.com/YuchenXia/GPT4IndustrialAutomation + +
+
+ comment: This is the pre-print draft manuscript. The peer-reviewed version + will be published exclusively by IEEE after the conference, which is set to + take place from September 12th to 15th, 2023. We've made several improvements + to the final version of the paper based on valuable feedback and suggestions + from other researchers +
+
+
+
+
+ + ♻ ☆ Learning "O" Helps for Learning More: Handling the Concealed Entity + Problem for Class-incremental NER ACL 2023 + + +
+ As the categories of named entities rapidly increase, the deployed NER models +are required to keep updating toward recognizing more entity types, creating a +demand for class-incremental learning for NER. Considering the privacy concerns +and storage constraints, the standard paradigm for class-incremental NER +updates the models with training data only annotated with the new classes, yet +the entities from other entity classes are unlabeled, regarded as "Non-entity" +(or "O"). In this work, we conduct an empirical study on the "Unlabeled Entity +Problem" and find that it leads to severe confusion between "O" and entities, +decreasing class discrimination of old classes and declining the model's +ability to learn new classes. To solve the Unlabeled Entity Problem, we propose +a novel representation learning method to learn discriminative representations +for the entity classes and "O". Specifically, we propose an entity-aware +contrastive learning method that adaptively detects entity clusters in "O". +Furthermore, we propose two effective distance-based relabeling strategies for +better learning the old classes. We introduce a more realistic and challenging +benchmark for class-incremental NER, and the proposed method achieves up to +10.62\% improvement over the baseline methods. + +
+
+ comment: Accepted by ACL 2023 +
+
+
+
+
+ + ♻ ☆ Is ChatGPT a Biomedical Expert? -- Exploring the Zero-Shot Performance + of Current GPT Models in Biomedical Tasks + + +
+ We assessed the performance of commercial Large Language Models (LLMs) +GPT-3.5-Turbo and GPT-4 on tasks from the 2023 BioASQ challenge. In Task 11b +Phase B, which is focused on answer generation, both models demonstrated +competitive abilities with leading systems. Remarkably, they achieved this with +simple zero-shot learning, grounded with relevant snippets. Even without +relevant snippets, their performance was decent, though not on par with the +best systems. Interestingly, the older and cheaper GPT-3.5-Turbo system was +able to compete with GPT-4 in the grounded Q&A setting on factoid and list +answers. In Task 11b Phase A, focusing on retrieval, query expansion through +zero-shot learning improved performance, but the models fell short compared to +other systems. The code needed to rerun these experiments is available through +GitHub. + +
+
+ comment: Preprint accepted at the 11th BioASQ Workshop at the 14th Conference + and Labs of the Evaluation Forum (CLEF) 2023; Changes: 1. Added related work + and experimental setup sections. 2. Reworked discussion and future work + section. 3. Fixed multiple typos and improved style. Changed license +
+
+
+
+
+ + ♻ ☆ SparseGAN: Sparse Generative Adversarial Network for Text Generation + + +
+ It is still a challenging task to learn a neural text generation model under +the framework of generative adversarial networks (GANs) since the entire +training process is not differentiable. The existing training strategies either +suffer from unreliable gradient estimations or imprecise sentence +representations. Inspired by the principle of sparse coding, we propose a +SparseGAN that generates semantic-interpretable, but sparse sentence +representations as inputs to the discriminator. The key idea is that we treat +an embedding matrix as an over-complete dictionary, and use a linear +combination of very few selected word embeddings to approximate the output +feature representation of the generator at each time step. With such +semantic-rich representations, we not only reduce unnecessary noises for +efficient adversarial training, but also make the entire training process fully +differentiable. Experiments on multiple text generation datasets yield +performance improvements, especially in sequence-level metrics, such as BLEU. + +
+
+
+
+
+ + ♻ ☆ Style Classification of Rabbinic Literature for Detection of Lost + Midrash Tanhuma Material + + +
+ Midrash collections are complex rabbinic works that consist of text in +multiple languages, which evolved through long processes of unstable oral and +written transmission. Determining the origin of a given passage in such a +compilation is not always straightforward and is often a matter of dispute +among scholars, yet it is essential for scholars' understanding of the passage +and its relationship to other texts in the rabbinic corpus. To help solve this +problem, we propose a system for classification of rabbinic literature based on +its style, leveraging recent advances in natural language processing for Hebrew +texts. Additionally, we demonstrate how this method can be applied to uncover +lost material from a specific midrash genre, Tan\d{h}uma-Yelammedenu, that has +been preserved in later anthologies. + +
+
+
+
+
+ + ♻ ☆ Improving Coreference Resolution by Leveraging Entity-Centric Features + with Graph Neural Networks and Second-order Inference + + +
+ One of the major challenges in coreference resolution is how to make use of +entity-level features defined over clusters of mentions rather than mention +pairs. However, coreferent mentions usually spread far apart in an entire text, +which makes it extremely difficult to incorporate entity-level features. We +propose a graph neural network-based coreference resolution method that can +capture the entity-centric information by encouraging the sharing of features +across all mentions that probably refer to the same real-world entity. Mentions +are linked to each other via the edges modeling how likely two linked mentions +point to the same entity. Modeling by such graphs, the features between +mentions can be shared by message passing operations in an entity-centric +manner. A global inference algorithm up to second-order features is also +presented to optimally cluster mentions into consistent groups. Experimental +results show our graph neural network-based method combined with the +second-order decoding algorithm (named GNNCR) achieved close to +state-of-the-art performance on the English CoNLL-2012 Shared Task dataset. +
+
+
+
+
+ + ♻ ☆ SpokenWOZ: A Large-Scale Speech-Text Benchmark for Spoken Task-Oriented + Dialogue Agents + + +
+ Task-oriented dialogue (TOD) models have made significant progress in recent +years. However, previous studies primarily focus on datasets written by +annotators, which has resulted in a gap between academic research and +real-world spoken conversation scenarios. While several small-scale spoken TOD +datasets are proposed to address robustness issues such as ASR errors, they +ignore the unique challenges in spoken conversation. To tackle the limitations, +we introduce SpokenWOZ, a large-scale speech-text dataset for spoken TOD, +containing 8 domains, 203k turns, 5.7k dialogues and 249 hours of audios from +human-to-human spoken conversations. SpokenWOZ further incorporates common +spoken characteristics such as word-by-word processing and reasoning in spoken +language. Based on these characteristics, we present cross-turn slot and +reasoning slot detection as new challenges. We conduct experiments on various +baselines, including text-modal models, newly proposed dual-modal models, and +LLMs, e.g., ChatGPT. The results show that the current models still have +substantial room for improvement in spoken conversation, where the most +advanced dialogue state tracker only achieves 25.65% in joint goal accuracy and +the SOTA end-to-end model only correctly completes the user request in 52.1% of +dialogues. The dataset, code, and leaderboard are available: +https://spokenwoz.github.io/SpokenWOZ-github.io/. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Summarization by Jointly Extracting Sentences and Keywords + + +
+ We present RepRank, an unsupervised graph-based ranking model for extractive +multi-document summarization in which the similarity between words, sentences, +and word-to-sentence can be estimated by the distances between their vector +representations in a unified vector space. In order to obtain desirable +representations, we propose a self-attention based learning method that +represents a sentence by the weighted sum of its word embeddings, with the +weights concentrated on those words that hopefully better reflect the content +of a document. We show that salient sentences and keywords can be extracted in +a joint and mutual reinforcement process using our learned representations, and +prove that this process always converges to a unique solution leading to +improvement in performance. A variant of absorbing random walk and the +corresponding sampling-based algorithm are also described to avoid redundancy +and increase diversity in the summaries. Experimental results with multiple +benchmark datasets show that RepRank achieved the best or comparable +performance in ROUGE. +
+
+ comment: 10 pages(includes 2 pages references), 1 figure +
+
+
+
+
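A stripped-down version of the idea of representing a sentence as an attention-weighted sum of its word embeddings is sketched below with numpy; the scoring function, the fixed query vector, and the toy dimensions are simplified assumptions rather than the learned model described above.

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def sentence_embedding(word_vectors, query):
    """Weight word vectors by a simple attention score against a query vector.

    word_vectors: (num_words, dim) array of word embeddings.
    query: (dim,) vector; in the actual model this role is learned.
    """
    scores = word_vectors @ query   # one relevance score per word
    weights = softmax(scores)       # normalise scores into attention weights
    return weights @ word_vectors   # weighted sum = sentence representation

rng = np.random.default_rng(42)
words = rng.normal(size=(6, 8))     # 6 words, 8-dimensional toy embeddings
query = rng.normal(size=8)
vec = sentence_embedding(words, query)
print(vec.shape)                    # (8,)
```

Because sentences and words then live in the same vector space, word-to-sentence similarities can be read off directly as distances, which is what the joint sentence/keyword ranking relies on.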
+ + ♻ ☆ CausE: Towards Causal Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE) focuses on representing the entities and +relations of a knowledge graph (KG) into the continuous vector spaces, which +can be employed to predict the missing triples to achieve knowledge graph +completion (KGC). However, KGE models often only briefly learn structural +correlations of triple data and embeddings would be misled by the trivial +patterns and noisy links in real-world KGs. To address this issue, we build the +new paradigm of KGE in the context of causality and embedding disentanglement. +We further propose a Causality-enhanced knowledge graph Embedding (CausE) +framework. CausE employs causal intervention to estimate the causal effect of +the confounder embeddings and design new training objectives to make stable +predictions. Experimental results demonstrate that CausE could outperform the +baseline models and achieve state-of-the-art KGC performance. We release our +code in https://github.com/zjukg/CausE. + +
+
+ comment: Accepted by CCKS 2023 as a research paper +
+
+
+
+
+ + ♻ ☆ Chinese Fine-Grained Financial Sentiment Analysis with Large Language + Models IJCAI 2023 + + +
+ Entity-level fine-grained sentiment analysis in the financial domain is a +crucial subtask of sentiment analysis and currently faces numerous challenges. +The primary challenge stems from the lack of high-quality and large-scale +annotated corpora specifically designed for financial text sentiment analysis, +which in turn limits the availability of data necessary for developing +effective text processing techniques. Recent advancements in large language +models (LLMs) have yielded remarkable performance in natural language +processing tasks, primarily centered around language pattern matching. In this +paper, we propose a novel and extensive Chinese fine-grained financial +sentiment analysis dataset, FinChina SA, for enterprise early warning. We +thoroughly evaluate and experiment with well-known existing open-source LLMs +using our dataset. We firmly believe that our dataset will serve as a valuable +resource to advance the exploration of real-world financial sentiment analysis +tasks, which should be the focus of future research. The FinChina SA dataset is +publicly available at https://github.com/YerayL/FinChina-SA + +
+
+ comment: FinLLM Symposium at IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation + Incorporating Gloss Information ACL 2023 + + +
+ Visual Word Sense Disambiguation (VWSD) is a task to find the image that most +accurately depicts the correct sense of the target word for the given context. +Previously, image-text matching models often suffered from recognizing +polysemous words. This paper introduces an unsupervised VWSD approach that uses +gloss information of an external lexical knowledge-base, especially the sense +definitions. Specifically, we suggest employing Bayesian inference to +incorporate the sense definitions when sense information of the answer is not +provided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we +propose a context-aware definition generation with GPT-3. Experimental results +show that the VWSD performance significantly increased with our Bayesian +inference-based approach. In addition, our context-aware definition generation +achieved prominent performance improvement in OOD examples exhibiting better +performance than the existing definition generation method. + +
+
+ comment: ACL 2023, https://aclanthology.org/2023.acl-long.88 +
+
+
+
+
+ + ♻ ☆ ODD: A Benchmark Dataset for the NLP-based Opioid Related Aberrant + Behavior Detection + + +
+ Opioid-related aberrant behaviors (ORAB) present novel risk factors for +opioid overdose. Previously, ORAB have been mainly assessed by survey results +and by monitoring drug administrations. Such methods, however, cannot scale up +and do not cover the entire spectrum of aberrant behaviors. On the other hand, +ORAB are widely documented in electronic health record notes. This paper +introduces a novel biomedical natural language processing benchmark dataset +named ODD, for ORAB Detection Dataset. ODD is an expert-annotated dataset +comprising more than 750 publicly available EHR notes. ODD has been designed +to identify ORAB from patients' EHR notes and classify them into nine +categories: 1) Confirmed Aberrant Behavior, 2) Suggested Aberrant Behavior, 3) +Opioids, 4) Indication, 5) Diagnosed opioid dependency, 6) Benzodiazepines, 7) +Medication Changes, 8) Central Nervous System-related, and 9) Social +Determinants of Health. We explored two state-of-the-art natural language +processing (NLP) models (finetuning pretrained language models and +prompt-tuning approaches) to identify ORAB. Experimental results show that the +prompt-tuning models outperformed the finetuning models in most categories and +the gains were especially higher among uncommon categories (Suggested aberrant +behavior, Diagnosed opioid dependency and Medication change). Although the best +model achieved the highest area under the precision-recall curve of 83.92%, +uncommon classes (Suggested Aberrant Behavior, Diagnosed Opioid Dependence, and +Medication Change) still leave substantial room for performance improvement. +
+
+ comment: Under review +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 104 + +
+
+
+ + ☆ 3D-LLM: Injecting the 3D World into Large Language Models + + +
+ Large language models (LLMs) and Vision-Language Models (VLMs) have been +proven to excel at multiple tasks, such as commonsense reasoning. Powerful as +these models can be, they are not grounded in the 3D physical world, which +involves richer concepts such as spatial relationships, affordances, physics, +layout, and so on. In this work, we propose to inject the 3D world into large +language models and introduce a whole new family of 3D-LLMs. Specifically, +3D-LLMs can take 3D point clouds and their features as input and perform a +diverse set of 3D-related tasks, including captioning, dense captioning, 3D +question answering, task decomposition, 3D grounding, 3D-assisted dialog, +navigation, and so on. Using three types of prompting mechanisms that we +design, we are able to collect over 300k 3D-language data covering these tasks. +To efficiently train 3D-LLMs, we first utilize a 3D feature extractor that +obtains 3D features from rendered multi- view images. Then, we use 2D VLMs as +our backbones to train our 3D-LLMs. By introducing a 3D localization mechanism, +3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show +that our model outperforms state-of-the-art baselines by a large margin (e.g., +the BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore, +experiments on our held-in datasets for 3D captioning, task composition, and +3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative +examples also show that our model could perform more tasks beyond the scope of +existing LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/. + +
+
+ comment: Project Page: https://vis-www.cs.umass.edu/3dllm/ +
+
+
+
+
+ + ☆ A Systematic Survey of Prompt Engineering on Vision-Language Foundation + Models + + +
+ Prompt engineering is a technique that involves augmenting a large +pre-trained model with task-specific hints, known as prompts, to adapt the +model to new tasks. Prompts can be created manually as natural language +instructions or generated automatically as either natural language instructions +or vector representations. Prompt engineering enables the ability to perform +predictions based solely on prompts without updating model parameters, and the +easier application of large pre-trained models in real-world tasks. In past +years, Prompt engineering has been well-studied in natural language processing. +Recently, it has also been intensively studied in vision-language modeling. +However, there is currently a lack of a systematic overview of prompt +engineering on pre-trained vision-language models. This paper aims to provide a +comprehensive survey of cutting-edge research in prompt engineering on three +types of vision-language models: multimodal-to-text generation models (e.g. +Flamingo), image-text matching models (e.g. CLIP), and text-to-image generation +models (e.g. Stable Diffusion). For each type of model, a brief model summary, +prompting methods, prompting-based applications, and the corresponding +responsibility and integrity issues are summarized and discussed. Furthermore, +the commonalities and differences between prompting on vision-language models, +language models, and vision models are also discussed. The challenges, future +directions, and research opportunities are summarized to foster future research +on this topic. + +
+
+
+
+
+ + ☆ DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting + + +
+ In this paper, we propose a new operator, called 3D DeFormable Attention +(DFA3D), for 2D-to-3D feature lifting, which transforms multi-view 2D image +features into a unified 3D space for 3D object detection. Existing feature +lifting approaches, such as Lift-Splat-based and 2D attention-based, either use +estimated depth to get pseudo LiDAR features and then splat them to a 3D space, +which is a one-pass operation without feature refinement, or ignore depth and +lift features by 2D attention mechanisms, which achieve finer semantics while +suffering from a depth ambiguity problem. In contrast, our DFA3D-based method +first leverages the estimated depth to expand each view's 2D feature map to 3D +and then utilizes DFA3D to aggregate features from the expanded 3D feature +maps. With the help of DFA3D, the depth ambiguity problem can be effectively +alleviated from the root, and the lifted features can be progressively refined +layer by layer, thanks to the Transformer-like architecture. In addition, we +propose a mathematically equivalent implementation of DFA3D which can +significantly improve its memory efficiency and computational speed. We +integrate DFA3D into several methods that use 2D attention-based feature +lifting with only a few code modifications and evaluate them on the nuScenes +dataset. The experimental results show a consistent improvement of +1.41\% mAP on +average, and up to +15.1\% mAP improvement when high-quality depth information +is available, demonstrating the superiority, applicability, and huge potential +of DFA3D. The code is available at +https://github.com/IDEA-Research/3D-deformable-attention.git. +
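+
+ A minimal sketch of the depth-based expansion step described above (shapes, names, and the
+ einsum formulation are illustrative assumptions; the actual DFA3D operator additionally applies
+ 3D deformable attention over the expanded volume):
+
+ import torch
+
+ def lift_2d_to_3d(feat_2d: torch.Tensor, depth_prob: torch.Tensor) -> torch.Tensor:
+     # feat_2d:    (B, C, H, W) per-view image features
+     # depth_prob: (B, D, H, W) per-pixel depth distribution (softmax over D depth bins)
+     # returns:    (B, C, D, H, W) depth-weighted 3D feature volume
+     return torch.einsum("bchw,bdhw->bcdhw", feat_2d, depth_prob)
+
+ feat = torch.randn(2, 64, 32, 32)
+ depth = torch.softmax(torch.randn(2, 48, 32, 32), dim=1)
+ volume = lift_2d_to_3d(feat, depth)   # torch.Size([2, 64, 48, 32, 32])
+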
+
+
+
+
+ + ☆ Volcanic ash delimitation using Artificial Intelligence based on Pix2Pix + + +
+ Volcanic eruptions emit ash that can be harmful to human health and cause +damage to infrastructure, economic activities and the environment. Delimiting +ash clouds makes it possible to track their behavior and dispersion, which +helps in the prevention and mitigation of this phenomenon. Traditional methods +take advantage of specialized software programs to process the bands or +channels that compose the satellite images. However, their use is limited to +experts and demands considerable time and significant computational resources. In +recent years, Artificial Intelligence has been a milestone in the computational +treatment of complex problems in different areas. In particular, Deep Learning +techniques allow automatic, fast and accurate processing of digital images. The +present work proposes the use of the Pix2Pix model, a type of generative +adversarial network that, once trained, learns the mapping of input images to +output images. The architecture of such a network, consisting of a generator and +a discriminator, provides the versatility needed to produce black-and-white ash +cloud images from multispectral satellite images. The evaluation of the model, +based on loss and accuracy plots, a confusion matrix, and visual inspection, +indicates a satisfactory solution for accurate ash cloud delineation that is +applicable in any area of the world and constitutes a useful tool in risk +management. +
+
+ comment: 18 pages, in Spanish language, 15 figures +
+
+
+
+
+ + ☆ Learning Dense Correspondences between Photos and Sketches ICML 2023 + + +
+ Humans effortlessly grasp the connection between sketches and real-world +objects, even when these sketches are far from realistic. Moreover, human +sketch understanding goes beyond categorization -- critically, it also entails +understanding how individual elements within a sketch correspond to parts of +the physical world it represents. What are the computational ingredients needed +to support this ability? Towards answering this question, we make two +contributions: first, we introduce a new sketch-photo correspondence benchmark, +$\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across +125 object categories, augmenting the existing Sketchy dataset with +fine-grained correspondence metadata. Second, we propose a self-supervised +method for learning dense correspondences between sketch-photo pairs, building +upon recent advances in correspondence learning for pairs of photos. Our model +uses a spatial transformer network to estimate the warp flow between latent +representations of a sketch and photo extracted by a contrastive learning-based +ConvNet backbone. We found that this approach outperformed several strong +baselines and produced predictions that were quantitatively consistent with +other warp-based methods. However, our benchmark also revealed systematic +differences between predictions of the suite of models we tested and those of +humans. Taken together, our work suggests a promising path towards developing +artificial systems that achieve more human-like understanding of visual images +at different levels of abstraction. Project page: +https://photo-sketch-correspondence.github.io + +
+
+ comment: Accepted to ICML 2023. Project page: + https://photo-sketch-correspondence.github.io +
+
+
+
+
+ + ☆ Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature + Alignment + + +
+ Text-to-video retrieval systems have recently made significant progress by +utilizing pre-trained models trained on large-scale image-text pairs. However, +most of the latest methods primarily focus on the video modality while +disregarding the audio signal for this task. A recent advancement, +ECLIPSE, has improved long-range text-to-video retrieval by developing an +audiovisual video representation. Nonetheless, the objective of the +text-to-video retrieval task is to capture the complementary audio and video +information that is pertinent to the text query rather than simply achieving +better audio and video alignment. To address this issue, we introduce TEFAL, a +TExt-conditioned Feature ALignment method that produces both audio and video +representations conditioned on the text query. Instead of using only an +audiovisual attention block, which could suppress the audio information +relevant to the text query, our approach employs two independent cross-modal +attention blocks that enable the text to attend to the audio and video +representations separately. Our proposed method's efficacy is demonstrated on +four benchmark datasets that include audio: MSR-VTT, LSMDC, VATEX, and +Charades, and achieves better than state-of-the-art performance consistently +across the four datasets. This is attributed to the additional +text-query-conditioned audio representation and the complementary information +it adds to the text-query-conditioned video representation. +
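+
+ A minimal sketch of the two independent text-conditioned cross-attention blocks described above
+ (dimensions and the pooling over text tokens are illustrative assumptions, not the released
+ implementation):
+
+ import torch
+ import torch.nn as nn
+
+ class TextConditionedAlignment(nn.Module):
+     def __init__(self, dim: int = 512, heads: int = 8):
+         super().__init__()
+         self.text_to_audio = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.text_to_video = nn.MultiheadAttention(dim, heads, batch_first=True)
+
+     def forward(self, text, audio, video):
+         # text: (B, Lt, D), audio: (B, La, D), video: (B, Lv, D)
+         audio_rep, _ = self.text_to_audio(query=text, key=audio, value=audio)
+         video_rep, _ = self.text_to_video(query=text, key=video, value=video)
+         # pool over text tokens to obtain text-query-conditioned clip-level embeddings
+         return audio_rep.mean(dim=1), video_rep.mean(dim=1)
+
+ model = TextConditionedAlignment()
+ t, a, v = torch.randn(2, 12, 512), torch.randn(2, 30, 512), torch.randn(2, 16, 512)
+ audio_emb, video_emb = model(t, a, v)
+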
+
+
+
+
+ + ☆ On Privileged and Convergent Bases in Neural Network Representations ICML 2023 + + +
+ In this study, we investigate whether the representations learned by neural +networks possess a privileged and convergent basis. Specifically, we examine +the significance of feature directions represented by individual neurons. +First, we establish that arbitrary rotations of neural representations cannot +be inverted (unlike linear networks), indicating that they do not exhibit +complete rotational invariance. Subsequently, we explore the possibility of +multiple bases achieving identical performance. To do this, we compare the +bases of networks trained with the same parameters but with varying random +initializations. Our study reveals two findings: (1) Even in wide networks such +as WideResNets, neural networks do not converge to a unique basis; (2) Basis +correlation increases significantly when a few early layers of the network are +frozen identically. + Furthermore, we analyze Linear Mode Connectivity, which has been studied as a +measure of basis correlation. Our findings give evidence that while Linear Mode +Connectivity improves with increased network width, this improvement is not due +to an increase in basis correlation. + +
+
+ comment: In the Workshop on High-dimensional Learning Dynamics at ICML 2023 +
+
+
+
+
+ + ☆ Hierarchical Skeleton Meta-Prototype Contrastive Learning with Hard + Skeleton Mining for Unsupervised Person Re-Identification + + +
+ With rapid advancements in depth sensors and deep learning, skeleton-based +person re-identification (re-ID) models have recently achieved remarkable +progress with many advantages. Most existing solutions learn single-level +skeleton features from body joints with the assumption of equal skeleton +importance, while they typically lack the ability to exploit more informative +skeleton features from various levels such as limb level with more global body +patterns. The label dependency of these methods also limits their flexibility +in learning more general skeleton representations. This paper proposes a +generic unsupervised Hierarchical skeleton Meta-Prototype Contrastive learning +(Hi-MPC) approach with Hard Skeleton Mining (HSM) for person re-ID with +unlabeled 3D skeletons. Firstly, we construct hierarchical representations of +skeletons to model coarse-to-fine body and motion features from the levels of +body joints, components, and limbs. Then a hierarchical meta-prototype +contrastive learning model is proposed to cluster and contrast the most typical +skeleton features ("prototypes") from different-level skeletons. By converting +original prototypes into meta-prototypes with multiple homogeneous +transformations, we induce the model to learn the inherent consistency of +prototypes to capture more effective skeleton features for person re-ID. +Furthermore, we devise a hard skeleton mining mechanism to adaptively infer the +informative importance of each skeleton, so as to focus on harder skeletons to +learn more discriminative skeleton representations. Extensive evaluations on +five datasets demonstrate that our approach outperforms a wide variety of +state-of-the-art skeleton-based methods. We further show the general +applicability of our method to cross-view person re-ID and RGB-based scenarios +with estimated skeletons. + +
+
+ comment: Accepted by International Journal of Computer Vision (IJCV). Codes + are available at https://github.com/Kali-Hac/Hi-MPC. Supplemental materials + will be included in the published version +
+
+
+
+
+ + ☆ Towards a Visual-Language Foundation Model for Computational Pathology + + +
+ The accelerated adoption of digital pathology and advances in deep learning +have enabled the development of powerful models for various pathology tasks +across a diverse array of diseases and patient cohorts. However, model training +is often difficult due to label scarcity in the medical domain and the model's +usage is limited by the specific task and disease for which it is trained. +Additionally, most models in histopathology leverage only image data, a stark +contrast to how humans teach each other and reason about histopathologic +entities. We introduce CONtrastive learning from Captions for Histopathology +(CONCH), a visual-language foundation model developed using diverse sources of +histopathology images, biomedical text, and notably over 1.17 million +image-caption pairs via task-agnostic pretraining. Evaluated on a suite of 13 +diverse benchmarks, CONCH can be transferred to a wide range of downstream +tasks involving either or both histopathology images and text, achieving +state-of-the-art performance on histology image classification, segmentation, +captioning, text-to-image and image-to-text retrieval. CONCH represents a +substantial leap over concurrent visual-language pretrained systems for +histopathology, with the potential to directly facilitate a wide array of +machine learning-based workflows requiring minimal or no further supervised +fine-tuning. + +
+
+
+
+
+ + ☆ Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields + + +
+ Recently, the editing of neural radiance fields (NeRFs) has gained +considerable attention, but most prior works focus on static scenes while +research on the appearance editing of dynamic scenes is relatively lacking. In +this paper, we propose a novel framework to edit the local appearance of +dynamic NeRFs by manipulating pixels in a single frame of training video. +Specifically, to locally edit the appearance of dynamic NeRFs while preserving +unedited regions, we introduce a local surface representation of the edited +region, which can be inserted into and rendered along with the original NeRF +and warped to arbitrary other frames through a learned invertible motion +representation network. By employing our method, users without professional +expertise can easily add desired content to the appearance of a dynamic scene. +We extensively evaluate our approach on various scenes and show that our +approach achieves spatially and temporally consistent editing results. Notably, +our approach is versatile and applicable to different variants of dynamic NeRF +representations. + +
+
+ comment: project page: https://dyn-e.github.io/ +
+
+
+
+
+ + ☆ GridMM: Grid Memory Map for Vision-and-Language Navigation + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following the natural language instruction in 3D environments. +To represent the previously visited environment, most approaches for VLN +implement memory using recurrent states, topological maps, or top-down semantic +maps. In contrast to these approaches, we build the top-down egocentric and +dynamically growing Grid Memory Map (i.e., GridMM) to structure the visited +environment. From a global perspective, historical observations are projected +into a unified grid map in a top-down view, which can better represent the +spatial relations of the environment. From a local perspective, we further +propose an instruction relevance aggregation method to capture fine-grained +visual clues in each grid region. Extensive experiments are conducted on +the REVERIE, R2R, and SOON datasets in discrete environments, and on the R2R-CE +dataset in continuous environments, showing the superiority of our proposed +method. +
+
+
+
+
+ + ☆ Automotive Object Detection via Learning Sparse Events by Temporal + Dynamics of Spiking Neurons + + +
+ Event-based sensors, with their high temporal resolution (1 us) and dynamic +range (120 dB), have the potential to be deployed in high-speed platforms such +as vehicles and drones. However, the highly sparse and fluctuating nature of +events poses challenges for conventional object detection techniques based on +Artificial Neural Networks (ANNs). In contrast, Spiking Neural Networks (SNNs) +are well-suited for representing event-based data due to their inherent +temporal dynamics. In particular, we demonstrate that the membrane potential +dynamics can modulate network activity upon fluctuating events and strengthen +features of sparse input. In addition, the spike-triggered adaptive threshold +can stabilize training, which further improves network performance. Based on +this, we develop an efficient spiking feature pyramid network for event-based +object detection. Our proposed SNN outperforms previous SNNs and sophisticated +ANNs with attention mechanisms, achieving a mean average precision (mAP50) of +47.7% on the Gen1 benchmark dataset. This result significantly surpasses the +previous best SNN by 9.7% and demonstrates the potential of SNNs for +event-based vision. Our model has a concise architecture while maintaining high +accuracy and a much lower computation cost as a result of sparse computation. Our +code will be publicly available. +
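+
+ A minimal sketch of a leaky integrate-and-fire step with a spike-triggered adaptive threshold,
+ the two ingredients highlighted above (constants and the reset/decay rules are illustrative
+ assumptions, not the paper's exact neuron model):
+
+ import torch
+
+ def lif_adaptive_step(x, v, thr, tau=2.0, thr_base=1.0, beta=0.1, decay=0.9):
+     v = v + (x - v) / tau                 # leaky integration of the membrane potential
+     spike = (v >= thr).float()            # emit a spike where the potential crosses the threshold
+     v = v * (1.0 - spike)                 # hard reset after a spike
+     thr = decay * thr + (1 - decay) * thr_base + beta * spike   # raise the threshold after spiking
+     return spike, v, thr
+
+ v, thr = torch.zeros(4, 8), torch.ones(4, 8)
+ for _ in range(10):                       # unroll over event time steps
+     spikes, v, thr = lif_adaptive_step(torch.rand(4, 8), v, thr)
+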
+
+
+
+
+ + ☆ Data-free Black-box Attack based on Diffusion Model + + +
+ Since the training data for the target model in a data-free black-box attack +is not available, most recent schemes utilize GANs to generate data for +training a substitute model. However, these GAN-based schemes suffer from low +training efficiency, as the generator needs to be retrained for each target +model during the substitute training process, as well as from low generation +quality. To overcome these limitations, we consider utilizing a diffusion +model to generate data, and propose a data-free black-box attack scheme based +on a diffusion model to improve the efficiency and accuracy of substitute +training. Although the data generated by the diffusion model exhibits high +quality, it presents diverse domain distributions and contains many samples +that do not meet the discriminative criteria of the target model. To further +facilitate the diffusion model to generate data suitable for the target model, +we propose a Latent Code Augmentation (LCA) method to guide the diffusion model +in generating data. With the guidance of LCA, the data generated by the +diffusion model not only meets the discriminative criteria of the target model +but also exhibits high diversity. By utilizing this data, it is possible to +train substitute models that closely resemble the target model more efficiently. +Extensive experiments demonstrate that our LCA achieves higher attack success +rates and requires fewer query budgets compared to GAN-based schemes for +different target models. +
+
+
+
+
+ + ☆ Understanding the Latent Space of Diffusion Models through the Lens of + Riemannian Geometry + + +
+ Despite the success of diffusion models (DMs), we still lack a thorough +understanding of their latent space. To understand the latent space +$\mathbf{x}_t \in \mathcal{X}$, we analyze it from a geometrical perspective. +Specifically, we utilize the pullback metric to find the local latent basis in +$\mathcal{X}$ and the corresponding local tangent basis in $\mathcal{H}$, the +intermediate feature maps of DMs. The discovered latent basis enables +unsupervised image editing capability through latent space traversal. We +investigate the discovered structure from two perspectives. First, we examine +how the geometric structure evolves over diffusion timesteps. Through analysis, we +show that 1) the model focuses on low-frequency components early in the +generative process and attunes to high-frequency details later; 2) at early +timesteps, different samples share similar tangent spaces; and 3) the simpler +the datasets that DMs are trained on, the more consistent the tangent space is for each +timestep. Second, we investigate how the geometric structure changes based on +text conditioning in Stable Diffusion. The results show that 1) similar prompts +yield comparable tangent spaces; and 2) the model depends less on text +conditions in later timesteps. To the best of our knowledge, this paper is the +first to present image editing through $\mathbf{x}$-space traversal and provide +thorough analyses of the latent structure of DMs. +
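+
+ A minimal sketch of finding a local latent basis from the pullback metric, using a small toy
+ network in place of a diffusion U-Net (the network, shapes, and traversal scale are illustrative
+ assumptions): the SVD of the Jacobian of the intermediate features h(x) with respect to the
+ latent x gives the local basis in X (right singular vectors) and in H (left singular vectors).
+
+ import torch
+
+ net = torch.nn.Sequential(torch.nn.Linear(16, 64), torch.nn.Tanh(), torch.nn.Linear(64, 32))
+
+ x = torch.randn(16)                                   # a single (flattened) latent sample
+ J = torch.autograd.functional.jacobian(net, x)        # (32, 16) Jacobian dh/dx
+ U, S, Vh = torch.linalg.svd(J, full_matrices=False)
+
+ latent_basis = Vh        # rows: local basis directions in the latent space X
+ tangent_basis = U.T      # rows: corresponding tangent directions in the feature space H
+
+ edited_x = x + 3.0 * latent_basis[0]                  # traverse the dominant latent direction
+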
+
+
+
+
+ + ☆ Treatment Outcome Prediction for Intracerebral Hemorrhage via Generative + Prognostic Model with Imaging and Tabular Data + + +
+ Intracerebral hemorrhage (ICH) is the second most common and deadliest form +of stroke. Despite medical advances, predicting treatment outcomes for ICH +remains a challenge. This paper proposes a novel prognostic model that utilizes +both imaging and tabular data to predict treatment outcome for ICH. Our model +is trained on observational data collected from non-randomized controlled +trials, providing reliable predictions of treatment success. Specifically, we +propose to employ a variational autoencoder model to generate a low-dimensional +prognostic score, which can effectively address the selection bias resulting +from the non-randomized controlled trials. Importantly, we develop a +variational distributions combination module that combines the information from +imaging data, non-imaging clinical data, and treatment assignment to accurately +generate the prognostic score. We conducted extensive experiments on a +real-world clinical dataset of intracerebral hemorrhage. Our proposed method +demonstrates a substantial improvement in treatment outcome prediction compared +to existing state-of-the-art approaches. Code is available at +https://github.com/med-air/TOP-GPM +
+
+
+
+
+ + ☆ Multiscale Video Pretraining for Long-Term Activity Forecasting + + +
+ Long-term activity forecasting is an especially challenging research problem +because it requires understanding the temporal relationships between observed +actions, as well as the variability and complexity of human activities. Despite +relying on strong supervision via expensive human annotations, state-of-the-art +forecasting approaches often generalize poorly to unseen data. To alleviate +this issue, we propose Multiscale Video Pretraining (MVP), a novel +self-supervised pretraining approach that learns robust representations for +forecasting by learning to predict contextualized representations of future +video clips over multiple timescales. MVP is based on our observation that +actions in videos have a multiscale nature, where atomic actions typically +occur at a short timescale and more complex actions may span longer timescales. +We compare MVP to state-of-the-art self-supervised video learning approaches on +downstream long-term forecasting tasks including long-term action anticipation +and video summary prediction. Our comprehensive experiments across the Ego4D +and Epic-Kitchens-55/100 datasets demonstrate that MVP outperforms +state-of-the-art methods by significant margins. Notably, MVP obtains a +relative performance gain of over 20% accuracy in video summary forecasting +over existing methods. +
+
+
+
+
+ + ☆ Spatiotemporal Modeling Encounters 3D Medical Image Analysis: + Slice-Shift UNet with Multi-View Fusion + + +
+ As a fundamental part of computational healthcare, Computed Tomography (CT) +and Magnetic Resonance Imaging (MRI) provide volumetric data, making the +development of algorithms for 3D image analysis a necessity. Despite being +computationally cheap, 2D Convolutional Neural Networks can only extract +spatial information. In contrast, 3D CNNs can extract three-dimensional +features, but they have higher computational costs and latency, which is a +limitation for clinical practice that requires fast and efficient models. +Inspired by the field of video action recognition, we propose a new 2D-based +model dubbed Slice SHift UNet (SSH-UNet) which encodes three-dimensional +features at the complexity of a 2D CNN. More precisely, multi-view features are +collaboratively learned by performing 2D convolutions along the three +orthogonal planes of a volume and imposing a weights-sharing mechanism. The +third dimension, which is neglected by the 2D convolution, is reincorporated by +shifting a portion of the feature maps along the slices' axis. The +effectiveness of our approach is validated on the Multi-Modality Abdominal +Multi-Organ Segmentation (AMOS) and Multi-Atlas Labeling Beyond the Cranial +Vault (BTCV) datasets, showing that SSH-UNet is more efficient while on par in +performance with state-of-the-art architectures. +
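+
+ A minimal sketch of the slice-shift idea described above: a fraction of the channels is shifted
+ along the slice axis so that plane-wise 2D convolutions can exchange inter-slice information
+ (the circular roll and the channel fraction are illustrative assumptions, not the authors' exact
+ implementation):
+
+ import torch
+
+ def slice_shift(x: torch.Tensor, fold_div: int = 8) -> torch.Tensor:
+     # x: (B, C, D, H, W) volumetric feature map
+     b, c, d, h, w = x.shape
+     fold = c // fold_div
+     out = x.clone()
+     out[:, :fold] = torch.roll(x[:, :fold], shifts=1, dims=2)                    # shift forward
+     out[:, fold:2 * fold] = torch.roll(x[:, fold:2 * fold], shifts=-1, dims=2)   # shift backward
+     return out
+
+ shifted = slice_shift(torch.randn(1, 32, 16, 64, 64))
+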
+
+
+
+
+ + ☆ Multi-View Vertebra Localization and Identification from CT Images MICCAI 2023 + + +
+ Accurately localizing and identifying vertebrae from CT images is crucial for +various clinical applications. However, most existing efforts operate on +3D cropped patches, suffering from large computation costs +and limited global information. In this paper, we propose a multi-view vertebra +localization and identification method for CT images, converting the 3D problem into +a 2D localization and identification task on different views. Without the +limitation of the 3D cropped patch, our method can learn the multi-view global +information naturally. Moreover, to better capture the anatomical structure +information from different view perspectives, a multi-view contrastive learning +strategy is developed to pre-train the backbone. Additionally, we further +propose a Sequence Loss to maintain the sequential structure embedded along the +vertebrae. Evaluation results demonstrate that, with only two 2D networks, our +method can localize and identify vertebrae in CT images accurately, and +outperforms the state-of-the-art methods consistently. Our code is available at +https://github.com/ShanghaiTech-IMPACT/Multi-View-Vertebra-Localization-and-Identification-from-CT-Images. +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ☆ EPIC-KITCHENS-100 Unsupervised Domain Adaptation Challenge: Mixed + Sequences Prediction + + +
+ This report presents the technical details of our approach for the +EPIC-Kitchens-100 Unsupervised Domain Adaptation (UDA) Challenge in Action +Recognition. Our approach is based on the idea that the order in which actions +are performed is similar between the source and target domains. Based on this, +we generate a modified sequence by randomly combining actions from the source +and target domains. As only unlabelled target data are available under the UDA +setting, we use a standard pseudo-labeling strategy for extracting action +labels for the target. We then ask the network to predict the resulting action +sequence. This allows the network to integrate information from both domains during +training and to achieve better transfer results on the target domain. Additionally, to +better incorporate sequence information, we use a language model to filter +unlikely sequences. Lastly, we employ a co-occurrence matrix to eliminate +unseen combinations of verbs and nouns. Our submission, labeled as 'sshayan', +can be found on the leaderboard, where it currently holds the 2nd position for +'verb' and the 4th position for both 'noun' and 'action'. +
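+
+ A minimal sketch of the sequence-mixing idea: actions from the labelled source domain are
+ randomly interleaved with pseudo-labelled target actions to form the sequence the network must
+ predict (function names and the mixing probability are illustrative assumptions, not the
+ challenge code):
+
+ import random
+
+ def build_mixed_sequence(source_actions, target_pseudo_actions, length=8, p_source=0.5):
+     sequence, domains = [], []
+     for _ in range(length):
+         if random.random() < p_source:
+             sequence.append(random.choice(source_actions)); domains.append("source")
+         else:
+             sequence.append(random.choice(target_pseudo_actions)); domains.append("target")
+     return sequence, domains
+
+ seq, doms = build_mixed_sequence(["cut onion", "wash pan"], ["open fridge", "stir pot"])
+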
+
+ comment: 2nd place in the 2023 EPIC-KITCHENS-100 Unsupervised Domain + Adaptation Challenge for Action Recognition +
+
+
+
+
+ + ☆ Learning Provably Robust Estimators for Inverse Problems via Jittering + + +
+ Deep neural networks provide excellent performance for inverse problems such +as denoising. However, neural networks can be sensitive to adversarial or +worst-case perturbations. This raises the question of whether such networks can +be trained efficiently to be worst-case robust. In this paper, we investigate +whether jittering, a simple regularization technique that adds isotropic +Gaussian noise during training, is effective for learning worst-case robust +estimators for inverse problems. While well studied for prediction in +classification tasks, the effectiveness of jittering for inverse problems has +not been systematically investigated. In this paper, we present a novel +analytical characterization of the optimal $\ell_2$-worst-case robust estimator +for linear denoising and show that jittering yields optimal robust denoisers. +Furthermore, we examine jittering empirically via training deep neural networks +(U-nets) for natural image denoising, deconvolution, and accelerated magnetic +resonance imaging (MRI). The results show that jittering significantly enhances +the worst-case robustness, but can be suboptimal for inverse problems beyond +denoising. Moreover, our results imply that training on real data, which often +contains slight noise, somewhat enhances robustness. +
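+
+ A minimal sketch of jittering for a learned denoiser: extra isotropic Gaussian noise is added on
+ top of the simulated measurement noise during training while the target stays clean (noise
+ levels and the toy network are illustrative assumptions):
+
+ import torch
+ import torch.nn as nn
+
+ def jittered_step(denoiser, clean, noise_level=0.1, jitter_level=0.05):
+     measurement = clean + noise_level * torch.randn_like(clean)       # simulated noisy input
+     jittered = measurement + jitter_level * torch.randn_like(clean)   # jittering regularization
+     return nn.functional.mse_loss(denoiser(jittered), clean)          # still regress to the clean target
+
+ denoiser = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 1, 3, padding=1))
+ loss = jittered_step(denoiser, torch.rand(4, 1, 32, 32))
+ loss.backward()
+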
+
+
+
+
+ + ☆ Exposing the Troublemakers in Described Object Detection + + +
+ Detecting objects based on language descriptions is a popular task that +includes Open-Vocabulary object Detection (OVD) and Referring Expression +Comprehension (REC). In this paper, we advance them to a more practical setting +called Described Object Detection (DOD) by expanding category names to flexible +language expressions for OVD and overcoming the limitation of REC to only +grounding the pre-existing object. We establish the research foundation for DOD +tasks by constructing a Description Detection Dataset ($D^3$), featuring +flexible language expressions and annotating all described objects without +omission. By evaluating previous SOTA methods on $D^3$, we find some +troublemakers that fail current REC, OVD, and bi-functional methods. REC +methods struggle with confidence scores, rejecting negative instances, and +multi-target scenarios, while OVD methods face constraints with long and +complex descriptions. Recent bi-functional methods also do not work well on DOD +due to their separated training procedures and inference strategies for REC and +OVD tasks. Building upon the aforementioned findings, we propose a baseline +that largely improves REC methods by reconstructing the training data and +introducing a binary classification sub-task, outperforming existing methods. +Data and code are available at https://github.com/shikras/d-cube. +
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution + for Medical Image Classification + + +
+ Graph-based neural network models are gaining traction in the field of +representation learning due to their ability to uncover latent topological +relationships between entities that are otherwise challenging to identify. +These models have been employed across a diverse range of domains, encompassing +drug discovery, protein interactions, semantic segmentation, and fluid dynamics +research. In this study, we investigate the potential of Graph Neural Networks +(GNNs) for medical image classification. We introduce a novel model that +combines GNNs and edge convolution, leveraging the interconnectedness of RGB +channel feature values to strongly represent connections between crucial graph +nodes. Our proposed model not only performs on par with state-of-the-art Deep +Neural Networks (DNNs) but does so with 1000 times fewer parameters, resulting +in reduced training time and data requirements. We compare our Graph +Convolutional Neural Network (GCNN) to pre-trained DNNs for classifying +MedMNIST dataset classes, revealing promising prospects for GNNs in medical +image analysis. Our results also encourage further exploration of advanced +graph-based models such as Graph Attention Networks (GAT) and Graph +Auto-Encoders in the medical imaging domain. The proposed model yields more +reliable, interpretable, and accurate outcomes for tasks like semantic +segmentation and image classification compared to simpler GCNNs. +
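+
+ A from-scratch sketch of edge convolution on node features (a DGCNN-style formulation, written
+ in plain PyTorch to stay self-contained; the paper's exact architecture and its feature
+ construction from RGB channel values are not reproduced here):
+
+ import torch
+ import torch.nn as nn
+
+ class SimpleEdgeConv(nn.Module):
+     def __init__(self, in_dim: int, out_dim: int):
+         super().__init__()
+         self.mlp = nn.Sequential(nn.Linear(2 * in_dim, out_dim), nn.ReLU())
+
+     def forward(self, x, edge_index):
+         # x: (N, in_dim) node features; edge_index: (2, E), rows are (source j, target i)
+         src, dst = edge_index
+         messages = self.mlp(torch.cat([x[dst], x[src] - x[dst]], dim=-1))   # per-edge messages
+         out = torch.full((x.size(0), messages.size(-1)), float("-inf"))
+         out = out.scatter_reduce(0, dst.unsqueeze(-1).expand_as(messages), messages, reduce="amax")
+         return out.nan_to_num(neginf=0.0)   # nodes with no incoming edges get zeros
+
+ conv = SimpleEdgeConv(3, 16)
+ h = conv(torch.randn(5, 3), torch.tensor([[1, 2, 3, 4], [0, 0, 1, 1]]))
+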
+
+
+
+
+ + ☆ Is attention all you need in medical image analysis? A review + + +
+ Medical imaging is a key component in clinical diagnosis, treatment planning +and clinical trial design, accounting for almost 90% of all healthcare data. +CNNs have achieved performance gains in medical image analysis (MIA) over recent +years. CNNs can efficiently model local pixel interactions and be trained on +small-scale MI data. The main disadvantage of typical CNN models is that they +ignore global pixel relationships within images, which limits their +generalisation ability to understand out-of-distribution data with different +'global' information. The recent progress of Artificial Intelligence gave rise +to Transformers, which can learn global relationships from data. However, full +Transformer models need to be trained on large-scale data and involve +tremendous computational complexity. Attention and Transformer compartments +(Transf/Attention), which can well maintain properties for modelling global +relationships, have been proposed as lighter alternatives to full Transformers. +Recently, there has been an increasing trend to cross-pollinate complementary +local-global properties from CNN and Transf/Attention architectures, which has led +to a new era of hybrid models. The past years have witnessed substantial growth +in hybrid CNN-Transf/Attention models across diverse MIA problems. In this +systematic review, we survey existing hybrid CNN-Transf/Attention models, +review and unravel key architectural designs, analyse breakthroughs, and +evaluate current and future opportunities as well as challenges. We also +introduce a comprehensive analysis framework on generalisation opportunities +of scientific and clinical impact, based on which new data-driven domain +generalisation and adaptation methods can be stimulated. +
+
+
+
+
+ + ☆ Fast Full-frame Video Stabilization with Iterative Optimization ICCV2023 + + +
+ Video stabilization refers to the problem of transforming a shaky video into +a visually pleasing one. The question of how to strike a good trade-off between +visual quality and computational speed has remained one of the open challenges +in video stabilization. Inspired by the analogy between wobbly frames and +jigsaw puzzles, we propose an iterative optimization-based learning approach +using synthetic datasets for video stabilization, which consists of two +interacting submodules: motion trajectory smoothing and full-frame outpainting. +First, we develop a two-level (coarse-to-fine) stabilizing algorithm based on +the probabilistic flow field. The confidence map associated with the estimated +optical flow is exploited to guide the search for shared regions through +backpropagation. Second, we take a divide-and-conquer approach and propose a +novel multiframe fusion strategy to render full-frame stabilized views. An +important new insight brought about by our iterative optimization approach is +that the target video can be interpreted as the fixed point of nonlinear +mapping for video stabilization. We formulate video stabilization as a problem +of minimizing the amount of jerkiness in motion trajectories, which guarantees +convergence with the help of fixed-point theory. Extensive experimental results +are reported to demonstrate the superiority of the proposed approach in terms +of computational speed and visual quality. The code will be available on +GitHub. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ LiDAR Meta Depth Completion IROS 2023 + + +
+ Depth estimation is one of the essential tasks to be addressed when creating +mobile autonomous systems. While monocular depth estimation methods have +improved in recent times, depth completion provides more accurate and reliable +depth maps by additionally using sparse depth information from other sensors +such as LiDAR. However, current methods are specifically trained for a single +LiDAR sensor. As the scanning pattern differs between sensors, every new sensor +would require re-training a specialized depth completion model, which is +computationally inefficient and not flexible. Therefore, we propose to +dynamically adapt the depth completion model to the sensor type used, enabling +LiDAR-adaptive depth completion. Specifically, we propose a meta depth +completion network that uses patterns derived from the input data to learn a +task network, which alters the weights of the main depth completion network to solve a +given depth completion task effectively. The method demonstrates a strong +capability to work on multiple LiDAR scanning patterns and can also generalize +to scanning patterns that are unseen during training. While using a single +model, our method yields significantly better results than a non-adaptive +baseline trained on different LiDAR patterns. It outperforms LiDAR-specific +expert models for very sparse cases. These advantages allow flexible deployment +of a single depth completion model on different sensors, which could also prove +valuable to process the input of nascent LiDAR technology with adaptive instead +of fixed scanning patterns. +
+
+ comment: Accepted at IROS 2023 +
+
+
+
+
+ + ☆ ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised + Real-world Single Image Super-Resolution + + +
+ Single image super-resolution (SISR) is a challenging ill-posed problem that +aims to up-sample a given low-resolution (LR) image to a high-resolution (HR) +counterpart. Due to the difficulty in obtaining real LR-HR training pairs, +recent approaches are trained on simulated LR images degraded by simplified +down-sampling operators, e.g., bicubic. Such an approach can be problematic in +practice because of the large gap between the synthesized and real-world LR +images. To alleviate the issue, we propose a novel Invertible scale-Conditional +Function (ICF), which can scale an input image and then restore the original +input with different scale conditions. By leveraging the proposed ICF, we +construct a novel self-supervised SISR framework (ICF-SRSR) to handle the +real-world SR task without using any paired/unpaired training data. +Furthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs, +which can make existing supervised SISR networks more robust. Extensive +experiments demonstrate the effectiveness of the proposed method in handling +SISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior +performance compared to the existing methods trained on synthetic paired images +in real-world scenarios and exhibits comparable performance compared to +state-of-the-art supervised/unsupervised methods on public benchmark datasets. + +
+
+
+
+
+ + ☆ CLIP-KD: An Empirical Study of Distilling CLIP Models + + +
+ CLIP has become a promising language-supervised visual pre-training framework +and achieves excellent performance over a wide range of tasks. This paper aims +to distill small CLIP models supervised by a large teacher CLIP model. We +propose several distillation strategies, including relation, feature, gradient, +and contrastive paradigms, to examine their impact on CLIP distillation. We show +that the simplest feature mimicry with an MSE loss performs best. Moreover, +interactive contrastive learning and relation-based distillation are also +critical for performance improvement. We apply the unified method to distill +several student networks trained on 15 million (image, text) pairs. +Distillation improves the student CLIP models consistently on zero-shot +ImageNet classification and cross-modal retrieval benchmarks. We hope our +empirical study will become an important baseline for future CLIP distillation +research. The code is available at \url{https://github.com/winycg/CLIP-KD}. +
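+
+ A minimal sketch of the feature-mimicry objective reported above to work best: the student's
+ projected embedding regresses the frozen teacher's embedding with an MSE loss (the linear
+ projection to match widths is an illustrative assumption):
+
+ import torch
+ import torch.nn.functional as F
+
+ proj = torch.nn.Linear(256, 512)                 # map student width to teacher width (assumption)
+
+ def feature_mimicry_loss(student_feat, teacher_feat):
+     return F.mse_loss(proj(student_feat), teacher_feat.detach())   # teacher is frozen
+
+ loss = feature_mimicry_loss(torch.randn(8, 256), torch.randn(8, 512))
+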
+
+
+
+
+ + ☆ COCO-O: A Benchmark for Object Detectors under Natural Distribution + Shifts ICCV2023 + + +
+ Practical object detection applications can lose their effectiveness on image +inputs with natural distribution shifts. This problem leads the research +community to pay more attention to the robustness of detectors under +Out-Of-Distribution (OOD) inputs. Existing works construct datasets to +benchmark the detector's OOD robustness for a specific application scenario, +e.g., Autonomous Driving. However, these datasets lack universality and are +hard to use for benchmarking general detectors built on common tasks such as COCO. To give +a more comprehensive robustness assessment, we introduce +COCO-O(ut-of-distribution), a test dataset based on COCO with 6 types of +natural distribution shifts. COCO-O has a large distribution gap with training +data and results in a significant 55.7% relative performance drop on a Faster +R-CNN detector. We leverage COCO-O to conduct experiments on more than 100 +modern object detectors to investigate if their improvements are credible or +just over-fitting to the COCO test set. Unfortunately, most classic detectors +from earlier years do not exhibit strong OOD generalization. We further study the +robustness effect of recent breakthroughs in detector architecture design, +augmentation and pre-training techniques. Some empirical findings are revealed: +1) Compared with the detection head or neck, the backbone is the most important part +for robustness; 2) An end-to-end detection transformer design brings no +enhancement, and may even reduce robustness; 3) Large-scale foundation models +have made a great leap on robust object detection. We hope our COCO-O could +provide a rich testbed for robustness studies of object detection. The dataset +will be available at +\url{https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o}. +
+
+ comment: To appear in ICCV2023, + https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o +
+
+
+
+
+ + ☆ Persistent-Transient Duality: A Multi-mechanism Approach for Modeling + Human-Object Interaction ICCV 2023 + + +
+ Humans are highly adaptable, swiftly switching between different modes to +progressively handle different tasks, situations and contexts. In Human-object +interaction (HOI) activities, these modes can be attributed to two mechanisms: +(1) the large-scale consistent plan for the whole activity and (2) the +small-scale children interactive actions that start and end along the timeline. +While neuroscience and cognitive science have confirmed this multi-mechanism +nature of human behavior, machine modeling approaches for human motion are +trailing behind. While they have attempted to use gradually morphing structures (e.g., +graph attention networks) to model the dynamic HOI patterns, they miss the +expeditious and discrete mode-switching nature of human motion. To bridge +that gap, this work proposes to model two concurrent mechanisms that jointly +control human motion: the Persistent process that runs continually on the +global scale, and the Transient sub-processes that operate intermittently on +the local context of the human while interacting with objects. These two +mechanisms form an interactive Persistent-Transient Duality that +synergistically governs the activity sequences. We model this conceptual +duality by a parent-child neural network of Persistent and Transient channels +with a dedicated neural module for dynamic mechanism switching. The framework +is trialed on HOI motion forecasting. On two rich datasets and a wide variety +of settings, the model consistently delivers superior performances, proving its +suitability for the challenge. +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ AMAE: Adaptation of Pre-Trained Masked Autoencoder for Dual-Distribution + Anomaly Detection in Chest X-Rays MICCAI 2023 + + +
+ Unsupervised anomaly detection in medical images such as chest radiographs is +stepping into the spotlight as it mitigates the scarcity of the labor-intensive +and costly expert annotation of anomaly data. However, nearly all existing +methods are formulated as one-class classification trained only on +representations from the normal class and discard a potentially significant +portion of the unlabeled data. This paper focuses on a more practical setting, +dual distribution anomaly detection for chest X-rays, using the entire training +data, including both normal and unlabeled images. Inspired by a modern +self-supervised vision transformer model trained using partial image inputs to +reconstruct missing image regions, we propose AMAE, a two-stage algorithm for +adaptation of the pre-trained masked autoencoder (MAE). Starting from MAE +initialization, AMAE first creates synthetic anomalies from only normal +training images and trains a lightweight classifier on frozen transformer +features. Subsequently, we propose an adaptation strategy to leverage unlabeled +images containing anomalies. The adaptation scheme is accomplished by assigning +pseudo-labels to unlabeled images and using two separate MAE based modules to +model the normative and anomalous distributions of pseudo-labeled images. The +effectiveness of the proposed adaptation strategy is evaluated with different +anomaly ratios in an unlabeled training set. AMAE leads to consistent +performance gains over competing self-supervised and dual distribution anomaly +detection methods, setting the new state-of-the-art on three public chest X-ray +benchmarks: RSNA, NIH-CXR, and VinDr-CXR. +
+
+ comment: To be presented at MICCAI 2023 +
+
+
+
+
+ + ☆ CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle + Components + + +
+ Neural Radiance Fields (NeRFs) have gained widespread recognition as a highly +effective technique for representing 3D reconstructions of objects and scenes +derived from sets of images. Despite their efficiency, NeRF models can pose +challenges in certain scenarios such as vehicle inspection, where the lack of +sufficient data or the presence of challenging elements (e.g. reflections) +strongly impact the accuracy of the reconstruction. To this aim, we introduce +CarPatch, a novel synthetic benchmark of vehicles. In addition to a set of +images annotated with their intrinsic and extrinsic camera parameters, the +corresponding depth maps and semantic segmentation masks have been generated +for each view. Global and part-based metrics have been defined and used to +evaluate, compare, and better characterize some state-of-the-art techniques. +The dataset is publicly released at +https://aimagelab.ing.unimore.it/go/carpatch and can be used as an evaluation +guide and as a baseline for future work on this challenging topic. + +
+
+ comment: Accepted at ICIAP2023 +
+
+
+
+
+ + ☆ Dense Transformer based Enhanced Coding Network for Unsupervised Metal + Artifact Reduction + + +
+ CT images corrupted by metal artifacts have serious negative effects on +clinical diagnosis. Considering the difficulty of collecting paired data with +ground truth in clinical settings, unsupervised methods for metal artifact +reduction are of high interest. However, it is difficult for previous +unsupervised methods to retain structural information from CT images while +handling the non-local characteristics of metal artifacts. To address these +challenges, we propose a novel Dense Transformer based Enhanced Coding Network +(DTEC-Net) for unsupervised metal artifact reduction. Specifically, we +introduce a Hierarchical Disentangling Encoder, supported by a high-order +dense process and a transformer, to obtain densely encoded sequences with +long-range correspondence. Then, we present a second-order disentanglement +method to improve the dense sequence's decoding process. Extensive experiments +and model discussions illustrate DTEC-Net's effectiveness, which outperforms +the previous state-of-the-art methods on a benchmark dataset, and greatly +reduces metal artifacts while restoring richer texture details. +
+
+
+
+
+ + MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised + Learning of Motion and Content Features + + +
+ Self-supervised learning of visual representations has been focusing on +learning content features, which do not capture object motion or location but +rather focus on identifying and differentiating objects in images and videos. On +the other hand, optical flow estimation is a task that does not involve +understanding the content of the images on which it is estimated. We unify the +two approaches and introduce MC-JEPA, a joint-embedding predictive architecture +and self-supervised learning approach to jointly learn optical flow and content +features within a shared encoder, demonstrating that the two associated +objectives, the optical flow estimation objective and the self-supervised +learning objective, benefit from each other and thus learn content features +that incorporate motion information. The proposed approach achieves performance +on par with existing methods on unsupervised optical flow benchmarks, as well as with +common self-supervised learning approaches on downstream tasks such as semantic +segmentation of images and videos. +
+
+
+
+
+ + ☆ Damage Vision Mining Opportunity for Imbalanced Anomaly Detection + + +
+ In the past decade, balanced datasets have been used to advance +algorithms for classification, object detection, semantic segmentation, and +anomaly detection in industrial applications. Specifically, for condition-based +maintenance, automating visual inspection is crucial to ensure high quality. +Deterioration prognostics attempts to optimize the fine decision process for +predictive maintenance and proactive repair. In civil infrastructure and living +environments, damage data mining cannot avoid the imbalanced data issue, because +unseen events are rare and improved operations keep the quality status high. For +visual inspection, the deteriorated classes acquired from the surfaces of concrete and +steel components are occasionally imbalanced. From numerous related surveys, we +summarize that imbalanced data problems can be categorized into four types: 1) +missing range of target and label variables, 2) majority-minority class +imbalance, 3) foreground-background spatial imbalance, 4) long-tailed class +pixel-wise imbalance. Since 2015, there have been many studies on imbalanced data +using deep learning approaches, including regression, image classification, +object detection, and semantic segmentation. However, anomaly detection for +imbalanced data is not yet well established. In this study, we highlight the one-class +anomaly detection application, which decides whether a class is anomalous or not, and demonstrate +clear examples on imbalanced vision datasets: wooden and concrete deterioration, +and disaster damage. We provide key results on the advantage of damage vision mining, +hypothesizing that the more effective the range of the positive ratio, the higher the +accuracy gain of the anomaly detection application. Finally, the applicability of +the damage learning methods, limitations, and future works are mentioned. +
+
+ comment: 12 pages, 14 figures, 8 tables +
+
+
+
+
+ + ☆ Industrial Segment Anything -- a Case Study in Aircraft Manufacturing, + Intralogistics, Maintenance, Repair, and Overhaul + + +
+ Deploying deep learning-based applications in specialized domains like the +aircraft production industry typically suffers from the training data +availability problem. Only a few datasets represent non-everyday objects, +situations, and tasks. Recent advances in research around Vision Foundation +Models (VFMs) have opened a new area of tasks and models with high generalization +capabilities in non-semantic and semantic predictions. As recently demonstrated +by the Segment Anything Project, exploiting VFMs' zero-shot capabilities is a +promising direction for tackling the boundaries spanned by data, context, and +sensor variety. However, investigating their application within specific domains +is still subject to ongoing research. This paper contributes by surveying +applications of SAM in aircraft production-specific use cases. We include +manufacturing, intralogistics, as well as maintenance, repair, and overhaul +processes, also representing a variety of other neighboring industrial domains. +Besides presenting the various use cases, we further discuss the injection of +domain knowledge. +
+
+
+
+
+ + ☆ Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked + Image Modeling + + +
+ In dynamic Magnetic Resonance Imaging (MRI), k-space is typically +undersampled due to limited scan time, resulting in aliasing artifacts in the +image domain. Hence, dynamic MR reconstruction requires not only modeling +spatial frequency components in the x and y directions of k-space but also +considering temporal redundancy. Most previous works rely on image-domain +regularizers (priors) to conduct MR reconstruction. In contrast, we focus on +interpolating the undersampled k-space before obtaining images with Fourier +transform. In this work, we connect masked image modeling with k-space +interpolation and propose a novel Transformer-based k-space Global +Interpolation Network, termed k-GIN. Our k-GIN learns global dependencies among +low- and high-frequency components of 2D+t k-space and uses it to interpolate +unsampled data. Further, we propose a novel k-space Iterative Refinement Module +(k-IRM) to enhance the high-frequency components learning. We evaluate our +approach on 92 in-house 2D+t cardiac MR subjects and compare it to MR +reconstruction methods with image-domain regularizers. Experiments show that +our proposed k-space interpolation method quantitatively and qualitatively +outperforms baseline methods. Importantly, the proposed approach achieves +substantially higher robustness and generalizability in cases of +highly-undersampled MR data. + +
+
+
+
+
+ + ☆ A Theoretically Guaranteed Quaternion Weighted Schatten p-norm + Minimization Method for Color Image Restoration + + +
+ Inspired by the fact that the matrix formed by nonlocal similar patches +in a natural image is of low rank, the low-rank approximation issue has been +extensively investigated over the past decades, among which weighted nuclear +norm minimization (WNNM) and weighted Schatten $p$-norm minimization (WSNM) are +two prevailing methods that have shown great superiority in various image +restoration (IR) problems. Due to the physical characteristics of color images, +color image restoration (CIR) is often a much more difficult task than its +grayscale image counterpart. However, when applied to CIR, the traditional +WNNM/WSNM method only processes the three color channels individually and fails to +consider their cross-channel correlations. Very recently, a quaternion-based +WNNM approach (QWNNM) has been developed to mitigate this issue, which is +capable of representing the color image as a whole in the quaternion domain and +preserving the inherent correlation among the three color channels. Despite its +empirical success, unfortunately, the convergence behavior of QWNNM has not +been strictly studied yet. In this paper, on the one hand, we extend WSNM +into the quaternion domain and correspondingly propose a novel quaternion-based +WSNM model (QWSNM) for tackling the CIR problems. Extensive experiments on two +representative CIR tasks, including color image denoising and deblurring, +demonstrate that the proposed QWSNM method performs favorably against many +state-of-the-art alternatives, in both quantitative and qualitative +evaluations. On the other hand, more importantly, we provide a preliminary +theoretical convergence analysis: by modifying the quaternion +alternating direction method of multipliers (QADMM) through a simple +continuation strategy, we theoretically prove that the solution sequences +generated by both QWNNM and QWSNM have fixed-point convergence guarantees. +
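+
+ For intuition, a minimal real-valued sketch of the proximal step behind these models, weighted
+ singular value thresholding (the p = 1 special case of the weighted Schatten p-norm, with
+ WNNM-style weights; the paper itself operates on quaternion matrices, which is not reproduced
+ here):
+
+ import torch
+
+ def weighted_svt(Y: torch.Tensor, c: float = 0.5, eps: float = 1e-6) -> torch.Tensor:
+     U, S, Vh = torch.linalg.svd(Y, full_matrices=False)
+     w = c / (S + eps)                          # larger weights shrink smaller singular values more
+     S_shrunk = torch.clamp(S - w, min=0.0)     # weighted soft-thresholding of singular values
+     return U @ torch.diag(S_shrunk) @ Vh
+
+ low_rank = weighted_svt(torch.randn(64, 48))
+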
+
+ comment: 46 pages, 10 figures; references added +
+
+
+
+
+ + ☆ Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation + of rPPG + + +
+ Remote Photoplethysmography (rPPG) is a technology that utilizes the light +absorption properties of hemoglobin, captured via camera, to analyze and +measure blood volume pulse (BVP). By analyzing the measured BVP, various +physiological signals such as heart rate, stress levels, and blood pressure can +be derived, enabling applications such as the early prediction of +cardiovascular diseases. rPPG is a rapidly evolving field as it allows the +measurement of vital signals using camera-equipped devices without the need for +additional devices such as blood pressure monitors or pulse oximeters, and +without the assistance of medical experts. Despite extensive efforts and +advances in this field, serious challenges remain, including issues related to +skin color, camera characteristics, ambient lighting, and other sources of +noise, which degrade performance accuracy. We argue that fair and evaluable +benchmarking is urgently required to overcome these challenges and make any +meaningful progress from both academic and commercial perspectives. In most +existing work, models are trained, tested, and validated only on limited +datasets. Worse still, some studies lack available code or reproducibility, +making it difficult to fairly evaluate and compare performance. Therefore, the +purpose of this study is to provide a benchmarking framework to evaluate +various rPPG techniques across a wide range of datasets for fair evaluation and +comparison, including both conventional non-deep neural network (non-DNN) and +deep neural network (DNN) methods. GitHub URL: +https://github.com/remotebiosensing/rppg. + +
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ PG-RCNN: Semantic Surface Point Generation for 3D Object Detection ICCV 2023 + + +
+ One of the main challenges in LiDAR-based 3D object detection is that the +sensors often fail to capture the complete spatial information about the +objects due to long distance and occlusion. Two-stage detectors with point +cloud completion approaches tackle this problem by adding more points to the +regions of interest (RoIs) with a pre-trained network. However, these methods +generate dense point clouds of objects for all region proposals, assuming that +objects always exist in the RoIs. This leads to the indiscriminate point +generation for incorrect proposals as well. Motivated by this, we propose Point +Generation R-CNN (PG-RCNN), a novel end-to-end detector that generates semantic +surface points of foreground objects for accurate detection. Our method uses a +jointly trained RoI point generation module to process the contextual +information of RoIs and estimate the complete shape and displacement of +foreground objects. For every generated point, PG-RCNN assigns a semantic +feature that indicates the estimated foreground probability. Extensive +experiments show that the point clouds generated by our method provide +geometrically and semantically rich information for refining false positive and +misaligned proposals. PG-RCNN achieves competitive performance on the KITTI +benchmark, with significantly fewer parameters than state-of-the-art models. +The code is available at https://github.com/quotation2520/PG-RCNN. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Automatic lobe segmentation using attentive cross entropy and end-to-end + fissure generation + + +
+ The automatic lung lobe segmentation algorithm is of great significance for +the diagnosis and treatment of lung diseases; however, it remains challenging +due to the incompleteness of pulmonary fissures in lung CT images +and the large variability of pathological features. Therefore, we propose a new +automatic lung lobe segmentation framework, in which we urge the model to pay +attention to the area around the pulmonary fissure during the training process, +which is realized by a task-specific loss function. In addition, we introduce +an end-to-end pulmonary fissure generation method in the auxiliary pulmonary +fissure segmentation task, without any additional network branch. Finally, we +propose a registration-based loss function to alleviate the convergence +difficulty of the Dice-loss-supervised pulmonary fissure segmentation task. We +achieve 97.83% and 94.75% Dice scores on our private STLB dataset and the public +LUNA16 dataset, respectively. + +
+
+ comment: 5 pages, 3 figures, published to 'IEEE International Symposium on + Biomedical Imaging (ISBI) 2023' +
+
+
+
+
+ + ☆ Semi-Supervised Medical Image Segmentation with Co-Distribution + Alignment + + +
+ Medical image segmentation has made significant progress when a large amount +of labeled data are available. However, annotating medical image segmentation +datasets is expensive due to the requirement of professional skills. +Additionally, classes are often unevenly distributed in medical images, which +severely affects the classification performance on minority classes. To address +these problems, this paper proposes Co-Distribution Alignment (Co-DA) for +semi-supervised medical image segmentation. Specifically, Co-DA aligns marginal +predictions on unlabeled data to marginal predictions on labeled data in a +class-wise manner with two differently initialized models before using the +pseudo-labels generated by one model to supervise the other. Besides, we design +an over-expectation cross-entropy loss for filtering the unlabeled pixels to +reduce noise in their pseudo-labels. Quantitative and qualitative experiments +on three public datasets demonstrate that the proposed approach outperforms +existing state-of-the-art semi-supervised medical image segmentation methods on +both the 2D CaDIS dataset and the 3D LGE-MRI and ACDC datasets, achieving an +mIoU of 0.8515 with only 24% labeled data on CaDIS, and a Dice score of 0.8824 +and 0.8773 with only 20% data on LGE-MRI and ACDC, respectively. + +
+
+ comment: Paper appears in Bioengineering 2023, 10(7), 869 +
+
+
+
+
+ + ☆ Phase Match for Out-of-Distribution Generalization + + +
+ The Fourier transform, serving as an explicit decomposition method for visual +signals, has been employed to explain the out-of-distribution generalization +behaviors of Convolutional Neural Networks (CNNs). Previous research and +empirical studies have indicated that the amplitude spectrum plays a decisive +role in CNN recognition, but it is susceptible to disturbance caused by +distribution shifts. On the other hand, the phase spectrum preserves +highly-structured spatial information, which is crucial for visual +representation learning. In this paper, we aim to clarify the relationships +between Domain Generalization (DG) and the frequency components by introducing +a Fourier-based structural causal model. Specifically, we interpret the phase +spectrum as semi-causal factors and the amplitude spectrum as non-causal +factors. Building upon these observations, we propose Phase Match (PhaMa) to +address DG problems. Our method introduces perturbations on the amplitude +spectrum and establishes spatial relationships to match the phase components. +Through experiments on multiple benchmarks, we demonstrate that our proposed +method achieves state-of-the-art performance in domain generalization and +out-of-distribution robustness tasks. + +
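+
+ To make the amplitude/phase split concrete, a hedged sketch of perturbing the
+ Fourier amplitude while keeping the phase of an image; PhaMa's actual perturbation
+ and phase-matching modules are defined in the paper.
+
+ ```python
+ import numpy as np
+
+ def amplitude_perturb(img, ref, alpha=0.5):
+     """Mix the Fourier amplitude of `img` with that of `ref`, keeping img's phase."""
+     f_img = np.fft.fft2(img, axes=(0, 1))
+     f_ref = np.fft.fft2(ref, axes=(0, 1))
+     amp, phase = np.abs(f_img), np.angle(f_img)
+     amp_mix = (1 - alpha) * amp + alpha * np.abs(f_ref)   # perturb non-causal amplitude
+     f_new = amp_mix * np.exp(1j * phase)                  # keep (semi-causal) phase
+     return np.real(np.fft.ifft2(f_new, axes=(0, 1)))
+ ```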
+
+
+
+
+ + ☆ Sparse annotation strategies for segmentation of short axis cardiac MRI + + +
+ Short axis cardiac MRI segmentation is a well-researched topic, with +excellent results achieved by state-of-the-art models in a supervised setting. +However, annotating MRI volumes is time-consuming and expensive. Many different +approaches (e.g. transfer learning, data augmentation, few-shot learning, etc.) +have emerged in an effort to use fewer annotated data and still achieve similar +performance as a fully supervised model. Nevertheless, to the best of our +knowledge, none of these works focus on which slices of MRI volumes are most +important to annotate for yielding the best segmentation results. In this +paper, we investigate the effects of training with sparse volumes, i.e. +reducing the number of cases annotated, and sparse annotations, i.e. reducing +the number of slices annotated per case. We evaluate the segmentation +performance using the state-of-the-art nnU-Net model on two public datasets to +identify which slices are the most important to annotate. We have shown that +training on a significantly reduced dataset (48 annotated volumes) can give a +Dice score greater than 0.85 and results comparable to using the full dataset +(160 and 240 volumes for each dataset respectively). In general, training on +more slice annotations provides more valuable information compared to training +on more volumes. Further, annotating slices from the middle of volumes yields +the most beneficial results in terms of segmentation performance, and the +apical region the worst. When evaluating the trade-off between annotating +volumes against slices, annotating as many slices as possible instead of +annotating more volumes is a better strategy. + +
+
+
+
+
+ + ☆ Attribute Regularized Soft Introspective VAE: Towards Cardiac Attribute + Regularization Through MRI Domains + + +
+ Deep generative models have emerged as influential instruments for data +generation and manipulation. Enhancing the controllability of these models by +selectively modifying data attributes has been a recent focus. Variational +Autoencoders (VAEs) have shown promise in capturing hidden attributes but often +produce blurry reconstructions. Controlling these attributes through different +imaging domains is difficult in medical imaging. Recently, the Soft Introspective +VAE leverages the benefits of both VAEs and Generative Adversarial Networks +(GANs), which have demonstrated impressive image synthesis capabilities, by +incorporating an adversarial loss into VAE training. In this work, we propose +the Attributed Soft Introspective VAE (Attri-SIVAE) by incorporating an +attribute-regularized loss into the Soft-Intro VAE framework. We experimentally +evaluate the proposed method on cardiac MRI data from different domains, +such as various scanner vendors and acquisition centers. The proposed method +achieves similar performance in terms of reconstruction and regularization +compared to the state-of-the-art Attributed regularized VAE but additionally +also succeeds in keeping the same regularization level when tested on a +different dataset, unlike the compared method. + +
+
+
+
+
+ + ☆ CTVIS: Consistent Training for Online Video Instance Segmentation ICCV 2023 + + +
+ The discrimination of instance embeddings plays a vital role in associating +instances across time for online video instance segmentation (VIS). Instance +embedding learning is directly supervised by the contrastive loss computed upon +the contrastive items (CIs), which are sets of anchor/positive/negative +embeddings. Recent online VIS methods leverage CIs sourced from one reference +frame only, which we argue is insufficient for learning highly discriminative +embeddings. Intuitively, a possible strategy to enhance CIs is replicating the +inference phase during training. To this end, we propose a simple yet effective +training strategy, called Consistent Training for Online VIS (CTVIS), which +is devoted to aligning the training and inference pipelines in terms of building +CIs. Specifically, CTVIS constructs CIs by referring to the inference pipeline, namely the +momentum-averaged embeddings and the memory-bank storage mechanism, and by adding +noise to the relevant embeddings. Such an extension allows a reliable +comparison between embeddings of current instances and the stable +representations of historical instances, thereby conferring an advantage in +modeling VIS challenges such as occlusion, re-identification, and deformation. +Empirically, CTVIS outstrips the SOTA VIS models by up to +5.0 points on three +VIS benchmarks, including YTVIS19 (55.1% AP), YTVIS21 (50.1% AP) and OVIS +(35.5% AP). Furthermore, we find that pseudo-videos transformed from images can +train robust models surpassing fully-supervised ones. + +
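+
+ A rough illustration of the momentum-averaged embeddings and memory bank referred
+ to above; CTVIS's actual construction of contrastive items (including the noise
+ injection) follows the paper.
+
+ ```python
+ import collections
+ import numpy as np
+
+ class InstanceMemory:
+     """Toy memory bank keeping an EMA embedding per tracked instance."""
+
+     def __init__(self, momentum=0.9, maxlen=10):
+         self.momentum = momentum
+         self.ema = {}                                   # instance id -> EMA embedding
+         self.bank = collections.defaultdict(lambda: collections.deque(maxlen=maxlen))
+
+     def update(self, instance_id, embedding):
+         e = np.asarray(embedding, dtype=float)
+         if instance_id in self.ema:
+             m = self.momentum
+             self.ema[instance_id] = m * self.ema[instance_id] + (1 - m) * e
+         else:
+             self.ema[instance_id] = e
+         self.bank[instance_id].append(e)                # keep history for contrastive items
+         return self.ema[instance_id]
+ ```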
+
+ comment: Accepted by ICCV 2023. The code is available at + https://github.com/KainingYing/CTVIS +
+
+
+
+
+ + ☆ Less is More: Focus Attention for Efficient DETR ICCV2023 + + +
+ DETR-like models have significantly boosted the performance of detectors and +even outperformed classical convolutional models. However, treating all tokens +equally without discrimination brings a redundant computational burden +in the traditional encoder structure. Recent sparsification strategies +exploit a subset of informative tokens to reduce attention complexity while +maintaining performance through a sparse encoder. But these methods tend to +rely on unreliable model statistics. Moreover, simply reducing the token +population hinders the detection performance to a large extent, limiting the +application of these sparse models. We propose Focus-DETR, which focuses +attention on more informative tokens for a better trade-off between computation +efficiency and model accuracy. Specifically, we reconstruct the encoder with +dual attention, which includes a token scoring mechanism that considers both +localization and category semantic information of the objects from multi-scale +feature maps. We efficiently abandon the background queries and enhance the +semantic interaction of the fine-grained object queries based on the scores. +Compared with the state-of-the-art sparse DETR-like detectors under the same +setting, our Focus-DETR gets comparable complexity while achieving 50.4 AP +(+2.2) on COCO. The code is available at +https://github.com/huawei-noah/noah-research/tree/master/Focus-DETR and +https://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR. + +
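+
+ A toy sketch of the generic score-and-keep token sparsification idea mentioned
+ above; Focus-DETR's dual-attention scoring over multi-scale feature maps is more
+ involved and is described in the paper.
+
+ ```python
+ import numpy as np
+
+ def keep_informative_tokens(tokens, scores, keep_ratio=0.3):
+     """Keep only the highest-scoring encoder tokens.
+
+     tokens: (N, C) flattened tokens, scores: (N,) informativeness scores.
+     """
+     tokens = np.asarray(tokens)
+     scores = np.asarray(scores)
+     k = max(1, int(len(tokens) * keep_ratio))
+     idx = np.argsort(scores)[-k:]        # indices of the top-k tokens
+     return tokens[idx], idx
+ ```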
+
+ comment: 8 pages, 6 figures, accepted to ICCV2023 +
+
+
+
+
+ + ☆ ExWarp: Extrapolation and Warping-based Temporal Supersampling for + High-frequency Displays + + +
+ High-frequency displays are gaining immense popularity because of their +increasing use in video games and virtual reality applications. However, the +issue is that the underlying GPUs cannot continuously generate frames at this +high rate -- this results in a less smooth and responsive experience. +Furthermore, if the frame rate is not synchronized with the refresh rate, the +user may experience screen tearing and stuttering. Previous works propose +increasing the frame rate to provide a smooth experience on modern displays by +predicting new frames based on past or future frames. Interpolation and +extrapolation are two widely used algorithms that predict new frames. +Interpolation requires waiting for the future frame to make a prediction, which +adds additional latency. On the other hand, extrapolation provides a better +quality of experience because it relies solely on past frames -- it does not +incur any additional latency. The simplest method to extrapolate a frame is to +warp the previous frame using motion vectors; however, the warped frame may +contain improperly rendered visual artifacts due to dynamic objects -- this +makes it very challenging to design such a scheme. Past work has used DNNs to +get good accuracy, however, these approaches are slow. This paper proposes +Exwarp -- an approach based on reinforcement learning (RL) to intelligently +choose between the slower DNN-based extrapolation and faster warping-based +methods to increase the frame rate by 4x with an almost negligible reduction in +the perceived image quality. + +
+
+
+
+
+ + ☆ SwinMM: Masked Multi-view with Swin Transformers for 3D Medical Image + Segmentation MICCAI 2023 + + +
+ Recent advancements in large-scale Vision Transformers have made significant +strides in improving pre-trained models for medical image segmentation. +However, these methods face a notable challenge in acquiring a substantial +amount of pre-training data, particularly within the medical field. To address +this limitation, we present Masked Multi-view with Swin Transformers (SwinMM), +a novel multi-view pipeline for enabling accurate and data-efficient +self-supervised medical image analysis. Our strategy harnesses the potential of +multi-view information by incorporating two principal components. In the +pre-training phase, we deploy a masked multi-view encoder devised to +concurrently train masked multi-view observations through a range of diverse +proxy tasks. These tasks span image reconstruction, rotation, contrastive +learning, and a novel task that employs a mutual learning paradigm. This new +task capitalizes on the consistency between predictions from various +perspectives, enabling the extraction of hidden multi-view information from 3D +medical data. In the fine-tuning stage, a cross-view decoder is developed to +aggregate the multi-view information through a cross-attention block. Compared +with the previous state-of-the-art self-supervised learning method Swin UNETR, +SwinMM demonstrates a notable advantage on several medical image segmentation +tasks. It allows for a smooth integration of multi-view information, +significantly boosting both the accuracy and data-efficiency of the model. Code +and models are available at https://github.com/UCSC-VLAA/SwinMM/. + +
+
+ comment: MICCAI 2023; project page: https://github.com/UCSC-VLAA/SwinMM/ +
+
+
+
+
+ + ☆ SL: Stable Learning in Source-Free Domain Adaption for Medical Image + Segmentation + + +
+ Deep learning techniques for medical image analysis usually suffer from the +domain shift between source and target data. Most existing works focus on +unsupervised domain adaptation (UDA). However, in practical applications, +privacy issues are much more severe. For example, the data of different +hospitals have domain shifts due to equipment problems, and data of the two +domains cannot be made available simultaneously because of privacy. In this +challenge, defined as Source-Free UDA, previous UDA medical methods are +limited. Although a variety of medical source-free unsupervised domain adaptation +(MSFUDA) methods have been proposed, we found they fall into an over-fitting +dilemma called "longer training, worse performance." Therefore, we propose the +Stable Learning (SL) strategy to address the dilemma. SL is a scalable method +that can be integrated with other research; it consists of Weight +Consolidation and Entropy Increase. First, we apply Weight Consolidation to +retain domain-invariant knowledge, and then we design Entropy Increase to avoid +over-learning. Comparative experiments prove the effectiveness of SL, and we also +conduct extensive ablation experiments. Besides, we will release code +covering a variety of MSFUDA methods. + +
+
+
+
+
+ + ☆ PRIOR: Prototype Representation Joint Learning from Medical Images and + Reports ICCV 2023 + + +
+ Contrastive learning based vision-language joint pre-training has emerged as +a successful representation learning strategy. In this paper, we present a +prototype representation learning framework incorporating both global and local +alignment between medical images and reports. In contrast to standard global +multi-modality alignment methods, we employ a local alignment module for +fine-grained representation. Furthermore, a cross-modality conditional +reconstruction module is designed to interchange information across modalities +in the training phase by reconstructing masked images and reports. For +reconstructing long reports, a sentence-wise prototype memory bank is +constructed, enabling the network to focus on low-level localized visual and +high-level clinical linguistic features. Additionally, a non-auto-regressive +generation paradigm is proposed for reconstructing non-sequential reports. +Experimental results on five downstream tasks, including supervised +classification, zero-shot classification, image-to-text retrieval, semantic +segmentation, and object detection, show the proposed method outperforms other +state-of-the-art methods across multiple datasets and under different dataset +size settings. The code is available at https://github.com/QtacierP/PRIOR. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ A Good Student is Cooperative and Reliable: CNN-Transformer + Collaborative Learning for Semantic Segmentation ICCV 2023 + + +
+ In this paper, we strive to answer the question "how to collaboratively learn +convolutional neural network (CNN)-based and vision transformer (ViT)-based +models by selecting and exchanging the reliable knowledge between them for +semantic segmentation?" Accordingly, we propose an online knowledge +distillation (KD) framework that can simultaneously learn compact yet effective +CNN-based and ViT-based models with two key technical breakthroughs to take +full advantage of CNNs and ViT while compensating their limitations. Firstly, +we propose heterogeneous feature distillation (HFD) to improve students' +consistency in low-layer feature space by mimicking heterogeneous features +between CNNs and ViT. Secondly, to facilitate the two students to learn +reliable knowledge from each other, we propose bidirectional selective +distillation (BSD) that can dynamically transfer selective knowledge. This is +achieved by 1) region-wise BSD determining the directions of knowledge +transferred between the corresponding regions in the feature space and 2) +pixel-wise BSD discerning which of the prediction knowledge to be transferred +in the logit space. Extensive experiments on three benchmark datasets +demonstrate that our proposed framework outperforms the state-of-the-art online +distillation methods by a large margin, and shows its efficacy in learning +collaboratively between ViT-based and CNN-based models. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ MataDoc: Margin and Text Aware Document Dewarping for Arbitrary Boundary + + +
+ Document dewarping from a distorted camera-captured image is of great value +for OCR and document understanding. The document boundary plays an important +role which is more evident than the inner region in document dewarping. Current +learning-based methods mainly focus on complete boundary cases, leading to poor +document correction performance of documents with incomplete boundaries. In +contrast to these methods, this paper proposes MataDoc, the first method +focusing on arbitrary boundary document dewarping with margin and text aware +regularizations. Specifically, we design the margin regularization by +explicitly considering background consistency to enhance boundary perception. +Moreover, we introduce word position consistency to keep text lines straight in +rectified document images. To produce a comprehensive evaluation of MataDoc, we +propose a novel benchmark ArbDoc, mainly consisting of document images with +arbitrary boundaries in four typical scenarios. Extensive experiments confirm +the superiority of MataDoc with consideration for the incomplete boundary on +ArbDoc and also demonstrate the effectiveness of the proposed method on +DocUNet, DIR300, and WarpDoc datasets. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Interpolating between Images with Diffusion Models ICML 2023 + + +
+ One little-explored frontier of image generation and editing is the task of +interpolating between two input images, a feature missing from all currently +deployed image generation pipelines. We argue that such a feature can expand +the creative applications of such models, and propose a method for zero-shot +interpolation using latent diffusion models. We apply interpolation in the +latent space at a sequence of decreasing noise levels, then perform denoising +conditioned on interpolated text embeddings derived from textual inversion and +(optionally) subject poses. For greater consistency, or to specify additional +criteria, we can generate several candidates and use CLIP to select the highest +quality image. We obtain convincing interpolations across diverse subject +poses, image styles, and image content, and show that standard quantitative +metrics such as FID are insufficient to measure the quality of an +interpolation. Code and data are available at +https://clintonjwang.github.io/interpolation. + +
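+
+ One common ingredient for interpolating Gaussian latents is spherical linear
+ interpolation (slerp); a minimal sketch follows, noting that the paper additionally
+ interpolates at several noise levels and conditions on interpolated text embeddings.
+
+ ```python
+ import numpy as np
+
+ def slerp(z0, z1, t):
+     """Spherical interpolation between two latent vectors, t in [0, 1]."""
+     z0 = np.asarray(z0, dtype=float).ravel()
+     z1 = np.asarray(z1, dtype=float).ravel()
+     cos = np.dot(z0, z1) / (np.linalg.norm(z0) * np.linalg.norm(z1))
+     theta = np.arccos(np.clip(cos, -1.0, 1.0))
+     if theta < 1e-6:                      # nearly parallel: fall back to lerp
+         return (1 - t) * z0 + t * z1
+     return (np.sin((1 - t) * theta) * z0 + np.sin(t * theta) * z1) / np.sin(theta)
+ ```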
+
+ comment: Presented at ICML 2023 Workshop on Challenges of Deploying Generative + AI +
+
+
+
+
+ + ☆ Revisiting Event-based Video Frame Interpolation IROS2023 + + +
+ Dynamic vision sensors or event cameras provide rich complementary +information for video frame interpolation. Existing state-of-the-art methods +follow the paradigm of combining both synthesis-based and warping networks. +However, few of those methods fully respect the intrinsic characteristics of +event streams. Given that event cameras only encode intensity changes and +polarity rather than color intensities, estimating optical flow from events is +arguably more difficult than from RGB information. We therefore propose to +incorporate RGB information in an event-guided optical flow refinement +strategy. Moreover, in light of the quasi-continuous nature of the time signals +provided by event cameras, we propose a divide-and-conquer strategy in which +event-based intermediate frame synthesis happens incrementally in multiple +simplified stages rather than in a single, long stage. Extensive experiments on +both synthetic and real-world datasets show that these modifications lead to +more reliable and realistic intermediate frame results than previous video +frame interpolation methods. Our findings underline that a careful +consideration of event characteristics such as high temporal density and +elevated noise benefits interpolation accuracy. + +
+
+ comment: Accepted by IROS2023 Project Site: + https://jiabenchen.github.io/revisit_event +
+
+
+
+
+ + ☆ MFMAN-YOLO: A Method for Detecting Pole-like Obstacles in Complex + Environment + + +
+ In real-world traffic, there are various uncertainties and complexities in +road and weather conditions. To solve the problem that the feature information +of pole-like obstacles in complex environments is easily lost, resulting in low +detection accuracy and low real-time performance, a multi-scale hybrid +attention mechanism detection algorithm is proposed in this paper. First, the +optimal transport Monge-Kantorovich (MK) function is incorporated, not only to +resolve overlapping prediction boxes through optimal matching, but also because the +MK function can be regularized to prevent model +over-fitting; then, the features at different scales are up-sampled separately +according to the optimized efficient multi-scale feature pyramid. Finally, the +extraction of multi-scale feature space channel information is enhanced in +complex environments based on the hybrid attention mechanism, which suppresses +irrelevant complex-environment background information and focuses on the +feature information of pole-like obstacles. Meanwhile, this paper conducts real +road test experiments in a variety of complex environments. The experimental +results show that the detection precision, recall, and average precision of the +method are 94.7%, 93.1%, and 97.4%, respectively, and the detection frame rate +is 400 f/s. This method can detect pole-like obstacles in complex +road environments in real time and accurately, further promoting innovation +and progress in the field of autonomous driving. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Towards Video Anomaly Retrieval from Video Anomaly Detection: New + Benchmarks and Model + + +
+ Video anomaly detection (VAD) has received increasing attention due to its +potential applications; its current dominant tasks focus on detecting +anomalies online at the frame level, which can be roughly interpreted as binary +or multi-class event classification. However, such a setup that builds +relationships between complicated anomalous events and single labels, e.g., +``vandalism'', is superficial, since single labels are insufficient to +characterize anomalous events. In reality, users tend to search for a specific +video rather than a series of approximate videos. Therefore, retrieving +anomalous events using detailed descriptions is practical and valuable, but few +studies focus on this. In this context, we propose a novel task called Video +Anomaly Retrieval (VAR), which aims to pragmatically retrieve relevant +anomalous videos by cross-modal queries, e.g., language descriptions and +synchronized audio. Unlike current video retrieval, where videos are assumed +to be temporally well-trimmed and of short duration, VAR is devised to retrieve +long untrimmed videos which may be only partially relevant to the given query. To +achieve this, we present two large-scale VAR benchmarks, UCFCrime-AR and +XDViolence-AR, constructed on top of prevalent anomaly datasets. Meanwhile, we +design a model called Anomaly-Led Alignment Network (ALAN) for VAR. In ALAN, we +propose an anomaly-led sampling strategy to focus on key segments in long untrimmed +videos. Then, we introduce an efficient pretext task to enhance semantic +associations between fine-grained video-text representations. Besides, we +leverage two complementary alignments to further match cross-modal content. +Experimental results on the two benchmarks reveal the challenges of the VAR task and +also demonstrate the advantages of our tailored method. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Client-Level Differential Privacy via Adaptive Intermediary in Federated + Medical Imaging MICCAI'23 + + +
+ Despite recent progress in enhancing the privacy of federated learning (FL) +via differential privacy (DP), the trade-off of DP between privacy protection +and performance is still underexplored for real-world medical scenarios. In this +paper, we propose to optimize the trade-off under the context of client-level +DP, which focuses on privacy during communications. However, FL for medical +imaging typically involves much fewer participants (hospitals) than other +domains (e.g., mobile devices), thus ensuring that clients are differentially private +is much more challenging. To tackle this problem, we propose an adaptive +intermediary strategy to improve performance without harming privacy. +Specifically, we theoretically find that splitting clients into sub-clients, which +serve as intermediaries between hospitals and the server, can mitigate the +noise introduced by DP without harming privacy. Our proposed approach is +empirically evaluated on both classification and segmentation tasks using two +public datasets, and its effectiveness is demonstrated with significant +performance improvements and comprehensive analytical studies. Code is +available at: https://github.com/med-air/Client-DP-FL. + +
+
+ comment: Accepted by 26th International Conference on Medical Image Computing + and Computer Assisted Intervention (MICCAI'23) +
+
+
+
+
+ + ☆ SelFormaly: Towards Task-Agnostic Unified Anomaly Detection + + +
+ The core idea of visual anomaly detection is to learn the normality from +normal images, but previous works have been developed specifically for certain +tasks, leading to fragmentation among various tasks: defect detection, semantic +anomaly detection, multi-class anomaly detection, and anomaly clustering. This +one-task-one-model approach is resource-intensive and incurs high maintenance +costs as the number of tasks increases. This paper presents SelFormaly, a +universal and powerful anomaly detection framework. We emphasize the necessity +of our off-the-shelf approach by pointing out a suboptimal issue with +fluctuating performance in previous online encoder-based methods. In addition, +we question the effectiveness of using ConvNets as previously employed in the +literature and confirm that self-supervised ViTs are suitable for unified +anomaly detection. We introduce back-patch masking and discover the new role of +top k-ratio feature matching to achieve unified and powerful anomaly detection. +Back-patch masking eliminates irrelevant regions that possibly hinder +target-centric detection with representations of the scene layout. The top +k-ratio feature matching unifies various anomaly levels and tasks. Finally, +SelFormaly achieves state-of-the-art results across various datasets for all +the aforementioned tasks. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Towards Generalizable Deepfake Detection by Primary Region + Regularization + + +
+ The existing deepfake detection methods have reached a bottleneck in +generalizing to unseen forgeries and manipulation approaches. Based on the +observation that the deepfake detectors exhibit a preference for overfitting +the specific primary regions in input, this paper enhances the generalization +capability from a novel regularization perspective. This can be simply achieved +by augmenting the images through primary region removal, thereby preventing the +detector from over-relying on data bias. Our method consists of two stages, +namely the static localization for primary region maps, as well as the dynamic +exploitation of primary region masks. The proposed method can be seamlessly +integrated into different backbones without affecting their inference +efficiency. We conduct extensive experiments over three widely used deepfake +datasets - DFDC, DF-1.0, and Celeb-DF with five backbones. Our method +demonstrates an average performance improvement of 6% across different +backbones and performs competitively with several state-of-the-art baselines. + +
+
+ comment: 12 pages. Code and Dataset: https://github.com/xaCheng1996/PRLE +
+
+
+
+
+ + ☆ On the Connection between Pre-training Data Diversity and Fine-tuning + Robustness + + +
+ Pre-training has been widely adopted in deep learning to improve model +performance, especially when the training data for a target task is limited. In +our work, we seek to understand the implications of this training strategy on +the generalization properties of downstream models. More specifically, we ask +the following question: how do properties of the pre-training distribution +affect the robustness of a fine-tuned model? The properties we explore include +the label space, label semantics, image diversity, data domains, and data +quantity of the pre-training distribution. We find that the primary factor +influencing downstream effective robustness (Taori et al., 2020) is data +quantity, while other factors have limited significance. For example, reducing +the number of ImageNet pre-training classes by 4x while increasing the number +of images per class by 4x (that is, keeping total data quantity fixed) does not +impact the robustness of fine-tuned models. We demonstrate our findings on +pre-training distributions drawn from various natural and synthetic data +sources, primarily using the iWildCam-WILDS distribution shift as a test for +downstream robustness. + +
+
+
+
+
+ + ☆ Rethinking Medical Report Generation: Disease Revealing Enhancement with + Knowledge Graph + + +
+ Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG) +because it reveals the relations among diseases and thus can be utilized to +guide the generation process. However, constructing a comprehensive KG is +labor-intensive and its applications on the MRG process are under-explored. In +this study, we establish a complete KG on chest X-ray imaging that includes 137 +types of diseases and abnormalities. Based on this KG, we find that the current +MRG data sets exhibit a long-tailed problem in disease distribution. To +mitigate this problem, we introduce a novel augmentation strategy that enhances +the representation of disease types in the tail-end of the distribution. We +further design a two-stage MRG approach, where a classifier is first trained to +detect whether the input images exhibit any abnormalities. The classified +images are then independently fed into two transformer-based generators, +namely, ``disease-specific generator" and ``disease-free generator" to generate +the corresponding reports. To enhance the clinical evaluation of whether the +generated reports correctly describe the diseases appearing in the input image, +we propose diverse sensitivity (DS), a new metric that checks whether generated +diseases match ground truth and measures the diversity of all generated +diseases. Results show that the proposed two-stage generation framework and +augmentation strategies improve DS by a considerable margin, indicating a +notable reduction in the long-tailed problem associated with under-represented +diseases. + +
+
+
+
+
+ + ☆ Entropy Transformer Networks: A Learning Approach via Tangent Bundle + Data Manifold + + +
+ This paper focuses on an accurate and fast interpolation approach for image +transformation employed in the design of CNN architectures. Standard Spatial +Transformer Networks (STNs) rely on bilinear or linear interpolation, with unrealistic assumptions about the underlying data +distributions, which leads to poor performance under scale variations. +Moreover, STNs do not preserve the norm of gradients in propagation due to +their dependency on sparse neighboring pixels. To address this problem, a novel +Entropy STN (ESTN) is proposed that interpolates on the data manifold +distributions. In particular, random samples are generated for each pixel in +association with the tangent space of the data manifold, and a linear +approximation of their intensity values is constructed with an entropy regularizer to compute +the transformer parameters. A simple yet effective technique is also proposed +to normalize the non-zero values of the convolution operation, to fine-tune the +layers for gradients' norm-regularization during training. Experiments on +challenging benchmarks show that the proposed ESTN can improve predictive +accuracy over a range of computer vision tasks, including image reconstruction +and classification, while reducing the computational cost. + +
+
+
+
+
+ + ☆ Cross Contrastive Feature Perturbation for Domain Generalization + + +
+ Domain generalization (DG) aims to learn a robust model from source domains +that generalizes well on unseen target domains. Recent studies focus on +generating novel domain samples or features to diversify distributions +complementary to source domains. Yet, these approaches can hardly cope with the +fact that samples synthesized from various domains can cause +semantic distortion. In this paper, we propose an online one-stage Cross +Contrasting Feature Perturbation (CCFP) framework to simulate domain shift by +generating perturbed features in the latent space while regularizing the model +prediction against domain shift. Different from the previous fixed synthesizing +strategies, we design modules with learnable feature perturbations and semantic +consistency constraints. In contrast to prior work, our method does not use any +generative-based models or domain labels. We conduct extensive experiments on a +standard DomainBed benchmark with a strict evaluation protocol for a fair +comparison. Comprehensive experiments show that our method outperforms the +previous state-of-the-art, and quantitative analyses illustrate that our +approach can alleviate the domain shift problem in out-of-distribution (OOD) +scenarios. + +
+
+
+
+
+ + ☆ AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion + Models + + +
+ Unrestricted adversarial attacks present a serious threat to deep learning +models and adversarial defense techniques. They pose severe security problems +for deep learning applications because they can effectively bypass defense +mechanisms. However, previous attack methods often utilize Generative +Adversarial Networks (GANs), which are not theoretically provable and thus +generate unrealistic examples by incorporating adversarial objectives, +especially for large-scale datasets like ImageNet. In this paper, we propose a +new method, called AdvDiff, to generate unrestricted adversarial examples with +diffusion models. We design two novel adversarial guidance techniques to +conduct adversarial sampling in the reverse generation process of diffusion +models. These two techniques are effective and stable for generating high-quality, +realistic adversarial examples by interpretably integrating gradients of the target +classifier. Experimental results on the MNIST and ImageNet datasets +demonstrate that AdvDiff is effective in generating unrestricted adversarial +examples and outperforms GAN-based methods in terms of attack performance +and generation quality. + +
+
+
+
+
+ + ☆ TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition ICCV2023 + + +
+ Text-driven diffusion models have exhibited impressive generative +capabilities, enabling various image editing tasks. In this paper, we propose +TF-ICON, a novel Training-Free Image COmpositioN framework that harnesses the +power of text-driven diffusion models for cross-domain image-guided +composition. This task aims to seamlessly integrate user-provided objects into +a specific visual context. Current diffusion-based methods often involve costly +instance-based optimization or finetuning of pretrained models on customized +datasets, which can potentially undermine their rich prior. In contrast, +TF-ICON can leverage off-the-shelf diffusion models to perform cross-domain +image-guided composition without requiring additional training, finetuning, or +optimization. Moreover, we introduce the exceptional prompt, which contains no +information, to facilitate text-driven diffusion models in accurately inverting +real images into latent representations, forming the basis for compositing. Our +experiments show that equipping Stable Diffusion with the exceptional prompt +outperforms state-of-the-art inversion methods on various datasets (CelebA-HQ, +COCO, and ImageNet), and that TF-ICON surpasses prior baselines in versatile +visual domains. Code is available at https://github.com/Shilin-LU/TF-ICON + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Rethinking Data Distillation: Do Not Overlook Calibration ICCV 2023 + + +
+ Neural networks trained on distilled data often produce over-confident output +and require correction by calibration methods. Existing calibration methods +such as temperature scaling and mixup work well for networks trained on +original large-scale data. However, we find that these methods fail to +calibrate networks trained on data distilled from large source datasets. In +this paper, we show that distilled data lead to networks that are not +calibratable due to (i) a more concentrated distribution of the maximum logits +and (ii) the loss of information that is semantically meaningful but unrelated +to classification tasks. To address this problem, we propose Masked Temperature +Scaling (MTS) and Masked Distillation Training (MDT) which mitigate the +limitations of distilled data and achieve better calibration results while +maintaining the efficiency of dataset distillation. + +
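+
+ For background, a minimal sketch of plain temperature scaling, the baseline that
+ the proposed masked variants (MTS/MDT) modify; it assumes held-out logits and
+ labels are available.
+
+ ```python
+ import numpy as np
+
+ def fit_temperature(logits, labels, grid=np.linspace(0.5, 5.0, 91)):
+     """Pick the temperature T minimizing negative log-likelihood on held-out data."""
+     logits = np.asarray(logits, dtype=float)    # (N, K)
+     labels = np.asarray(labels, dtype=int)      # (N,)
+
+     def nll(T):
+         z = logits / T
+         z = z - z.max(axis=1, keepdims=True)    # numerically stable softmax
+         logp = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
+         return -logp[np.arange(len(labels)), labels].mean()
+
+     return min(grid, key=nll)
+ ```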
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Robust face anti-spoofing framework with Convolutional Vision + Transformer ICIP 2023 + + +
+ Owing to the advances in image processing technology and large-scale +datasets, companies have implemented facial authentication processes, thereby +stimulating increased focus on face anti-spoofing (FAS) against realistic +presentation attacks. Recently, various attempts have been made to improve face +recognition performance using both global and local learning on face images; +however, to the best of our knowledge, this is the first study to investigate +whether the robustness of FAS against domain shifts is improved by considering +global information and local cues in face images captured using self-attention +and convolutional layers. This study proposes a convolutional vision +transformer-based framework that achieves robust performance for various unseen +domain data. Our model resulted in 7.3%$p$ and 12.9%$p$ increases in FAS +performance compared to models using only a convolutional neural network or +vision transformer, respectively. It also shows the highest average rank in +sub-protocols of cross-dataset setting over the other nine benchmark models for +domain generalization. + +
+
+ comment: ICIP 2023 +
+
+
+
+
+ + ♻ ☆ Segmenting Known Objects and Unseen Unknowns without Prior Knowledge ICCV 2023 + + +
+ Panoptic segmentation methods assign a known class to each pixel given in +input. Even for state-of-the-art approaches, this inevitably enforces decisions +that systematically lead to wrong predictions for objects outside the training +categories. However, robustness against out-of-distribution samples and corner +cases is crucial in safety-critical settings to avoid dangerous consequences. +Since real-world datasets cannot contain enough data points to adequately +sample the long tail of the underlying distribution, models must be able to +deal with unseen and unknown scenarios as well. Previous methods targeted this +by re-identifying already-seen unlabeled objects. In this work, we propose the +necessary step to extend segmentation with a new setting which we term holistic +segmentation. Holistic segmentation aims to identify and separate objects of +unseen unknown categories into instances, without any prior knowledge about +them, while performing panoptic segmentation of known classes. We tackle this +new problem with U3HS, which finds unknowns as highly uncertain regions and +clusters their corresponding instance-aware embeddings into individual objects. +By doing so, for the first time in panoptic segmentation with unknown objects, +our U3HS is trained without unknown categories, reducing assumptions and +leaving the settings as unconstrained as in real-life scenarios. Extensive +experiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate +the effectiveness of U3HS for this new, challenging, and assumptions-free +setting called holistic segmentation. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Generalizing similarity in noisy setups: the DIBS phenomenon ECAI 2023 + + +
+ This work uncovers an interplay among data density, noise, and the +generalization ability in similarity learning. We consider Siamese Neural +Networks (SNNs), which are the basic form of contrastive learning, and explore +two types of noise that can impact SNNs, Pair Label Noise (PLN) and Single +Label Noise (SLN). Our investigation reveals that SNNs exhibit double descent +behaviour regardless of the training setup and that it is further exacerbated +by noise. We demonstrate that the density of data pairs is crucial for +generalization. When SNNs are trained on sparse datasets with the same amount +of PLN or SLN, they exhibit comparable generalization properties. However, when +using dense datasets, PLN cases generalize worse than SLN ones in the +overparametrized region, leading to a phenomenon we call Density-Induced Break +of Similarity (DIBS). In this regime, PLN similarity violation becomes +macroscopical, corrupting the dataset to the point where complete interpolation +cannot be achieved, regardless of the number of model parameters. Our analysis +also delves into the correspondence between online optimization and offline +generalization in similarity learning. The results show that this equivalence +fails in the presence of label noise in all the scenarios considered. + +
+
+ comment: v3: version accepted at ECAI 2023 + Supplementary Material +
+
+
+
+
+ + ♻ ☆ Encyclopedic VQA: Visual questions about detailed properties of + fine-grained categories ICCV'23 + + +
+ We propose Encyclopedic-VQA, a large scale visual question answering (VQA) +dataset featuring visual questions about detailed properties of fine-grained +categories and instances. It contains 221k unique question+answer pairs each +matched with (up to) 5 images, resulting in a total of 1M VQA samples. +Moreover, our dataset comes with a controlled knowledge base derived from +Wikipedia, marking the evidence to support each answer. Empirically, we show +that our dataset poses a hard challenge for large vision+language models as +they perform poorly on our dataset: PaLI [14] is state-of-the-art on OK-VQA +[37], yet it only achieves 13.0% accuracy on our dataset. Moreover, we +experimentally show that progress on answering our encyclopedic questions can +be achieved by augmenting large models with a mechanism that retrieves relevant +information from the knowledge base. An oracle experiment with perfect +retrieval achieves 87.0% accuracy on the single-hop portion of our dataset, and +an automatic retrieval-augmented prototype yields 48.8%. We believe that our +dataset enables future research on retrieval-augmented vision+language models. +It is available at +https://github.com/google-research/google-research/tree/master/encyclopedic_vqa . + +
+
+ comment: ICCV'23 +
+
+
+
+
+ + ♻ ☆ BoxSnake: Polygonal Instance Segmentation with Box Supervision ICCV 2023 + + +
+ Box-supervised instance segmentation has gained much attention as it requires +only simple box annotations instead of costly mask or polygon annotations. +However, existing box-supervised instance segmentation models mainly focus on +mask-based frameworks. We propose a new end-to-end training technique, termed +BoxSnake, to achieve effective polygonal instance segmentation using only box +annotations for the first time. Our method consists of two loss functions: (1) +a point-based unary loss that constrains the bounding box of predicted polygons +to achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss +that encourages the predicted polygons to fit the object boundaries. Compared +with the mask-based weakly-supervised methods, BoxSnake further reduces the +performance gap between the predicted segmentation and the bounding box, and +shows significant superiority on the Cityscapes dataset. The code is publicly +available. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Coupling a Recurrent Neural Network to SPAD TCSPC Systems for Real-time + Fluorescence Lifetime Imaging + + +
+ Fluorescence lifetime imaging (FLI) has been receiving increased attention in +recent years as a powerful diagnostic technique in biological and medical +research. However, existing FLI systems often suffer from a tradeoff between +processing speed, accuracy, and robustness. In this paper, we propose a robust +approach that enables fast FLI with no degradation of accuracy. The approach is +based on a SPAD TCSPC system coupled to a recurrent neural network (RNN) that +accurately estimates the fluorescence lifetime directly from raw timestamps +without building histograms, thereby drastically reducing transfer data volumes +and hardware resource utilization, thus enabling FLI acquisition at video rate. +We train two variants of the RNN on a synthetic dataset and compare the results +to those obtained using center-of-mass method (CMM) and least squares fitting +(LS fitting). Results demonstrate that two RNN variants, gated recurrent unit +(GRU) and long short-term memory (LSTM), are comparable to CMM and LS fitting +in terms of accuracy, while outperforming them in background noise by a large +margin. To explore the ultimate limits of the approach, we derived the +Cramer-Rao lower bound of the measurement, showing that RNN yields lifetime +estimations with near-optimal precision. Moreover, our FLI model, which is +purely trained on synthetic datasets, works well with never-seen-before, +real-world data. To demonstrate real-time operation, we have built a FLI +microscope based on Piccolo, a 32x32 SPAD sensor developed in our lab. Four +quantized GRU cores, capable of processing up to 4 million photons per second, +are deployed on a Xilinx Kintex-7 FPGA. Powered by the GRU, the FLI setup can +retrieve real-time fluorescence lifetime images at up to 10 frames per second. +The proposed FLI system is promising and ideally suited for biomedical +applications. + +
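+
+ As background on the CMM baseline compared against above, a simplified sketch that
+ ignores the finite-window correction used in practice; the paper's RNN estimates
+ lifetimes from the same raw timestamps without building histograms.
+
+ ```python
+ import numpy as np
+
+ def cmm_lifetime(timestamps, t0=0.0):
+     """Center-of-mass lifetime estimate from raw photon timestamps.
+
+     For a mono-exponential decay and a window much longer than the lifetime,
+     the mean arrival time relative to the excitation instant t0 approximates
+     the fluorescence lifetime.
+     """
+     t = np.asarray(timestamps, dtype=float)
+     return np.mean(t - t0)
+ ```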
+
+
+
+
+ + ♻ ☆ Towards Saner Deep Image Registration ICCV 2023 + + +
+ With recent advances in computing hardware and surges of deep-learning +architectures, learning-based deep image registration methods have surpassed +their traditional counterparts, in terms of metric performance and inference +time. However, these methods focus on improving performance measurements such +as Dice, resulting in less attention given to model behaviors that are equally +desirable for registrations, especially for medical imaging. This paper +investigates these behaviors for popular learning-based deep registrations +under a sanity-checking microscope. We find that most existing registrations +suffer from low inverse consistency and nondiscrimination of identical pairs +due to overly optimized image similarities. To rectify these behaviors, we +propose a novel regularization-based sanity-enforcer method that imposes two +sanity checks on the deep model to reduce its inverse consistency errors and +increase its discriminative power simultaneously. Moreover, we derive a set of +theoretical guarantees for our sanity-checked image registration method, with +experimental results supporting our theoretical findings and their +effectiveness in increasing the sanity of models without sacrificing any +performance. Our code and models are available at +https://github.com/tuffr5/Saner-deep-registration. + +
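+
+ One common way to write the inverse-consistency criterion referenced above
+ (notation ours; the paper's exact sanity-check regularizers may differ):
+
+ ```latex
+ % phi_AB registers image A to B and phi_BA registers B to A; composing them
+ % should stay close to the identity map over the image domain Omega.
+ E_{\mathrm{ic}} \;=\; \frac{1}{|\Omega|} \int_{\Omega}
+   \bigl\| \varphi_{BA}\!\left(\varphi_{AB}(x)\right) - x \bigr\|_2 \, dx
+ ```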
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Compound Attention and Neighbor Matching Network for Multi-contrast MRI + Super-resolution + + +
+ Multi-contrast magnetic resonance imaging (MRI) reflects information about +human tissue from different perspectives and has many clinical applications. By +utilizing the complementary information among different modalities, +multi-contrast super-resolution (SR) of MRI can achieve better results than +single-image super-resolution. However, existing methods of multi-contrast MRI +SR have the following shortcomings that may limit their performance: First, +existing methods either simply concatenate the reference and degraded features +or exploit global feature-matching between them, which are unsuitable for +multi-contrast MRI SR. Second, although many recent methods employ transformers +to capture long-range dependencies in the spatial dimension, they neglect that +self-attention in the channel dimension is also important for low-level vision +tasks. To address these shortcomings, we proposed a novel network architecture +with compound-attention and neighbor matching (CANM-Net) for multi-contrast MRI +SR: The compound self-attention mechanism effectively captures the dependencies +in both spatial and channel dimension; the neighborhood-based feature-matching +modules are exploited to match degraded features and adjacent reference +features and then fuse them to obtain the high-quality images. We conduct +experiments of SR tasks on the IXI, fastMRI, and real-world scanning datasets. +The CANM-Net outperforms state-of-the-art approaches in both retrospective and +prospective experiments. Moreover, the robustness study in our work shows that +the CANM-Net still achieves good performance when the reference and degraded +images are imperfectly registered, proving good potential in clinical +applications. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Improving Cross-Modal Retrieval with Set of Diverse Embeddings CVPR 2023 + + +
+ Cross-modal retrieval across image and text modalities is a challenging task +due to its inherent ambiguity: An image often exhibits various situations, and +a caption can be coupled with diverse images. Set-based embedding has been +studied as a solution to this problem. It seeks to encode a sample into a set +of different embedding vectors that capture different semantics of the sample. +In this paper, we present a novel set-based embedding method, which is distinct +from previous work in two aspects. First, we present a new similarity function +called smooth-Chamfer similarity, which is designed to alleviate the side +effects of existing similarity functions for set-based embedding. Second, we +propose a novel set prediction module to produce a set of embedding vectors +that effectively captures diverse semantics of input by the slot attention +mechanism. Our method is evaluated on the COCO and Flickr30K datasets across +different visual backbones, where it outperforms existing methods including +ones that demand substantially larger computation at inference. + +
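+
+ For reference, the plain (non-smooth) Chamfer similarity between two embedding
+ sets, the baseline that the proposed smooth-Chamfer similarity softens:
+
+ ```python
+ import numpy as np
+
+ def chamfer_similarity(A, B):
+     """Average best-match similarity in both directions between sets A and B.
+
+     A: (n, d) and B: (m, d); rows can be L2-normalized for cosine similarity.
+     """
+     A = np.asarray(A, dtype=float)
+     B = np.asarray(B, dtype=float)
+     S = A @ B.T                          # pairwise similarities
+     return 0.5 * (S.max(axis=1).mean() + S.max(axis=0).mean())
+ ```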
+
+ comment: Accepted to CVPR 2023 (Highlight) +
+
+
+
+
+ + ♻ ☆ AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias + Estimation ECCV 2022 + + +
+ In Federated Learning (FL), a number of clients or devices collaborate to +train a model without sharing their data. Models are optimized locally at each +client and further communicated to a central hub for aggregation. While FL is +an appealing decentralized training paradigm, heterogeneity among data from +different clients can cause the local optimization to drift away from the +global objective. In order to estimate and therefore remove this drift, +variance reduction techniques have been incorporated into FL optimization +recently. However, these approaches inaccurately estimate the clients' drift +and ultimately fail to remove it properly. In this work, we propose an adaptive +algorithm that accurately estimates drift across clients. In comparison to +previous works, our approach necessitates less storage and communication +bandwidth, as well as lower compute costs. Additionally, our proposed +methodology induces stability by constraining the norm of estimates for client +drift, making it more practical for large scale FL. Experimental findings +demonstrate that the proposed algorithm converges significantly faster and +achieves higher accuracy than the baselines across various FL benchmarks. + +
+
+ comment: Published as a conference paper at ECCV 2022; Corrected some typos in + the text and a baseline algorithm +
+
+
+
+
+ + ♻ ☆ Deployment of Image Analysis Algorithms under Prevalence Shifts + + +
+ Domain gaps are among the most relevant roadblocks in the clinical +translation of machine learning (ML)-based solutions for medical image +analysis. While current research focuses on new training paradigms and network +architectures, little attention is given to the specific effect of prevalence +shifts on an algorithm deployed in practice. Such discrepancies between class +frequencies in the data used for a method's development/validation and that in +its deployment environment(s) are of great importance, for example in the +context of artificial intelligence (AI) democratization, as disease prevalences +may vary widely across time and location. Our contribution is twofold. First, +we empirically demonstrate the potentially severe consequences of missing +prevalence handling by analyzing (i) the extent of miscalibration, (ii) the +deviation of the decision threshold from the optimum, and (iii) the ability of +validation metrics to reflect neural network performance on the deployment +population as a function of the discrepancy between development and deployment +prevalence. Second, we propose a workflow for prevalence-aware image +classification that uses estimated deployment prevalences to adjust a trained +classifier to a new environment, without requiring additional annotated +deployment data. Comprehensive experiments based on a diverse set of 30 medical +classification tasks showcase the benefit of the proposed workflow in +generating better classifier decisions and more reliable performance estimates +compared to current practice. + +
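+
+ As context, a minimal sketch of the standard prior-shift (prevalence) correction of
+ classifier posteriors; the paper's workflow additionally estimates the deployment
+ prevalences and re-derives decision thresholds, which is not shown here.
+
+ ```python
+ import numpy as np
+
+ def adjust_for_prevalence(probs, train_prior, deploy_prior):
+     """Re-weight softmax outputs for a shift in class prevalences."""
+     probs = np.asarray(probs, dtype=float)                  # (N, K)
+     w = np.asarray(deploy_prior, float) / np.asarray(train_prior, float)
+     adjusted = probs * w                                    # Bayes re-weighting
+     return adjusted / adjusted.sum(axis=1, keepdims=True)
+ ```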
+
+
+
+
+ + ♻ ☆ Deep Learning-based Anonymization of Chest Radiographs: A + Utility-preserving Measure for Patient Privacy MICCAI 2023 + + +
+ Robust and reliable anonymization of chest radiographs constitutes an +essential step before publishing large datasets of such for research purposes. +The conventional anonymization process is carried out by obscuring personal +information in the images with black boxes and removing or replacing +meta-information. However, such simple measures retain biometric information in +the chest radiographs, allowing patients to be re-identified by a linkage +attack. Therefore, there is an urgent need to obfuscate the biometric +information appearing in the images. We propose the first deep learning-based +approach (PriCheXy-Net) to targetedly anonymize chest radiographs while +maintaining data utility for diagnostic and machine learning purposes. Our +model architecture is a composition of three independent neural networks that, +when collectively used, allow for learning a deformation field that is able to +impede patient re-identification. Quantitative results on the ChestX-ray14 +dataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC) +after re-training with little impact on the abnormality classification +performance. This indicates the ability to preserve underlying abnormality +patterns while increasing patient privacy. Lastly, we compare our proposed +anonymization approach with two other obfuscation-based methods (Privacy-Net, +DP-Pix) and demonstrate the superiority of our method towards resolving the +privacy-utility trade-off for chest radiographs. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Generalizable Embeddings with Cross-batch Metric Learning + + +
+ Global average pooling (GAP) is a popular component in deep metric learning +(DML) for aggregating features. Its effectiveness is often attributed to +treating each feature vector as a distinct semantic entity and GAP as a +combination of them. Although this explanation has empirical support, its +algorithmic implications for learning generalizable entities that represent +unseen classes, a crucial DML goal, remain unclear. To address this, we +formulate GAP as a convex combination of learnable prototypes. We then show +that the prototype learning can be expressed as a recursive process fitting a +linear predictor to a batch of samples. Building on that perspective, we +consider two batches of disjoint classes at each iteration and regularize the +learning by expressing the samples of a batch with the prototypes that are +fitted to the other batch. We validate our approach on 4 popular DML benchmarks. + +
<p>
+
+ comment: \c{opyright} 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ BiofilmScanner: A Computational Intelligence Approach to Obtain + Bacterial Cell Morphological Attributes from Biofilm Image + + +
+ Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for +sulfate-reducing bacteria (SRB) that are associated with corrosion issues +caused by microorganisms. SRB-based biofilms are thought to be responsible for +the billion-dollar-per-year bio-corrosion of metal infrastructure. +Understanding how to extract the bacterial cells' shape and size properties +in the SRB biofilm at different growth stages will assist with the design of +anti-corrosion techniques. However, numerous issues affect current approaches, +including time-consuming geometric property extraction, low efficiency, and +high error rates. This paper proposes BiofilmScanner, a Yolact-based deep +learning method integrated with invariant moments to address these problems. +Our approach efficiently detects and segments bacterial cells in an SRB image, +while invariant moments simultaneously measure the geometric characteristics of +the segmented cells with low error. The numerical experiments of the proposed +method demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our +earlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring +the geometric properties of the cell. Furthermore, the BiofilmScanner achieved +an F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67% +and 75.18%, respectively. + +
<p>
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ NeRF-GAN Distillation for Efficient 3D-Aware Generation with + Convolutions + + +
+ Pose-conditioned convolutional generative models struggle with high-quality +3D-consistent image generation from single-view datasets, due to their lack of +sufficient 3D priors. Recently, the integration of Neural Radiance Fields +(NeRFs) and generative models, such as Generative Adversarial Networks (GANs), +has transformed 3D-aware generation from single-view images. NeRF-GANs exploit +the strong inductive bias of neural 3D representations and volumetric rendering +at the cost of higher computational complexity. This study aims at revisiting +pose-conditioned 2D GANs for efficient 3D-aware generation at inference time by +distilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and +effective method, based on re-using the well-disentangled latent space of a +pre-trained NeRF-GAN in a pose-conditioned convolutional network to directly +generate 3D-consistent images corresponding to the underlying 3D +representations. Experiments on several datasets demonstrate that the proposed +method obtains results comparable with volumetric rendering in terms of quality +and 3D consistency while benefiting from the computational advantage of +convolutional networks. The code will be available at: +https://github.com/mshahbazi72/NeRF-GAN-Distillation + +
+
+
+
+
+ + ♻ ☆ Improving Automated Hemorrhage Detection in Sparse-view Computed + Tomography via Deep Convolutional Neural Network based Artifact Reduction + + +
+ Purpose: Sparse-view computed tomography (CT) is an effective way to reduce +dose by lowering the total number of views acquired, albeit at the expense of +image quality, which, in turn, can impact the ability to detect diseases. We +explore deep learning-based artifact reduction in sparse-view cranial CT scans +and its impact on automated hemorrhage detection. Methods: We trained a U-Net +for artifact reduction on simulated sparse-view cranial CT scans from 3000 +patients obtained from a public dataset and reconstructed with varying levels +of sub-sampling. Additionally, we trained a convolutional neural network on +fully sampled CT data from 17,545 patients for automated hemorrhage detection. +We evaluated the classification performance using the area under the receiver +operating characteristic curves (AUC-ROCs) with corresponding 95% confidence +intervals (CIs) and the DeLong test, along with confusion matrices. The +performance of the U-Net was compared to an analytical approach based on total +variation (TV). Results: The U-Net was superior to unprocessed and TV-processed +images with respect to image quality and automated hemorrhage diagnosis. With +U-Net post-processing, the number of views can be reduced from 4096 views +(AUC-ROC: 0.974; 95% CI: 0.972-0.976) to 512 views (0.973; +0.971-0.975) with minimal decrease in hemorrhage detection performance (P<.001) +and to 256 views (0.967; 0.964-0.969) with a slight performance decrease (P<.001). +Conclusion: The results suggest that U-Net based artifact reduction +substantially enhances automated hemorrhage detection in sparse-view cranial +CTs. Our findings highlight that appropriate post-processing is crucial for +optimal image quality and diagnostic accuracy while minimizing radiation dose. + +
<p>
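+
+ The abstract reports AUC-ROCs with 95% CIs and the DeLong test; as a simple stand-in
+ (a percentile bootstrap rather than DeLong), the following sketch computes an AUC with
+ a confidence interval. It is illustrative only and not the paper's evaluation code.
+
+ import numpy as np
+ from sklearn.metrics import roc_auc_score
+
+ def auc_with_bootstrap_ci(y_true, y_score, n_boot=2000, alpha=0.05, seed=0):
+     rng = np.random.default_rng(seed)
+     y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+     auc = roc_auc_score(y_true, y_score)
+     stats, n = [], len(y_true)
+     while len(stats) < n_boot:
+         idx = rng.integers(0, n, n)
+         if len(np.unique(y_true[idx])) < 2:   # a resample must contain both classes
+             continue
+         stats.append(roc_auc_score(y_true[idx], y_score[idx]))
+     lo, hi = np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)])
+     return auc, (lo, hi)
+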
+
+ comment: 11 pages, 6 figures, 1 table +
+
+
+
+
+ + ♻ ☆ UP-DETR: Unsupervised Pre-training for Object Detection with + Transformers CVPR 2021 + + +
+ DEtection TRansformer (DETR) for object detection reaches competitive +performance compared with Faster R-CNN via a transformer encoder-decoder +architecture. However, with its transformers trained from scratch, DETR needs +large-scale training data and an extremely long training schedule, even on the +COCO dataset. Inspired by the great success of pre-training transformers in natural +language processing, we propose a novel pretext task named random query patch +detection in Unsupervised Pre-training DETR (UP-DETR). Specifically, we +randomly crop patches from the given image and then feed them as queries to the +decoder. The model is pre-trained to detect these query patches from the input +image. During the pre-training, we address two critical issues: multi-task +learning and multi-query localization. (1) To trade off classification and +localization preferences in the pretext task, we find that freezing the CNN +backbone is the prerequisite for the success of pre-training transformers. (2) +To perform multi-query localization, we develop UP-DETR with multi-query patch +detection with an attention mask. In addition, UP-DETR provides a unified +perspective for fine-tuning object detection and one-shot detection tasks. In +our experiments, UP-DETR significantly boosts the performance of DETR with +faster convergence and higher average precision on object detection, one-shot +detection and panoptic segmentation. Code and pre-training models: +https://github.com/dddzg/up-detr. + +
<p>
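+
+ The pretext task is easy to picture with a small data-preparation sketch: crop random
+ patches from an image and record their (normalized) boxes, which the decoder is then
+ trained to localize. This is an illustrative reconstruction, not the official code.
+
+ import numpy as np
+
+ def sample_query_patches(image, num_queries=10, min_frac=0.1, max_frac=0.5, seed=None):
+     # image: (H, W, C) array. Returns cropped patches and boxes in (cx, cy, w, h),
+     # normalized to [0, 1] as is conventional for DETR-style targets.
+     rng = np.random.default_rng(seed)
+     H, W = image.shape[:2]
+     patches, boxes = [], []
+     for _ in range(num_queries):
+         ph = int(rng.uniform(min_frac, max_frac) * H)
+         pw = int(rng.uniform(min_frac, max_frac) * W)
+         y0 = int(rng.integers(0, H - ph + 1))
+         x0 = int(rng.integers(0, W - pw + 1))
+         patches.append(image[y0:y0 + ph, x0:x0 + pw].copy())
+         boxes.append([(x0 + pw / 2) / W, (y0 + ph / 2) / H, pw / W, ph / H])
+     return patches, np.array(boxes, dtype=np.float32)
+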
+
+ comment: Accepted by TPAMI 2022 and CVPR 2021 +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and the Emergence of Attributes Associations + + +
+ In response to an object presentation, supervised learning schemes generally +respond with a parsimonious label. Upon a similar presentation we humans +respond again with a label, but are flooded, in addition, by a myriad of +associations. A significant portion of these consist of the presented object +attributes. Contrastive learning is a semi-supervised learning scheme based on +the application of identity preserving transformations on the object input +representations. It is conjectured in this work that these same applied +transformations preserve, in addition to the identity of the presented object, +also the identity of its semantically meaningful attributes. The corollary of +this is that the output representations of such a contrastive learning scheme +contain valuable information not only for the classification of the presented +object, but also for the presence or absence decision of any attribute of +interest. Simulation results which demonstrate this idea and the feasibility of +this conjecture are presented. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Dr. KID: Direct Remeshing and K-set Isometric Decomposition for Scalable + Physicalization of Organic Shapes + + +
+ Dr. KID is an algorithm that uses isometric decomposition for the +physicalization of potato-shaped organic models in a puzzle fashion. The +algorithm begins by creating a simple, regular triangular surface mesh of +organic shapes, followed by iterative k-means clustering and remeshing. For +clustering, we need a similarity between triangles (segments), which is defined +via a distance function. The distance function maps each triangle's shape to a +single point in the virtual 3D space. Thus, the distance between the triangles +indicates their degree of dissimilarity. K-means clustering uses this distance +to sort the segments into k classes. After this, remeshing is applied to +minimize the distance between triangles within the same cluster by making their +shapes identical. Clustering and remeshing are repeated until the distance +between triangles in the same cluster reaches an acceptable threshold. We adopt +a curvature-aware strategy to determine the surface thickness and finalize +puzzle pieces for 3D printing. Identical hinges and holes are created for +assembling the puzzle components. For smoother outcomes, we use triangle +subdivision along with curvature-aware clustering, generating curved triangular +patches for 3D printing. Our algorithm was evaluated using various models, and +the 3D-printed results were analyzed. Findings indicate that our algorithm +performs reliably on target organic shapes with minimal loss of input geometry. + +
<p>
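+
+ The clustering step can be sketched with a plausible stand-in descriptor (sorted edge
+ lengths map each triangle to a point in 3D shape space); the paper's exact distance
+ function may differ, so treat this purely as an illustration.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def triangle_descriptor(tri):
+     # tri: (3, 3) array of vertex coordinates; return sorted edge lengths.
+     a, b, c = tri
+     return np.sort([np.linalg.norm(a - b), np.linalg.norm(b - c), np.linalg.norm(c - a)])
+
+ def cluster_triangles(triangles, k=8, seed=0):
+     # Triangles in the same cluster are "similar" and would be remeshed toward a
+     # shared representative shape in the next iteration.
+     X = np.stack([triangle_descriptor(t) for t in triangles])
+     km = KMeans(n_clusters=k, n_init=10, random_state=seed).fit(X)
+     return km.labels_, km.cluster_centers_
+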
+
+
+
+
+ + ♻ ☆ Fusing Structure from Motion and Simulation-Augmented Pose Regression + from Optical Flow for Challenging Indoor Environments + + +
+ The localization of objects is a crucial task in various applications such as +robotics, virtual and augmented reality, and the transportation of goods in +warehouses. Recent advances in deep learning have enabled localization +using monocular cameras. While structure from motion (SfM) predicts the +absolute pose from a point cloud, absolute pose regression (APR) methods learn +a semantic understanding of the environment through neural networks. However, +both fields face challenges caused by the environment such as motion blur, +lighting changes, repetitive patterns, and feature-less structures. This study +aims to address these challenges by incorporating additional information and +regularizing the absolute pose using relative pose regression (RPR) methods. +RPR methods suffer from different challenges, such as motion blur. The optical +flow between consecutive images is computed using the Lucas-Kanade algorithm, +and the relative pose is predicted using an auxiliary small recurrent +convolutional network. The fusion of absolute and relative poses is a complex +task due to the mismatch between the global and local coordinate systems. +State-of-the-art methods fusing absolute and relative poses use pose graph +optimization (PGO) to regularize the absolute pose predictions using relative +poses. In this work, we propose recurrent fusion networks to optimally align +absolute and relative pose predictions to improve the absolute pose prediction. +We evaluate eight different recurrent units and construct a simulation +environment to pre-train the APR and RPR networks for better generalized +training. Additionally, we record a large database of different scenarios in a +challenging large-scale indoor environment that mimics a warehouse with +transportation robots. We conduct hyperparameter searches and experiments to +show the effectiveness of our recurrent fusion method compared to PGO. + +
<p>
+
+
+
+
+ + ♻ ☆ Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model using + Pixel-aligned Reconstruction Priors ICCV 2023 + + +
+ Fast generation of high-quality 3D digital humans is important to a vast +number of applications ranging from entertainment to professional concerns. +Recent advances in differentiable rendering have enabled the training of 3D +generative models without requiring 3D ground truths. However, the quality of +the generated 3D humans still has much room to improve in terms of both +fidelity and diversity. In this paper, we present Get3DHuman, a novel 3D human +framework that can significantly boost the realism and diversity of the +generated outcomes by only using a limited budget of 3D ground-truth data. Our +key observation is that the 3D generator can profit from human-related priors +learned through 2D human generators and 3D reconstructors. Specifically, we +bridge the latent space of Get3DHuman with that of StyleGAN-Human via a +specially-designed prior network, where the input latent code is mapped to the +shape and texture feature volumes spanned by the pixel-aligned 3D +reconstructor. The outcomes of the prior network are then leveraged as the +supervisory signals for the main generator network. To ensure effective +training, we further propose three tailored losses applied to the generated +feature volumes and the intermediate feature maps. Extensive experiments +demonstrate that Get3DHuman greatly outperforms the other state-of-the-art +approaches and can support a wide range of applications including shape +interpolation, shape re-texturing, and single-view reconstruction through +latent inversion. + +
+
+ comment: ICCV 2023, project page: + https://x-zhangyang.github.io/2023_Get3DHuman/ +
+
+
+
+
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in the real-world implementation +for recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target categories retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies. + +
+
+
+
+
+ + ♻ ☆ Morphological Image Analysis and Feature Extraction for Reasoning with + AI-based Defect Detection and Classification Models SC + + +
+ As the use of artificial intelligence (AI) models becomes more prevalent in +industries such as engineering and manufacturing, it is essential that these +models provide transparent reasoning behind their predictions. This paper +proposes the AI-Reasoner, which extracts the morphological characteristics of +defects (DefChars) from images and utilises decision trees to reason with the +DefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e. +charts) and textual explanations to provide insights into outputs made by +mask-based defect detection and classification models. It also provides +effective mitigation strategies to enhance data pre-processing and overall +model performance. The AI-Reasoner was tested on explaining the outputs of an +IE Mask R-CNN model using a set of 366 images containing defects. The results +demonstrated its effectiveness in explaining the IE Mask R-CNN model's +predictions. Overall, the proposed AI-Reasoner provides a solution for +improving the performance of AI models in industrial applications that require +defect analysis. + +
<p>
+
+ comment: 8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series + on computational intelligence (SSCI) +
+
+
+
+
+ + ♻ ☆ S3M: Scalable Statistical Shape Modeling through Unsupervised + Correspondences MICCAI 2023 + + +
+ Statistical shape models (SSMs) are an established way to represent the +anatomy of a population with various clinically relevant applications. However, +they typically require domain expertise, and labor-intensive landmark +annotations to construct. We address these shortcomings by proposing an +unsupervised method that leverages deep geometric features and functional +correspondences to simultaneously learn local and global shape structures +across population anatomies. Our pipeline significantly improves unsupervised +correspondence estimation for SSMs compared to baseline methods, even on highly +irregular surface topologies. We demonstrate this for two different anatomical +structures: the thyroid and a multi-chamber heart dataset. Furthermore, our +method is robust enough to learn from noisy neural network predictions, +potentially enabling scaling SSMs to larger patient populations without manual +segmentation annotation. + +
+
+ comment: Accepted at MICCAI 2023. 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ ADPS: Asymmetric Distillation Post-Segmentation for Image Anomaly + Detection + + +
+ Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the +teacher-student paradigm to detect and segment anomalous regions by contrasting +the unique features extracted by both networks. However, existing KDAD methods +suffer from two main limitations: 1) the student network can effortlessly +replicate the teacher network's representations, and 2) the features of the +teacher network serve solely as a ``reference standard" and are not fully +leveraged. Toward this end, we depart from the established paradigm and instead +propose an innovative approach called Asymmetric Distillation Post-Segmentation +(ADPS). Our ADPS employs an asymmetric distillation paradigm that takes +distinct forms of the same image as the input of the teacher-student networks, +driving the student network to learn discriminating representations for +anomalous regions. + Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a +coarse anomaly localization mask that transfers the distilled knowledge +acquired from the asymmetric paradigm to the teacher network. Equipped with +WMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect +and segment abnormal regions with fine structures and clear boundaries. +Experimental results demonstrate that the proposed ADPS outperforms the +state-of-the-art methods in detecting and segmenting anomalies. Surprisingly, +ADPS significantly improves Average Precision (AP) metric by 9% and 20% on the +MVTec AD and KolektorSDD2 datasets, respectively. + +
+
+ comment: 11pages,9 figures +
+
+
+
+
+ + ♻ ☆ Recovering 3D Human Mesh from Monocular Images: A Survey + + +
+ Estimating human pose and shape from monocular images is a long-standing +problem in computer vision. Since the release of statistical body models, 3D +human mesh recovery has been drawing broader attention. With the same goal of +obtaining well-aligned and physically plausible mesh results, two paradigms +have been developed to overcome challenges in the 2D-to-3D lifting process: i) +an optimization-based paradigm, where different data terms and regularization +terms are exploited as optimization objectives; and ii) a regression-based +paradigm, where deep learning techniques are embraced to solve the problem in +an end-to-end fashion. Meanwhile, continuous efforts are devoted to improving +the quality of 3D mesh labels for a wide range of datasets. Though remarkable +progress has been achieved in the past decade, the task is still challenging +due to flexible body motions, diverse appearances, complex environments, and +insufficient in-the-wild annotations. To the best of our knowledge, this is the +first survey that focuses on the task of monocular 3D human mesh recovery. We +start with the introduction of body models and then elaborate recovery +frameworks and training objectives by providing in-depth analyses of their +strengths and weaknesses. We also summarize datasets, evaluation metrics, and +benchmark results. Open issues and future directions are discussed in the end, +hoping to motivate researchers and facilitate their research in this area. A +regularly updated project page can be found at +https://github.com/tinatiansjz/hmr-survey. + +
+
+ comment: Accepted to IEEE TPAMI, Survey on monocular 3D human mesh recovery, + Project page: https://github.com/tinatiansjz/hmr-survey +
+
+
+
+
+ + ♻ ☆ Underwater Object Tracker: UOSTrack for Marine Organism Grasping of + Underwater Vehicles + + +
+ A visual single-object tracker is an indispensable component of underwater +vehicles (UVs) in marine organism grasping tasks. Its accuracy and stability +are imperative to guide the UVs to perform grasping behavior. Although +single-object trackers show competitive performance in the challenge of +underwater image degradation, there are still issues with sample imbalance and +exclusion of similar objects that need to be addressed for application in +marine organism grasping. This paper proposes Underwater OSTrack (UOSTrack), +which consists of underwater image and open-air sequence hybrid training +(UOHT) and motion-based post-processing (MBPP). The UOHT training paradigm is +designed to train the sample-imbalanced underwater tracker so that the tracker +is exposed to a great number of underwater domain training samples and learns +the feature expressions. The MBPP paradigm is proposed to exclude similar +objects. It uses the estimation box predicted with a Kalman filter and the +candidate boxes in the response map to relocate the lost tracked object in the +candidate area. UOSTrack achieves an average performance improvement of 4.41%, +and a maximum of 7.98%, over state-of-the-art methods across various benchmarks. +Field experiments have verified the accuracy and stability of our +proposed UOSTrack for UVs in marine organism grasping tasks. More details can +be found at https://github.com/LiYunfengLYF/UOSTrack. + +
<p>
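+
+ The MBPP idea can be illustrated with a simplified constant-velocity Kalman filter over
+ the box center: predict where the target should be and pick the nearest candidate from
+ the response map. The real method also weighs candidates by their response scores; all
+ names here are hypothetical.
+
+ import numpy as np
+
+ class CenterKalman:
+     # Constant-velocity Kalman filter over a box center (cx, cy).
+     def __init__(self, cx, cy, q=1e-2, r=1e-1):
+         self.x = np.array([cx, cy, 0.0, 0.0])   # position + velocity
+         self.P = np.eye(4)
+         self.F = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.],
+                            [0., 0., 1., 0.], [0., 0., 0., 1.]])
+         self.H = np.eye(2, 4)
+         self.Q, self.R = q * np.eye(4), r * np.eye(2)
+
+     def predict(self):
+         self.x = self.F @ self.x
+         self.P = self.F @ self.P @ self.F.T + self.Q
+         return self.x[:2]
+
+     def update(self, z):
+         y = np.asarray(z) - self.H @ self.x
+         S = self.H @ self.P @ self.H.T + self.R
+         K = self.P @ self.H.T @ np.linalg.inv(S)
+         self.x = self.x + K @ y
+         self.P = (np.eye(4) - K @ self.H) @ self.P
+
+ def relocate(kf, candidate_centers):
+     # Choose the candidate closest to the motion prediction, then update the filter.
+     pred = kf.predict()
+     best = int(np.argmin(np.linalg.norm(np.asarray(candidate_centers) - pred, axis=1)))
+     kf.update(candidate_centers[best])
+     return best
+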
+
+
+
+
+ + ♻ ☆ DiffusionDepth: Diffusion Denoising Approach for Monocular Depth + Estimation + + +
+ Monocular depth estimation is a challenging task that predicts the pixel-wise +depth from a single 2D image. Current methods typically model this problem as a +regression or classification task. We propose DiffusionDepth, a new approach +that reformulates monocular depth estimation as a denoising diffusion process. +It learns an iterative denoising process to `denoise' random depth distribution +into a depth map with the guidance of monocular visual conditions. The process +is performed in the latent space encoded by a dedicated depth encoder and +decoder. Instead of diffusing ground truth (GT) depth, the model learns to +reverse the process of diffusing the refined depth of itself into random depth +distribution. This self-diffusion formulation overcomes the difficulty of +applying generative models to sparse GT depth scenarios. The proposed approach +benefits this task by refining depth estimation step by step, which is superior +for generating accurate and highly detailed depth maps. Experimental results on +KITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion +approach could reach state-of-the-art performance in both indoor and outdoor +scenarios with acceptable inference time. + +
+
+
+
+
+ + ♻ ☆ 3D Human Pose Estimation via Intuitive Physics CVPR'23 + + +
+ Estimating 3D humans from images often produces implausible bodies that lean, +float, or penetrate the floor. Such methods ignore the fact that bodies are +typically supported by the scene. A physics engine can be used to enforce +physical plausibility, but these are not differentiable, rely on unrealistic +proxy bodies, and are difficult to integrate into existing optimization and +learning frameworks. In contrast, we exploit novel intuitive-physics (IP) terms +that can be inferred from a 3D SMPL body interacting with the scene. Inspired +by biomechanics, we infer the pressure heatmap on the body, the Center of +Pressure (CoP) from the heatmap, and the SMPL body's Center of Mass (CoM). With +these, we develop IPMAN, to estimate a 3D body from a color image in a "stable" +configuration by encouraging plausible floor contact and overlapping CoP and +CoM. Our IP terms are intuitive, easy to implement, fast to compute, +differentiable, and can be integrated into existing optimization and regression +methods. We evaluate IPMAN on standard datasets and MoYo, a new dataset with +synchronized multi-view images, ground-truth 3D bodies with complex poses, +body-floor contact, CoM and pressure. IPMAN produces more plausible results +than the state of the art, improving accuracy for static poses, while not +hurting dynamic ones. Code and data are available for research at +https://ipman.is.tue.mpg.de. + +
+
+ comment: Accepted in CVPR'23. Project page: https://ipman.is.tue.mpg.de +
+
+
+
+
+ + ♻ ☆ Text-guided Eyeglasses Manipulation with Spatial Constraints + + +
+ Virtual try-on of eyeglasses involves placing eyeglasses of different shapes +and styles onto a face image without physically trying them on. While existing +methods have shown impressive results, the variety of eyeglasses styles is +limited and the interactions are not always intuitive or efficient. To address +these limitations, we propose a Text-guided Eyeglasses Manipulation method that +allows for control of the eyeglasses shape and style based on a binary mask and +text, respectively. Specifically, we introduce a mask encoder to extract mask +conditions and a modulation module that enables simultaneous injection of text +and mask conditions. This design allows for fine-grained control of the +eyeglasses' appearance based on both textual descriptions and spatial +constraints. Our approach includes a disentangled mapper and a decoupling +strategy that preserves irrelevant areas, resulting in better local editing. We +employ a two-stage training scheme to handle the different convergence speeds +of the various modality conditions, successfully controlling both the shape and +style of eyeglasses. Extensive comparison experiments and ablation analyses +demonstrate the effectiveness of our approach in achieving diverse eyeglasses +styles while preserving irrelevant areas. + +
+
+ comment: Revised version: add some experiments +
+
+
+
+
+ + ♻ ☆ MatSpectNet: Material Segmentation Network with Domain-Aware and + Physically-Constrained Hyperspectral Reconstruction + + +
+ Achieving accurate material segmentation for 3-channel RGB images is +challenging due to the considerable variation in a material's appearance. +Hyperspectral images, which are sets of spectral measurements sampled at +multiple wavelengths, theoretically offer distinct information for material +identification, as variations in intensity of electromagnetic radiation +reflected by a surface depend on the material composition of a scene. However, +existing hyperspectral datasets are impoverished regarding the number of images +and material categories for the dense material segmentation task, and +collecting and annotating hyperspectral images with a spectral camera is +prohibitively expensive. To address this, we propose a new model, the +MatSpectNet to segment materials with recovered hyperspectral images from RGB +images. The network leverages the principles of colour perception in modern +cameras to constrain the reconstructed hyperspectral images and employs the +domain adaptation method to generalise the hyperspectral reconstruction +capability from a spectral recovery dataset to material segmentation datasets. +The reconstructed hyperspectral images are further filtered using learned +response curves and enhanced with human perception. The performance of +MatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces +dataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase +in average pixel accuracy and a 3.42% improvement in mean class accuracy +compared with the most recent publication. The project code is attached to the +supplementary material and will be published on GitHub. + +
+
+ comment: 7 pages main paper +
+
+
+
+
+ + ♻ ☆ RED-PSM: Regularization by Denoising of Partially Separable Models for + Dynamic Imaging + + +
+ Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at +each time instant using its undersampled measurements. In particular, in the +case of dynamic tomography, only a single projection at a single view angle may +be available at a time, making the problem severely ill-posed. In this work, we +propose an approach, RED-PSM, which combines for the first time two powerful +techniques to address this challenging imaging problem. The first is +partially separable models, which have been used to efficiently introduce a +low-rank prior for the spatio-temporal object. The second is the recent +Regularization by Denoising (RED), which provides a flexible framework to +exploit the impressive performance of state-of-the-art image denoising +algorithms for various inverse problems. We propose a partially separable +objective with RED and a computationally efficient and scalable optimization +scheme with variable splitting and ADMM. Theoretical analysis proves the +convergence of our objective to a value corresponding to a stationary point +satisfying the first-order optimality conditions. Convergence is accelerated by +a particular projection-domain-based initialization. We demonstrate the +performance and computational improvements of our proposed RED-PSM with a +learned image denoiser by comparing it to a recent deep-prior-based method +known as TD-DIP. Although the main focus is on dynamic tomography, we also show +the performance advantages of RED-PSM in a cardiac dynamic MRI setting. + +
<p>
+
+
+
+
+ + ♻ ☆ Reconstruction-Aware Prior Distillation for Semi-supervised Point Cloud + Completion IJCAI 2023 + + +
+ Real-world sensors often produce incomplete, irregular, and noisy point +clouds, making point cloud completion increasingly important. However, most +existing completion methods rely on large paired datasets for training, which +is labor-intensive. This paper proposes RaPD, a novel semi-supervised point +cloud completion method that reduces the need for paired datasets. RaPD +utilizes a two-stage training scheme, where a deep semantic prior is learned in +stage 1 from unpaired complete and incomplete point clouds, and a +semi-supervised prior distillation process is introduced in stage 2 to train a +completion network using only a small number of paired samples. Additionally, a +self-supervised completion module is introduced to improve performance using +unpaired incomplete point clouds. Experiments on multiple datasets show that +RaPD outperforms previous methods in both homologous and heterologous +scenarios. + +
+
+ comment: Accepted to IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Target-oriented Sentiment Classification with Sequential Cross-modal + Semantic Graph ICANN 2023 + + +
+ Multi-modal aspect-based sentiment classification (MABSC) is the task of +classifying the sentiment of a target entity mentioned in a sentence and an +image. However, previous methods failed to account for the fine-grained +semantic association between the image and the text, which resulted in limited +identification of fine-grained image aspects and opinions. To address these +limitations, in this paper we propose a new approach called SeqCSG, which +enhances the encoder-decoder sentiment classification framework using +sequential cross-modal semantic graphs. SeqCSG utilizes image captions and +scene graphs to extract both global and local fine-grained image information +and considers them as elements of the cross-modal semantic graph along with +tokens from tweets. The sequential cross-modal semantic graph is represented as +a sequence with a multi-modal adjacency matrix indicating relationships between +elements. Experimental results show that the approach outperforms existing +methods and achieves state-of-the-art performance on two standard datasets. +Further analysis has demonstrated that the model can implicitly learn the +correlation between fine-grained information of the image and the text with the +given target. Our code is available at https://github.com/zjukg/SeqCSG. + +
<p>
+
+ comment: ICANN 2023, https://github.com/zjukg/SeqCSG +
+
+
+
+
+ + ♻ ☆ Deep Directly-Trained Spiking Neural Networks for Object Detection ICCV2023 + + +
+ Spiking neural networks (SNNs) are brain-inspired energy-efficient models +that encode information in spatiotemporal dynamics. Recently, deep SNNs trained +directly have shown great success in achieving high performance on +classification tasks with very few time steps. However, how to design a +directly-trained SNN for the regression task of object detection still remains +a challenging problem. To address this problem, we propose EMS-YOLO, a novel +directly-trained SNN framework for object detection, which is the first trial +to train a deep SNN with surrogate gradients for object detection rather than +ANN-SNN conversion strategies. Specifically, we design a full-spike residual +block, EMS-ResNet, which can effectively extend the depth of the +directly-trained SNN with low power consumption. Furthermore, we theoretically +analyze and prove the EMS-ResNet could avoid gradient vanishing or exploding. +The results demonstrate that our approach outperforms the state-of-the-art +ANN-SNN conversion methods (at least 500 time steps) in extremely fewer time +steps (only 4 time steps). It is shown that our model could achieve comparable +performance to the ANN with the same architecture while consuming 5.83 times +less energy on the frame-based COCO Dataset and the event-based Gen1 Dataset. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ A large calcium-imaging dataset reveals a systematic V4 organization for + natural scenes + + +
+ The visual system evolved to process natural scenes, yet most of our +understanding of the topology and function of visual cortex derives from +studies using artificial stimuli. To gain deeper insights into visual +processing of natural scenes, we utilized widefield calcium-imaging of primate +V4 in response to many natural images, generating a large dataset of +columnar-scale responses. We used this dataset to build a digital twin of V4 +via deep learning, generating a detailed topographical map of natural image +preferences at each cortical position. The map revealed clustered functional +domains for specific classes of natural image features. These ranged from +surface-related attributes like color and texture to shape-related features +such as edges, curvature, and facial features. We validated the model-predicted +domains with additional widefield calcium-imaging and single-cell resolution +two-photon imaging. Our study illuminates the detailed topological organization +and neural codes in V4 that represent natural scenes. + +
+
+ comment: 39 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation + Incorporating Gloss Information ACL 2023 + + +
+ Visual Word Sense Disambiguation (VWSD) is a task to find the image that most +accurately depicts the correct sense of the target word for the given context. +Previously, image-text matching models often suffered from recognizing +polysemous words. This paper introduces an unsupervised VWSD approach that uses +gloss information of an external lexical knowledge-base, especially the sense +definitions. Specifically, we suggest employing Bayesian inference to +incorporate the sense definitions when sense information of the answer is not +provided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we +propose a context-aware definition generation with GPT-3. Experimental results +show that the VWSD performance significantly increased with our Bayesian +inference-based approach. In addition, our context-aware definition generation +achieved prominent performance improvement in OOD examples exhibiting better +performance than the existing definition generation method. + +
+
+ comment: ACL 2023, https://aclanthology.org/2023.acl-long.88 +
+
+
+
+
+ + ♻ ☆ GaitRef: Gait Recognition with Refined Sequential Skeletons + + +
+ Identifying humans with their walking sequences, known as gait recognition, +is a useful biometric understanding task as it can be observed from a long +distance and does not require cooperation from the subject. Two common +modalities used for representing the walking sequence of a person are +silhouettes and joint skeletons. Silhouette sequences, which record the +boundary of the walking person in each frame, may suffer from appearance +variations caused by carried objects and the person's clothing. Framewise joint +detections are noisy and introduce some jitters that are not consistent with +sequential detections. In this paper, we combine the silhouettes and skeletons +and refine the framewise joint predictions for gait recognition using temporal +information from the silhouette sequences. We show that the refined skeletons +can improve gait recognition performance without extra annotations. We compare +our methods on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show +state-of-the-art performance. + +
<p>
+
+ comment: IJCB 2023. Code is available at + https://github.com/haidongz-usc/GaitRef +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ HeteFedRec: Federated Recommender Systems with Model Heterogeneity + + +
+ Owing to the nature of privacy protection, federated recommender systems +(FedRecs) have garnered increasing interest in the realm of on-device +recommender systems. However, most existing FedRecs only allow participating +clients to collaboratively train a recommendation model of the same public +parameter size. Training a model of the same size for all clients can lead to +suboptimal performance since clients possess varying resources. For example, +clients with limited training data may prefer to train a smaller recommendation +model to avoid excessive data consumption, while clients with sufficient data +would benefit from a larger model to achieve higher recommendation accuracy. To +address the above challenge, this paper introduces HeteFedRec, a novel FedRec +framework that enables the assignment of personalized model sizes to +participants. In HeteFedRec, we present a heterogeneous recommendation model +aggregation strategy, including a unified dual-task learning mechanism and a +dimensional decorrelation regularization, to allow knowledge aggregation among +recommender models of different sizes. Additionally, a relation-based ensemble +knowledge distillation method is proposed to effectively distil knowledge from +heterogeneous item embeddings. Extensive experiments conducted on three +real-world recommendation datasets demonstrate the effectiveness and efficiency +of HeteFedRec in training federated recommender systems under heterogeneous +settings. + +
+
+
+
+
+ + ☆ RRAML: Reinforced Retrieval Augmented Machine Learning + + +
+ The emergence of large language models (LLMs) has revolutionized machine +learning and related fields, showcasing remarkable abilities in comprehending, +generating, and manipulating human language. However, their conventional usage +through API-based text prompt submissions imposes certain limitations in terms +of context constraints and external source availability. To address these +challenges, we propose a novel framework called Reinforced Retrieval Augmented +Machine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs +with supporting information retrieved by a purpose-built retriever from a vast +user-provided database. By leveraging recent advancements in reinforcement +learning, our method effectively addresses several critical challenges. +Firstly, it circumvents the need for accessing LLM gradients. Secondly, our +method alleviates the burden of retraining LLMs for specific tasks, as it is +often impractical or impossible due to restricted access to the model and the +computational intensity involved. Additionally we seamlessly link the +retriever's task with the reasoner, mitigating hallucinations and reducing +irrelevant, and potentially damaging retrieved documents. We believe that the +research agenda outlined in this paper has the potential to profoundly impact +the field of AI, democratizing access to and utilization of LLMs for a wide +range of entities. + +
+
+
+
+
+ + ☆ Unbiased Delayed Feedback Label Correction for Conversion Rate + Prediction KDD 2023 + + +
+ Conversion rate prediction is critical to many online applications such as +digital display advertising. To capture dynamic data distribution, industrial +systems often require retraining models on recent data daily or weekly. +However, the delay of conversion behavior usually leads to incorrect labeling, +which is called delayed feedback problem. Existing work may fail to introduce +the correct information about false negative samples due to data sparsity and +dynamic data distribution. To directly introduce the correct feedback label +information, we propose an Unbiased delayed feedback Label Correction framework +(ULC), which uses an auxiliary model to correct labels for observed negative +feedback samples. Firstly, we theoretically prove that the label-corrected loss +is an unbiased estimate of the oracle loss using true labels. Then, as there +are no ready training data for label correction, counterfactual labeling is +used to construct artificial training data. Furthermore, since counterfactual +labeling utilizes only partial training data, we design an embedding-based +alternative training method to enhance performance. Comparative experiments on +both public and private datasets and detailed analyses show that our proposed +approach effectively alleviates the delayed feedback problem and consistently +outperforms the previous state-of-the-art methods. + +
+
+ comment: accepted by KDD 2023 +
+
+
+
+
+ + ☆ Self-refining of Pseudo Labels for Music Source Separation with Noisy + Labeled Data + + +
+ Music source separation (MSS) faces challenges due to the limited +availability of correctly-labeled individual instrument tracks. With the push +to acquire larger datasets to improve MSS performance, the inevitability of +encountering mislabeled individual instrument tracks becomes a significant +challenge to address. This paper introduces an automated technique for refining +the labels in a partially mislabeled dataset. Our proposed self-refining +technique, employed with a noisy-labeled dataset, results in only a 1% accuracy +degradation in multi-label instrument recognition compared to a classifier +trained on a clean-labeled dataset. The study demonstrates the importance of +refining noisy-labeled data in MSS model training and shows that utilizing the +refined dataset leads to results comparable to those derived from a clean-labeled +dataset. Notably, when only a noisy dataset is available, MSS models trained on a +self-refined dataset even outperform those trained on a dataset refined with a +classifier trained on clean labels. + +
<p>
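+
+ The self-refining idea generalizes beyond MSS; as a hedged, generic sketch (single-label
+ for brevity, whereas the paper works with multi-label instrument recognition), one can
+ iteratively retrain on the current labels and flip those the model contradicts with
+ high confidence.
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ def self_refine_labels(X, y_noisy, rounds=3, flip_conf=0.9, seed=0):
+     y = np.asarray(y_noisy).copy()
+     for _ in range(rounds):
+         clf = LogisticRegression(max_iter=1000, random_state=seed).fit(X, y)
+         proba = clf.predict_proba(X)
+         pred, conf = proba.argmax(axis=1), proba.max(axis=1)
+         flip = (conf >= flip_conf) & (pred != y)   # only overwrite confident disagreements
+         y[flip] = pred[flip]
+     return y
+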
+
+ comment: 24th International Society for Music Information Retrieval Conference + (ISMIR 2023) +
+
+
+
+
+ + ☆ FaFCNN: A General Disease Classification Framework Based on Feature + Fusion Neural Networks + + +
+ There are two fundamental problems in applying deep learning/machine learning +methods to disease classification tasks: one is the insufficient number and +poor quality of training samples; the other is how to effectively fuse features +from multiple sources and thus train robust classification models. To +address these problems, inspired by the way humans learn knowledge, we +propose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which +introduces a feature-aware interaction module and a feature alignment module +based on domain adversarial learning. This is a general framework for disease +classification, and FaFCNN improves the way existing methods obtain sample +correlation features. The experimental results show that training with +augmented features obtained by pre-training a gradient boosting decision tree +yields larger performance gains than random-forest-based methods. On the +low-quality dataset with a large amount of missing data in our setup, FaFCNN +achieves consistently optimal performance compared to competitive baselines. +In addition, extensive experiments demonstrate the robustness of the proposed +method and the effectiveness of each component of the model (accepted at IEEE +SMC 2023). + +
<p>
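+
+ The "augmented features obtained by pre-training a gradient boosting decision tree" can
+ be read as leaf-index features; one standard construction (which may differ from the
+ paper's) is sketched below.
+
+ import numpy as np
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.preprocessing import OneHotEncoder
+
+ def gbdt_augmented_features(X_train, y_train, X):
+     # Pre-train a GBDT, encode each sample by the leaves it reaches in every tree,
+     # and append the one-hot leaf encoding to the raw features.
+     gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3).fit(X_train, y_train)
+     leaves_train = gbdt.apply(X_train).reshape(len(X_train), -1)
+     leaves = gbdt.apply(X).reshape(len(X), -1)
+     enc = OneHotEncoder(handle_unknown="ignore").fit(leaves_train)
+     return np.hstack([np.asarray(X, dtype=float), enc.transform(leaves).toarray()])
+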
+
+
+
+
+ + ♻ ☆ PubMed and Beyond: Recent Advances and Best Practices in Biomedical + Literature Search + + +
+ Biomedical research yields a wealth of information, much of which is only +accessible through the literature. Consequently, literature search is an +essential tool for building on prior knowledge in clinical and biomedical +research. Although recent improvements in artificial intelligence have expanded +functionality beyond keyword-based search, these advances may be unfamiliar to +clinicians and researchers. In response, we present a survey of literature +search tools tailored to both general and specific information needs in +biomedicine, with the objective of helping readers efficiently fulfill their +information needs. We first examine the widely used PubMed search engine, +discussing recent improvements and continued challenges. We then describe +literature search tools catering to five specific information needs: 1. +Identifying high-quality clinical research for evidence-based medicine. 2. +Retrieving gene-related information for precision medicine and genomics. 3. +Searching by meaning, including natural language questions. 4. Locating related +articles with literature recommendation. 5. Mining literature to discover +associations between concepts such as diseases and genetic variants. +Additionally, we cover practical considerations and best practices for choosing +and using these tools. Finally, we provide a perspective on the future of +literature search engines, considering recent breakthroughs in large language +models such as ChatGPT. In summary, our survey provides a comprehensive view of +biomedical literature search functionalities with 36 publicly available tools. + +
+
+ comment: 27 pages, 6 figures, 36 tools +
+
+
+
+
+ + ♻ ☆ Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques + + +
+ In the contemporary digital landscape, online reviews have become an +indispensable tool for promoting products and services across various +businesses. Marketers, advertisers, and online businesses have found incentives +to create deceptive positive reviews for their products and negative reviews +for their competitors' offerings. As a result, the writing of deceptive reviews +has become an unavoidable practice for businesses seeking to promote themselves +or undermine their rivals. Detecting such deceptive reviews has become an +intense and ongoing area of research. This research paper proposes a machine +learning model to identify deceptive reviews, with a particular focus on +restaurants. This study delves into the performance of numerous experiments +conducted on a dataset of restaurant reviews known as the Deceptive Opinion +Spam Corpus. To accomplish this, an n-gram model and max features are developed +to effectively identify deceptive content, particularly focusing on fake +reviews. A benchmark study is undertaken to explore the performance of two +different feature extraction techniques, which are then coupled with five +distinct machine learning classification algorithms. The experimental results +reveal that the passive aggressive classifier stands out among the various +algorithms, showcasing the highest accuracy not only in text classification but +also in identifying fake reviews. Moreover, the research delves into data +augmentation and implements various deep learning techniques to further enhance +the process of detecting deceptive reviews. The findings shed light on the +efficacy of the proposed machine learning approach and offer valuable insights +into dealing with deceptive reviews in the realm of online businesses. + +
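+
+ The described pipeline (n-gram features with a capped vocabulary, fed to a
+ passive-aggressive classifier) is straightforward to assemble with scikit-learn; the two
+ toy reviews below are placeholders, not data from the Deceptive Opinion Spam Corpus.
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import PassiveAggressiveClassifier
+ from sklearn.pipeline import make_pipeline
+
+ model = make_pipeline(
+     TfidfVectorizer(ngram_range=(1, 2), max_features=20000),
+     PassiveAggressiveClassifier(max_iter=1000),
+ )
+ reviews = ["the food was amazing, best place ever!!", "honestly an average experience"]
+ labels = [1, 0]   # 1 = deceptive, 0 = truthful (placeholder labels)
+ model.fit(reviews, labels)
+ print(model.predict(["totally the best restaurant in the world, unbelievable!!"]))
+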
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+
+
+
+ + Machine Learning 108 + +
+
+
+ + ☆ Parallel $Q$-Learning: Scaling Off-policy Reinforcement Learning under + Massively Parallel Simulation ICML 2023 + + +
+ Reinforcement learning is time-consuming for complex tasks due to the need +for large amounts of training data. Recent advances in GPU-based simulation, +such as Isaac Gym, have sped up data collection thousands of times on a +commodity GPU. Most prior works used on-policy methods like PPO due to their +simplicity and ease of scaling. Off-policy methods are more data efficient but +challenging to scale, resulting in a longer wall-clock training time. This +paper presents a Parallel $Q$-Learning (PQL) scheme that outperforms PPO in +wall-clock time while maintaining superior sample efficiency of off-policy +learning. PQL achieves this by parallelizing data collection, policy learning, +and value learning. Different from prior works on distributed off-policy +learning, such as Apex, our scheme is designed specifically for massively +parallel GPU-based simulation and optimized to work on a single workstation. In +experiments, we demonstrate that $Q$-learning can be scaled to \textit{tens of +thousands of parallel environments} and investigate important factors affecting +learning speed. The code is available at https://github.com/Improbable-AI/pql. + +
+
+ comment: Accepted by ICML 2023 +
+
+
+
+
+ + ☆ 3D-LLM: Injecting the 3D World into Large Language Models + + +
+ Large language models (LLMs) and Vision-Language Models (VLMs) have been +proven to excel at multiple tasks, such as commonsense reasoning. Powerful as +these models can be, they are not grounded in the 3D physical world, which +involves richer concepts such as spatial relationships, affordances, physics, +layout, and so on. In this work, we propose to inject the 3D world into large +language models and introduce a whole new family of 3D-LLMs. Specifically, +3D-LLMs can take 3D point clouds and their features as input and perform a +diverse set of 3D-related tasks, including captioning, dense captioning, 3D +question answering, task decomposition, 3D grounding, 3D-assisted dialog, +navigation, and so on. Using three types of prompting mechanisms that we +design, we are able to collect over 300k 3D-language data covering these tasks. +To efficiently train 3D-LLMs, we first utilize a 3D feature extractor that +obtains 3D features from rendered multi-view images. Then, we use 2D VLMs as +our backbones to train our 3D-LLMs. By introducing a 3D localization mechanism, +3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show +that our model outperforms state-of-the-art baselines by a large margin (e.g., +the BLEU-1 score surpasses the state-of-the-art score by 9%). Furthermore, +experiments on our held-in datasets for 3D captioning, task composition, and +3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative +examples also show that our model could perform more tasks beyond the scope of +existing LLMs and VLMs. Project Page: https://vis-www.cs.umass.edu/3dllm/. + +
<p>
+
+ comment: Project Page: https://vis-www.cs.umass.edu/3dllm/ +
<p>
+
+
+
+
+ + ☆ An Isometric Stochastic Optimizer + + +
+ The Adam optimizer is the standard choice in deep learning applications. I +propose a simple explanation of Adam's success: it makes each parameter's step +size independent of the norms of the other parameters. Based on this principle +I derive Iso, a new optimizer which makes the norm of a parameter's update +invariant to the application of any linear transformation to its inputs and +outputs. I develop a variant of Iso called IsoAdam that allows optimal +hyperparameters to be transferred from Adam, and demonstrate that IsoAdam +obtains a speedup over Adam when training a small Transformer. + +
+
+
+
+
+ + ☆ Provable Benefits of Policy Learning from Human Preferences in + Contextual Bandit Problems + + +
+ A crucial task in decision-making problems is reward engineering. It is +common in practice that no obvious choice of reward function exists. Thus, a +popular approach is to introduce human feedback during training and leverage +such feedback to learn a reward function. Among all policy learning methods +that use human feedback, preference-based methods have demonstrated substantial +success in recent empirical applications such as InstructGPT. In this work, we +develop a theory that provably shows the benefits of preference-based methods +in offline contextual bandits. In particular, we improve the modeling and +suboptimality analysis for running policy learning methods on human-scored +samples directly. Then, we compare it with the suboptimality guarantees of +preference-based methods and show that preference-based methods enjoy lower +suboptimality. + +
+
+
+
+
+ + ☆ Big Data - Supply Chain Management Framework for Forecasting: Data + Preprocessing and Machine Learning Techniques + + +
+ This article intends to systematically identify and comparatively analyze +state-of-the-art supply chain (SC) forecasting strategies and technologies. A +novel framework has been proposed incorporating Big Data Analytics in SC +Management (problem identification, data sources, exploratory data analysis, +machine-learning model training, hyperparameter tuning, performance evaluation, +and optimization), and forecasting effects on the human workforce, inventory, +and the overall SC. Initially, the need to collect data according to the SC +strategy, and how to collect it, is discussed. The article discusses the need for +different types of forecasting according to the period or SC objective. The SC +KPIs and the error-measurement systems have been recommended to optimize the +top-performing model. The adverse effects of phantom inventory on forecasting +and the dependence of managerial decisions on the SC KPIs for determining model +performance parameters and improving operations management, transparency, and +planning efficiency have been illustrated. The cyclic connection within the +framework introduces preprocessing optimization based on the post-process KPIs, +optimizing the overall control process (inventory management, workforce +determination, cost, production and capacity planning). The contribution of +this research lies in the proposed standard SC process framework, the recommended +forecasting data analysis, the analysis of forecasting effects on SC performance, +the machine-learning algorithm optimization that follows, and in shedding light +on future research. + +
<p>
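+
+ A minimal sketch of the forecasting-and-KPI loop the framework describes: lag features,
+ a time-ordered split (no shuffling), a gradient-boosted model, and MAPE/RMSE as error
+ measures. The series and column names are synthetic placeholders.
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
+
+ df = pd.DataFrame({"demand": np.random.default_rng(0).poisson(100, 120).astype(float)})
+ for lag in (1, 2, 4):
+     df[f"lag_{lag}"] = df["demand"].shift(lag)
+ df = df.dropna()
+
+ split = int(len(df) * 0.8)                       # time-ordered split, no shuffling
+ train, test = df.iloc[:split], df.iloc[split:]
+ features = [c for c in df.columns if c != "demand"]
+
+ model = GradientBoostingRegressor().fit(train[features], train["demand"])
+ pred = model.predict(test[features])
+ mape = mean_absolute_percentage_error(test["demand"], pred)
+ rmse = np.sqrt(mean_squared_error(test["demand"], pred))
+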
+
+
+
+
+ + ☆ A Connection between One-Step Regularization and Critic Regularization + in Reinforcement Learning ICML 2023 + + +
+ As with any machine learning problem with limited data, effective offline RL +algorithms require careful regularization to avoid overfitting. One-step +methods perform regularization by doing just a single step of policy +improvement, while critic regularization methods do many steps of policy +improvement with a regularized objective. These methods appear distinct. +One-step methods, such as advantage-weighted regression and conditional +behavioral cloning, truncate policy iteration after just one step. This "early +stopping" makes one-step RL simple and stable, but can limit its asymptotic +performance. Critic regularization typically requires more compute but has +appealing lower-bound guarantees. In this paper, we draw a close connection +between these methods: applying a multi-step critic regularization method with +a regularization coefficient of 1 yields the same policy as one-step RL. While +practical implementations violate our assumptions and critic regularization is +typically applied with smaller regularization coefficients, our experiments +nevertheless show that our analysis makes accurate, testable predictions about +practical offline RL methods (CQL and one-step RL) with commonly-used +hyperparameters. Our results do not imply that every problem can be solved with +a single step of policy improvement, but rather that one-step RL might be +competitive with critic regularization on RL problems that demand strong +regularization. + +
<p>
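+
+ The one-step methods named above (e.g., advantage-weighted regression) admit a compact
+ sketch: compute exponential advantage weights from a critic and fit the policy by
+ weighted behavioral cloning, i.e., a single step of policy improvement. This is a
+ generic illustration, not the paper's experimental setup.
+
+ import numpy as np
+
+ def awr_weights(q_values, actions, beta=1.0):
+     # q_values: (n, n_actions) critic estimates; actions: (n,) logged actions.
+     v = q_values.mean(axis=1)                              # simple value baseline
+     adv = q_values[np.arange(len(actions)), actions] - v
+     w = np.exp(adv / beta)
+     return w / w.mean()                                    # normalized cloning weights
+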
+
+ comment: Accepted to ICML 2023. Video + (https://www.youtube.com/watch?v=1xlixIHZ0R4) and code + (https://github.com/ben-eysenbach/ac-connection) +
+
+
+
+
+ + ☆ Learning Dense Correspondences between Photos and Sketches ICML 2023 + + +
+ Humans effortlessly grasp the connection between sketches and real-world +objects, even when these sketches are far from realistic. Moreover, human +sketch understanding goes beyond categorization -- critically, it also entails +understanding how individual elements within a sketch correspond to parts of +the physical world it represents. What are the computational ingredients needed +to support this ability? Towards answering this question, we make two +contributions: first, we introduce a new sketch-photo correspondence benchmark, +$\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across +125 object categories, augmenting the existing Sketchy dataset with +fine-grained correspondence metadata. Second, we propose a self-supervised +method for learning dense correspondences between sketch-photo pairs, building +upon recent advances in correspondence learning for pairs of photos. Our model +uses a spatial transformer network to estimate the warp flow between latent +representations of a sketch and photo extracted by a contrastive learning-based +ConvNet backbone. We found that this approach outperformed several strong +baselines and produced predictions that were quantitatively consistent with +other warp-based methods. However, our benchmark also revealed systematic +differences between predictions of the suite of models we tested and those of +humans. Taken together, our work suggests a promising path towards developing +artificial systems that achieve more human-like understanding of visual images +at different levels of abstraction. Project page: +https://photo-sketch-correspondence.github.io + +
+
+ comment: Accepted to ICML 2023. Project page: + https://photo-sketch-correspondence.github.io +
+
+
+
+
+ + ☆ Efficiently Sampling the PSD Cone with the Metric Dikin Walk + + +
+ Semi-definite programs represent a frontier of efficient computation. While +there has been much progress on semi-definite optimization, with moderate-sized +instances currently solvable in practice by the interior-point method, the +basic problem of sampling semi-definite solutions remains a formidable +challenge. The direct application of known polynomial-time algorithms for +sampling general convex bodies to semi-definite sampling leads to a +prohibitively high running time. In addition, known general methods require an +expensive rounding phase as pre-processing. Here we analyze the Dikin walk, by +first adapting it to general metrics, then devising suitable metrics for the +PSD cone with affine constraints. The resulting mixing time and per-step +complexity are considerably smaller, and by an appropriate choice of the +metric, the dependence on the number of constraints can be made +polylogarithmic. We introduce a refined notion of self-concordant matrix +functions and give rules for combining different metrics. Along the way, we +further develop the theory of interior-point methods for sampling. + +
+
+ comment: 54 pages +
+
+
+
+
+ + ☆ On Privileged and Convergent Bases in Neural Network Representations ICML 2023 + + +
+ In this study, we investigate whether the representations learned by neural +networks possess a privileged and convergent basis. Specifically, we examine +the significance of feature directions represented by individual neurons. +First, we establish that arbitrary rotations of neural representations cannot +be inverted (unlike linear networks), indicating that they do not exhibit +complete rotational invariance. Subsequently, we explore the possibility of +multiple bases achieving identical performance. To do this, we compare the +bases of networks trained with the same parameters but with varying random +initializations. Our study reveals two findings: (1) Even in wide networks such +as WideResNets, neural networks do not converge to a unique basis; (2) Basis +correlation increases significantly when a few early layers of the network are +frozen identically. + Furthermore, we analyze Linear Mode Connectivity, which has been studied as a +measure of basis correlation. Our findings give evidence that while Linear Mode +Connectivity improves with increased network width, this improvement is not due +to an increase in basis correlation. + +
+
+ comment: In the Workshop on High-dimensional Learning Dynamics at ICML 2023 +
+
+
+
+
+ + ☆ Contextual Bandits and Imitation Learning via Preference-Based Active + Queries + + +
+ We consider the problem of contextual bandits and imitation learning, where +the learner lacks direct knowledge of the executed action's reward. Instead, +the learner can actively query an expert at each round to compare two actions +and receive noisy preference feedback. The learner's objective is two-fold: to +minimize the regret associated with the executed actions, while simultaneously, +minimizing the number of comparison queries made to the expert. In this paper, +we assume that the learner has access to a function class that can represent +the expert's preference model under appropriate link functions, and provide an +algorithm that leverages an online regression oracle with respect to this +function class for choosing its actions and deciding when to query. For the +contextual bandit setting, our algorithm achieves a regret bound that combines +the best of both worlds, scaling as $O(\min\{\sqrt{T}, d/\Delta\})$, where $T$ +represents the number of interactions, $d$ represents the eluder dimension of +the function class, and $\Delta$ represents the minimum preference of the +optimal action over any suboptimal action under all contexts. Our algorithm +does not require the knowledge of $\Delta$, and the obtained regret bound is +comparable to what can be achieved in the standard contextual bandits setting +where the learner observes reward signals at each round. Additionally, our +algorithm makes only $O(\min\{T, d^2/\Delta^2\})$ queries to the expert. We +then extend our algorithm to the imitation learning setting, where the learning +agent engages with an unknown environment in episodes of length $H$ each, and +provide similar guarantees for regret and query complexity. Interestingly, our +algorithm for imitation learning can even learn to outperform the underlying +expert, when it is suboptimal, highlighting a practical benefit of +preference-based feedback in imitation learning. + +
+
+
+
+
+ + ☆ QAmplifyNet: Pushing the Boundaries of Supply Chain Backorder Prediction + Using Interpretable Hybrid Quantum - Classical Neural Network + + +
+ Supply chain management relies on accurate backorder prediction for +optimizing inventory control, reducing costs, and enhancing customer +satisfaction. However, traditional machine-learning models struggle with +large-scale datasets and complex relationships, hindering real-world data +collection. This research introduces a novel methodological framework for +supply chain backorder prediction, addressing the challenge of handling large +datasets. Our proposed model, QAmplifyNet, employs quantum-inspired techniques +within a quantum-classical neural network to predict backorders effectively on +short and imbalanced datasets. Experimental evaluations on a benchmark dataset +demonstrate QAmplifyNet's superiority over classical models, quantum ensembles, +quantum neural networks, and deep reinforcement learning. Its proficiency in +handling short, imbalanced datasets makes it an ideal solution for supply chain +management. To enhance model interpretability, we use Explainable Artificial +Intelligence techniques. Practical implications include improved inventory +control, reduced backorders, and enhanced operational efficiency. QAmplifyNet +seamlessly integrates into real-world supply chain management systems, enabling +proactive decision-making and efficient resource allocation. Future work +involves exploring additional quantum-inspired techniques, expanding the +dataset, and investigating other supply chain applications. This research +unlocks the potential of quantum computing in supply chain optimization and +paves the way for further exploration of quantum-inspired machine learning +models in supply chain management. Our framework and QAmplifyNet model offer a +breakthrough approach to supply chain backorder prediction, providing superior +performance and opening new avenues for leveraging quantum-inspired techniques +in supply chain management. + +
+
+
+
+
+ + ☆ Universal Approximation Theorem and error bounds for quantum neural + networks and quantum reservoirs + + +
+ Universal approximation theorems are the foundations of classical neural +networks, providing theoretical guarantees that the latter are able to +approximate maps of interest. Recent results have shown that this can also be +achieved in a quantum setting, whereby classical functions can be approximated +by parameterised quantum circuits. We provide here precise error bounds for +specific classes of functions and extend these results to the interesting new +setup of randomised quantum circuits, mimicking classical reservoir neural +networks. Our results show in particular that a quantum neural network with +$\mathcal{O}(\varepsilon^{-2})$ weights and $\mathcal{O} (\lceil +\log_2(\varepsilon^{-1}) \rceil)$ qubits suffices to achieve accuracy +$\varepsilon>0$ when approximating functions with integrable Fourier transform. + +
+
+ comment: 20 pages, 0 figure +
+
+
+
+
+ + ☆ Anytime Model Selection in Linear Bandits + + +
+ Model selection in the context of bandit optimization is a challenging +problem, as it requires balancing exploration and exploitation not only for +action selection, but also for model selection. One natural approach is to rely +on online learning algorithms that treat different models as experts. Existing +methods, however, scale poorly ($\text{poly}M$) with the number of models $M$ +in terms of their regret. Our key insight is that, for model selection in +linear bandits, we can emulate full-information feedback to the online learner +with a favorable bias-variance trade-off. This allows us to develop ALEXP, +which has an exponentially improved ($\log M$) dependence on $M$ for its +regret. ALEXP has anytime guarantees on its regret, and neither requires +knowledge of the horizon $n$, nor relies on an initial purely exploratory +stage. Our approach utilizes a novel time-uniform analysis of the Lasso, +establishing a new connection between online learning and high-dimensional +statistics. + +
+
+ comment: 37 pages, 7 figures +
+
+
+
+
+ + ☆ A Statistical View of Column Subset Selection + + +
+ We consider the problem of selecting a small subset of representative +variables from a large dataset. In the computer science literature, this +dimensionality reduction problem is typically formalized as Column Subset +Selection (CSS). Meanwhile, the typical statistical formalization is to find an +information-maximizing set of Principal Variables. This paper shows that these +two approaches are equivalent, and moreover, both can be viewed as maximum +likelihood estimation within a certain semi-parametric model. Using these +connections, we show how to efficiently (1) perform CSS using only summary +statistics from the original dataset; (2) perform CSS in the presence of +missing and/or censored data; and (3) select the subset size for CSS in a +hypothesis testing framework. + +
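As an illustration of point (1) in the abstract, the sketch below runs a greedy Column Subset Selection using only a summary statistic of the data (its sample covariance). The greedy criterion and Schur-complement deflation are a standard choice and not necessarily the authors' exact estimator.

```python
# Greedy CSS from the covariance matrix alone (illustrative, not the paper's estimator).
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20)) @ rng.normal(size=(20, 20))   # correlated columns
C = np.cov(X, rowvar=False)                                  # summary statistic only

def greedy_css(C, k):
    C = C.copy()
    selected = []
    for _ in range(k):
        scores = (C ** 2).sum(axis=0) / np.maximum(np.diag(C), 1e-12)
        scores[selected] = -np.inf               # never re-pick a column
        j = int(np.argmax(scores))
        selected.append(j)
        C = C - np.outer(C[:, j], C[j, :]) / C[j, j]   # deflate the explained variance
    return selected

print("selected columns:", greedy_css(C, k=5))
```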
+
+
+
+
+ + ☆ Data-free Black-box Attack based on Diffusion Model + + +
+ Since the training data for the target model in a data-free black-box attack +is not available, most recent schemes utilize GANs to generate data for +training a substitute model. However, these GANs-based schemes suffer from low +training efficiency as the generator needs to be retrained for each target +model during the substitute training process, as well as low generation +quality. To overcome these limitations, we consider utilizing the diffusion +model to generate data, and propose a data-free black-box attack scheme based +on a diffusion model to improve the efficiency and accuracy of substitute +training. Although the data generated by the diffusion model exhibits high +quality, it presents diverse domain distributions and contains many samples +that do not meet the discriminative criteria of the target model. To further +facilitate the diffusion model to generate data suitable for the target model, +we propose a Latent Code Augmentation (LCA) method to guide the diffusion model +in generating data. With the guidance of LCA, the data generated by the +diffusion model not only meets the discriminative criteria of the target model +but also exhibits high diversity. By utilizing this data, it is possible to +train substitute models that closely resemble the target model more efficiently. +Extensive experiments demonstrate that our LCA achieves higher attack success +rates and requires fewer query budgets compared to GANs-based schemes for +different target models. + +
+
+
+
+
+ + ☆ Stochastic Step-wise Feature Selection for Exponential Random Graph + Models (ERGMs) + + +
+ Statistical analysis of social networks provides valuable insights into +complex network interactions across various scientific disciplines. However, +accurate modeling of networks remains challenging due to the heavy +computational burden and the need to account for observed network dependencies. +Exponential Random Graph Models (ERGMs) have emerged as a promising technique +used in social network modeling to capture network dependencies by +incorporating endogenous variables. Nevertheless, using ERGMs poses multiple +challenges, including the occurrence of ERGM degeneracy, which generates +unrealistic and meaningless network structures. To address these challenges and +enhance the modeling of collaboration networks, we propose and test a novel +approach that focuses on endogenous variable selection within ERGMs. Our method +aims to overcome the computational burden and improve the accommodation of +observed network dependencies, thereby facilitating more accurate and +meaningful interpretations of network phenomena in various scientific fields. +We conduct empirical testing and rigorous analysis to contribute to the +advancement of statistical techniques and offer practical insights for network +analysis. + +
+
+ comment: 23 pages, 6 tables and 18 figures +
+
+
+
+
+ + ☆ A Real-World WebAgent with Planning, Long Context Understanding, and + Program Synthesis + + +
+ Pre-trained large language models (LLMs) have recently achieved better +generalization and sample efficiency in autonomous web navigation. However, the +performance on real-world websites has still suffered from (1) open domainness, +(2) limited context length, and (3) lack of inductive bias on HTML. We +introduce WebAgent, an LLM-driven agent that can complete the tasks on real +websites following natural language instructions. WebAgent plans ahead by +decomposing instructions into canonical sub-instructions, summarizes long HTML +documents into task-relevant snippets, and acts on websites via generated +Python programs from those. We design WebAgent with Flan-U-PaLM, for grounded +code generation, and HTML-T5, new pre-trained LLMs for long HTML documents +using local and global attention mechanisms and a mixture of long-span +denoising objectives, for planning and summarization. We empirically +demonstrate that our recipe improves the success on a real website by over 50%, +and that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9% +higher success rate than prior SoTA on the MiniWoB web navigation benchmark and +better accuracy on offline task planning evaluation. + +
+
+
+
+
+ + ☆ Early Neuron Alignment in Two-layer ReLU Networks with Small + Initialization + + +
+ This paper studies the problem of training a two-layer ReLU network for +binary classification using gradient flow with small initialization. We +consider a training dataset with well-separated input vectors: Any pair of +input data with the same label are positively correlated, and any pair with +different labels are negatively correlated. Our analysis shows that, during the +early phase of training, neurons in the first layer try to align with either +the positive data or the negative data, depending on its corresponding weight +on the second layer. A careful analysis of the neurons' directional dynamics +allows us to provide an $\mathcal{O}(\frac{\log n}{\sqrt{\mu}})$ upper bound on +the time it takes for all neurons to achieve good alignment with the input +data, where $n$ is the number of data points and $\mu$ measures how well the +data are separated. After the early alignment phase, the loss converges to zero +at a $\mathcal{O}(\frac{1}{t})$ rate, and the weight matrix on the first layer +is approximately low-rank. Numerical experiments on the MNIST dataset +illustrate our theoretical findings. + +
+
+
+
+
+ + ☆ Efficiently Learning One-Hidden-Layer ReLU Networks via Schur + Polynomials + + +
+ We study the problem of PAC learning a linear combination of $k$ ReLU +activations under the standard Gaussian distribution on $\mathbb{R}^d$ with +respect to the square loss. Our main result is an efficient algorithm for this +learning task with sample and computational complexity $(dk/\epsilon)^{O(k)}$, +where $\epsilon>0$ is the target accuracy. Prior work had given an algorithm +for this problem with complexity $(dk/\epsilon)^{h(k)}$, where the function +$h(k)$ scales super-polynomially in $k$. Interestingly, the complexity of our +algorithm is near-optimal within the class of Correlational Statistical Query +algorithms. At a high-level, our algorithm uses tensor decomposition to +identify a subspace such that all the $O(k)$-order moments are small in the +orthogonal directions. Its analysis makes essential use of the theory of Schur +polynomials to show that the higher-moment error tensors are small given that +the lower-order ones are. + +
+
+
+
+
+ + ☆ Learning Provably Robust Estimators for Inverse Problems via Jittering + + +
+ Deep neural networks provide excellent performance for inverse problems such +as denoising. However, neural networks can be sensitive to adversarial or +worst-case perturbations. This raises the question of whether such networks can +be trained efficiently to be worst-case robust. In this paper, we investigate +whether jittering, a simple regularization technique that adds isotropic +Gaussian noise during training, is effective for learning worst-case robust +estimators for inverse problems. While well studied for prediction in +classification tasks, the effectiveness of jittering for inverse problems has +not been systematically investigated. In this paper, we present a novel +analytical characterization of the optimal $\ell_2$-worst-case robust estimator +for linear denoising and show that jittering yields optimal robust denoisers. +Furthermore, we examine jittering empirically via training deep neural networks +(U-nets) for natural image denoising, deconvolution, and accelerated magnetic +resonance imaging (MRI). The results show that jittering significantly enhances +the worst-case robustness, but can be suboptimal for inverse problems beyond +denoising. Moreover, our results imply that training on real data which often +contains slight noise is somewhat robustness enhancing. + +
+
+
+
+
+ + ☆ Causal Fair Machine Learning via Rank-Preserving Interventional + Distributions + + +
+ A decision can be defined as fair if equal individuals are treated equally +and unequals unequally. Adopting this definition, the task of designing machine +learning models that mitigate unfairness in automated decision-making systems +must include causal thinking when introducing protected attributes. Following a +recent proposal, we define individuals as being normatively equal if they are +equal in a fictitious, normatively desired (FiND) world, where the protected +attribute has no (direct or indirect) causal effect on the target. We propose +rank-preserving interventional distributions to define an estimand of this FiND +world and a warping method for estimation. Evaluation criteria for both the +method and resulting model are presented and validated through simulations and +empirical data. With this, we show that our warping approach effectively +identifies the most discriminated individuals and mitigates unfairness. + +
+
+
+
+
+ + ☆ Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution + for Medical Image Classification + + +
+ Graph-based neural network models are gaining traction in the field of +representation learning due to their ability to uncover latent topological +relationships between entities that are otherwise challenging to identify. +These models have been employed across a diverse range of domains, encompassing +drug discovery, protein interactions, semantic segmentation, and fluid dynamics +research. In this study, we investigate the potential of Graph Neural Networks +(GNNs) for medical image classification. We introduce a novel model that +combines GNNs and edge convolution, leveraging the interconnectedness of RGB +channel feature values to strongly represent connections between crucial graph +nodes. Our proposed model not only performs on par with state-of-the-art Deep +Neural Networks (DNNs) but does so with 1000 times fewer parameters, resulting +in reduced training time and data requirements. We compare our Graph +Convolutional Neural Network (GCNN) to pre-trained DNNs for classifying +MedMNIST dataset classes, revealing promising prospects for GNNs in medical +image analysis. Our results also encourage further exploration of advanced +graph-based models such as Graph Attention Networks (GAT) and Graph +Auto-Encoders in the medical imaging domain. The proposed model yields more +reliable, interpretable, and accurate outcomes for tasks like semantic +segmentation and image classification compared to simpler GCNNs + +
+
+
+
+
+ + ☆ Analyzing the Strategy of Propaganda using Inverse Reinforcement + Learning: Evidence from the 2022 Russian Invasion of Ukraine + + +
+ The 2022 Russian invasion of Ukraine was accompanied by a large-scale, +pro-Russian propaganda campaign on social media. However, the strategy behind +the dissemination of propaganda has remained unclear, particularly how the +online discourse was strategically shaped by the propagandists' community. +Here, we analyze the strategy of the Twitter community using an inverse +reinforcement learning (IRL) approach. Specifically, IRL allows us to model +online behavior as a Markov decision process, where the goal is to infer the +underlying reward structure that guides propagandists when interacting with +users with a supporting or opposing stance toward the invasion. Thereby, we aim +to understand empirically whether and how between-user interactions are +strategically used to promote the proliferation of Russian propaganda. For +this, we leverage a large-scale dataset with 349,455 posts with pro-Russian +propaganda from 132,131 users. We show that bots and humans follow a different +strategy: bots respond predominantly to pro-invasion messages, suggesting that +they seek to drive virality; while messages indicating opposition primarily +elicit responses from humans, suggesting that they tend to engage in critical +discussions. To the best of our knowledge, this is the first study analyzing +the strategy behind propaganda from the 2022 Russian invasion of Ukraine +through the lens of IRL. + +
+
+
+
+
+ + ☆ Is attention all you need in medical image analysis? A review + + +
+ Medical imaging is a key component in clinical diagnosis, treatment planning +and clinical trial design, accounting for almost 90% of all healthcare data. +CNNs achieved performance gains in medical image analysis (MIA) over the last +years. CNNs can efficiently model local pixel interactions and be trained on +small-scale MI data. The main disadvantage of typical CNN models is that they +ignore global pixel relationships within images, which limits their +generalisation ability to understand out-of-distribution data with different +'global' information. The recent progress of Artificial Intelligence gave rise +to Transformers, which can learn global relationships from data. However, full +Transformer models need to be trained on large-scale data and involve +tremendous computational complexity. Attention and Transformer compartments +(Transf/Attention) which can well maintain properties for modelling global +relationships, have been proposed as lighter alternatives of full Transformers. +Recently, there is an increasing trend to co-pollinate complementary +local-global properties from CNN and Transf/Attention architectures, which led +to a new era of hybrid models. The past years have witnessed substantial growth +in hybrid CNN-Transf/Attention models across diverse MIA problems. In this +systematic review, we survey existing hybrid CNN-Transf/Attention models, +review and unravel key architectural designs, analyse breakthroughs, and +evaluate current and future opportunities as well as challenges. We also +introduced a comprehensive analysis framework on generalisation opportunities +of scientific and clinical impact, based on which new data-driven domain +generalisation and adaptation methods can be stimulated. + +
+
+
+
+
+ + ☆ Detecting disturbances in network-coupled dynamical systems with machine + learning + + +
+ Identifying disturbances in network-coupled dynamical systems without +knowledge of the disturbances or underlying dynamics is a problem with a wide +range of applications. For example, one might want to know which nodes in the +network are being disturbed and identify the type of disturbance. Here we +present a model-free method based on machine learning to identify such unknown +disturbances based only on prior observations of the system when forced by a +known training function. We find that this method is able to identify the +locations and properties of many different types of unknown disturbances using +a variety of known forcing functions. We illustrate our results both with +linear and nonlinear disturbances using food web and neuronal activity models. +Finally, we discuss how to scale our method to large networks. + +
+
+
+
+
+ + ☆ Nonparametric Linear Feature Learning in Regression Through + Regularisation + + +
+ Representation learning plays a crucial role in automated feature selection, +particularly in the context of high-dimensional data, where non-parametric +methods often struggle. In this study, we focus on supervised learning +scenarios where the pertinent information resides within a lower-dimensional +linear subspace of the data, namely the multi-index model. If this subspace +were known, it would greatly enhance prediction, computation, and +interpretation. To address this challenge, we propose a novel method for linear +feature learning with non-parametric prediction, which simultaneously estimates +the prediction function and the linear subspace. Our approach employs empirical +risk minimisation, augmented with a penalty on function derivatives, ensuring +versatility. Leveraging the orthogonality and rotation invariance properties of +Hermite polynomials, we introduce our estimator, named RegFeaL. By utilising +alternative minimisation, we iteratively rotate the data to improve alignment +with leading directions and accurately estimate the relevant dimension in +practical settings. We establish that our method yields a consistent estimator +of the prediction function with explicit rates. Additionally, we provide +empirical results demonstrating the performance of RegFeaL in various +experiments. + +
+
+ comment: 43 pages, 16 figures +
+
+
+
+
+ + ☆ Concept-based explainability for an EEG transformer model + + +
+ Deep learning models are complex due to their size, structure, and inherent +randomness in training procedures. Additional complexity arises from the +selection of datasets and inductive biases. Addressing these challenges for +explainability, Kim et al. (2018) introduced Concept Activation Vectors (CAVs), +which aim to understand deep models' internal states in terms of human-aligned +concepts. These concepts correspond to directions in latent space, identified +using linear discriminants. Although this method was first applied to image +classification, it was later adapted to other domains, including natural +language processing. In this work, we attempt to apply the method to +electroencephalogram (EEG) data for explainability in Kostas et al.'s BENDR +(2021), a large-scale transformer model. A crucial part of this endeavor +involves defining the explanatory concepts and selecting relevant datasets to +ground concepts in the latent space. Our focus is on two mechanisms for EEG +concept formation: the use of externally labeled EEG datasets, and the +application of anatomically defined concepts. The former approach is a +straightforward generalization of methods used in image classification, while +the latter is novel and specific to EEG. We present evidence that both +approaches to concept formation yield valuable insights into the +representations learned by deep EEG models. + +
+
+ comment: To appear in proceedings of 2023 IEEE International workshop on + Machine Learning for Signal Processing +
+
+
+
+
+ + ☆ Safety Performance of Neural Networks in the Presence of Covariate Shift + + +
+ Covariate shift may impact the operational safety performance of neural +networks. A re-evaluation of the safety performance, however, requires +collecting new operational data and creating corresponding ground truth labels, +which often is not possible during operation. We are therefore proposing to +reshape the initial test set, as used for the safety performance evaluation +prior to deployment, based on an approximation of the operational data. This +approximation is obtained by observing and learning the distribution of +activation patterns of neurons in the network during operation. The reshaped +test set reflects the distribution of neuron activation values as observed +during operation, and may therefore be used for re-evaluating safety +performance in the presence of covariate shift. First, we derive conservative +bounds on the values of neurons by applying finite binning and static dataflow +analysis. Second, we formulate a mixed integer linear programming (MILP) +constraint for constructing the minimum set of data points to be removed in the +test set, such that the difference between the discretized test and operational +distributions is bounded. We discuss potential benefits and limitations of this +constraint-based approach based on our initial experience with an implemented +research prototype. + +
+
+
+
+
+ + ☆ Policy Gradient Optimal Correlation Search for Variance Reduction in + Monte Carlo simulation and Maximum Optimal Transport + + +
+ We propose a new algorithm for variance reduction when estimating $f(X_T)$ +where $X$ is the solution to some stochastic differential equation and $f$ is a +test function. The new estimator is $(f(X^1_T) + f(X^2_T))/2$, where $X^1$ and +$X^2$ have the same marginal law as $X$ but are pathwise correlated so as to +reduce the variance. The optimal correlation function $\rho$ is approximated by +a deep neural network and is calibrated along the trajectories of $(X^1, X^2)$ +by policy gradient and reinforcement learning techniques. Finding an optimal +coupling given marginal laws has links with maximum optimal transport. + +
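A minimal Python sketch of the paired estimator $(f(X^1_T) + f(X^2_T))/2$ for a geometric Brownian motion with pathwise-correlated drivers. Here the correlation is fixed at rho = -1 (antithetic sampling) rather than the neural-network correlation function calibrated by policy gradient in the paper; the SDE and test function are illustrative assumptions.

```python
# Correlated-pair Monte Carlo estimator with a fixed correlation (illustrative).
import numpy as np

rng = np.random.default_rng(0)
n_paths, n_steps, T = 100_000, 50, 1.0
mu, sigma, x0, K = 0.05, 0.2, 1.0, 1.0
dt = T / n_steps
f = lambda x: np.maximum(x - K, 0.0)             # test function (call payoff)

rho = -1.0                                       # fixed; the paper learns rho instead
x1 = np.full(n_paths, x0)
x2 = np.full(n_paths, x0)
for _ in range(n_steps):
    dw1 = rng.normal(scale=np.sqrt(dt), size=n_paths)
    dz = rng.normal(scale=np.sqrt(dt), size=n_paths)
    dw2 = rho * dw1 + np.sqrt(1.0 - rho**2) * dz  # same marginal law, correlated paths
    x1 += mu * x1 * dt + sigma * x1 * dw1
    x2 += mu * x2 * dt + sigma * x2 * dw2

naive = f(x1)                                    # plain Monte Carlo estimator
paired = 0.5 * (f(x1) + f(x2))                   # correlated-pair estimator
print("variance naive :", naive.var())
print("variance paired:", paired.var())
```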
+
+ comment: 7 pages +
+
+
+
+
+ + MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised + Learning of Motion and Content Features + + +
+ Self-supervised learning of visual representations has been focusing on +learning content features, which do not capture object motion or location, and +focus on identifying and differentiating objects in images and videos. On the +other hand, optical flow estimation is a task that does not involve +understanding the content of the images on which it is estimated. We unify the +two approaches and introduce MC-JEPA, a joint-embedding predictive architecture +and self-supervised learning approach to jointly learn optical flow and content +features within a shared encoder, demonstrating that the two associated +objectives, the optical flow estimation objective and the self-supervised +learning objective, benefit from each other and thus learn content features +that incorporate motion information. The proposed approach achieves performance +on-par with existing unsupervised optical flow benchmarks, as well as with +common self-supervised learning approaches on downstream tasks such as semantic +segmentation of images and videos. + +
+
+
+
+
+ + ☆ Addressing the Impact of Localized Training Data in Graph Neural + Networks + + +
+ Graph Neural Networks (GNNs) have achieved notable success in learning from +graph-structured data, owing to their ability to capture intricate dependencies +and relationships between nodes. They excel in various applications, including +semi-supervised node classification, link prediction, and graph generation. +However, it is important to acknowledge that the majority of state-of-the-art +GNN models are built upon the assumption of an in-distribution setting, which +hinders their performance on real-world graphs with dynamic structures. In this +article, we aim to assess the impact of training GNNs on localized subsets of +the graph. Such restricted training data may lead to a model that performs well +in the specific region it was trained on but fails to generalize and make +accurate predictions for the entire graph. In the context of graph-based +semi-supervised learning (SSL), resource constraints often lead to scenarios +where the dataset is large, but only a portion of it can be labeled, affecting +the model's performance. This limitation affects tasks like anomaly detection +or spam detection when labeling processes are biased or influenced by human +subjectivity. To tackle the challenges posed by localized training data, we +approach the problem as an out-of-distribution (OOD) data issue by aligning +the distributions between the training data, which represents a small portion +of labeled data, and the graph inference process that involves making +predictions for the entire graph. We propose a regularization method to +minimize distributional discrepancies between localized training data and graph +inference, improving model performance on OOD data. Extensive tests on popular +GNN models show significant performance improvement on three citation GNN +benchmark datasets. The regularization approach effectively enhances model +adaptation and generalization, overcoming challenges posed by OOD data. + +
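The abstract does not name the discrepancy measure behind its regularization method, so the sketch below uses one common choice, an RBF-kernel Maximum Mean Discrepancy between embeddings of the localized labeled nodes and of the whole graph, purely as an illustration of the idea rather than the paper's exact method.

```python
# Illustrative distribution-matching regularizer (MMD is an assumption, not the paper's choice).
import torch

def rbf_mmd2(x, y, sigma=1.0):
    """Biased squared MMD between embedding sets x: (n, d) and y: (m, d)."""
    def k(a, b):
        return torch.exp(-torch.cdist(a, b) ** 2 / (2.0 * sigma ** 2))
    return k(x, x).mean() + k(y, y).mean() - 2.0 * k(x, y).mean()

torch.manual_seed(0)
train_emb = torch.randn(64, 16) + 1.0          # embeddings of the localized labeled nodes
graph_emb = torch.randn(1024, 16)              # embeddings of all nodes seen at inference

discrepancy = rbf_mmd2(train_emb, graph_emb)
task_loss = torch.tensor(0.0)                  # placeholder for the supervised loss
loss = task_loss + 0.1 * discrepancy           # regularized objective (backpropagated in a real loop)
print("MMD^2 between localized training nodes and the full graph:", float(discrepancy))
```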
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ An Estimator for the Sensitivity to Perturbations of Deep Neural + Networks + + +
+ For Deep Neural Networks (DNNs) to become useful in safety-critical +applications, such as self-driving cars and disease diagnosis, they must be +stable to perturbations in input and model parameters. Characterizing the +sensitivity of a DNN to perturbations is necessary to determine minimal +bit-width precision that may be used to safely represent the network. However, +no general result exists that is capable of predicting the sensitivity of a +given DNN to round-off error, noise, or other perturbations in input. This +paper derives an estimator that can predict such quantities. The estimator is +derived via inequalities and matrix norms, and the resulting quantity is +roughly analogous to a condition number for the entire neural network. An +approximation of the estimator is tested on two Convolutional Neural Networks, +AlexNet and VGG-19, using the ImageNet dataset. For each of these networks, the +tightness of the estimator is explored via random perturbations and adversarial +attacks. + +
+
+ comment: Actual work and paper concluded in January 2019 +
+
+
+
+
+ + ☆ Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked + Image Modeling + + +
+ In dynamic Magnetic Resonance Imaging (MRI), k-space is typically +undersampled due to limited scan time, resulting in aliasing artifacts in the +image domain. Hence, dynamic MR reconstruction requires not only modeling +spatial frequency components in the x and y directions of k-space but also +considering temporal redundancy. Most previous works rely on image-domain +regularizers (priors) to conduct MR reconstruction. In contrast, we focus on +interpolating the undersampled k-space before obtaining images with Fourier +transform. In this work, we connect masked image modeling with k-space +interpolation and propose a novel Transformer-based k-space Global +Interpolation Network, termed k-GIN. Our k-GIN learns global dependencies among +low- and high-frequency components of 2D+t k-space and uses it to interpolate +unsampled data. Further, we propose a novel k-space Iterative Refinement Module +(k-IRM) to enhance the high-frequency components learning. We evaluate our +approach on 92 in-house 2D+t cardiac MR subjects and compare it to MR +reconstruction methods with image-domain regularizers. Experiments show that +our proposed k-space interpolation method quantitatively and qualitatively +outperforms baseline methods. Importantly, the proposed approach achieves +substantially higher robustness and generalizability in cases of +highly-undersampled MR data. + +
+
+
+
+
+ + ☆ TransFusion: Generating Long, High Fidelity Time Series using Diffusion + Models with Transformers + + +
+ The generation of high-quality, long-sequenced time-series data is essential +due to its wide range of applications. In the past, standalone Recurrent and +Convolutional Neural Network-based Generative Adversarial Networks (GAN) were +used to synthesize time-series data. However, they are inadequate for +generating long sequences of time-series data due to limitations in the +architecture. Furthermore, GANs are well known for their training instability +and mode collapse problem. To address this, we propose TransFusion, a +diffusion- and transformer-based generative model to generate high-quality +long-sequence time-series data. We have stretched the sequence length to 384, +and generated high-quality synthetic data. To the best of our knowledge, this +is the first study that has been done with such a long sequence length. Also, we +introduce two evaluation metrics to evaluate the quality of the synthetic data +as well as its predictive characteristics. We evaluate TransFusion with a wide +variety of visual and empirical metrics, and TransFusion outperforms the +previous state-of-the-art by a significant margin. + +
+
+
+
+
+ + ☆ Online Continual Learning in Keyword Spotting for Low-Resource Devices + via Pooling High-Order Temporal Statistics INTERSPEECH 2023 + + +
+ Keyword Spotting (KWS) models on embedded devices should adapt fast to new +user-defined words without forgetting previous ones. Embedded devices have +limited storage and computational resources, thus, they cannot save samples or +update large models. We consider the setup of embedded online continual +learning (EOCL), where KWS models with frozen backbone are trained to +incrementally recognize new words from a non-repeated stream of samples, seen +one at a time. To this end, we propose Temporal Aware Pooling (TAP) which +constructs an enriched feature space computing high-order moments of speech +features extracted by a pre-trained backbone. Our method, TAP-SLDA, updates a +Gaussian model for each class on the enriched feature space to effectively use +audio representations. In experimental analyses, TAP-SLDA outperforms +competitors on several setups, backbones, and baselines, bringing a relative +average gain of 11.3% on the GSC dataset. + +
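A rough Python sketch of Temporal Aware Pooling as described: high-order moments of frozen-backbone speech features pooled over time into an enriched embedding. The specific moment set and feature shapes below are assumptions; TAP-SLDA additionally fits a per-class Gaussian (SLDA) on this enriched space.

```python
# Illustrative pooling of high-order temporal statistics (TAP-style), assumptions noted above.
import numpy as np

def temporal_aware_pooling(feats):
    """feats: (T, D) frame-level features from a frozen backbone -> (4*D,) embedding."""
    mean = feats.mean(axis=0)
    std = feats.std(axis=0) + 1e-8
    z = (feats - mean) / std
    skew = (z ** 3).mean(axis=0)          # third standardized moment
    kurt = (z ** 4).mean(axis=0) - 3.0    # excess kurtosis
    return np.concatenate([mean, std, skew, kurt])

feats = np.random.default_rng(0).normal(size=(98, 64))   # e.g. 98 frames of 64-dim features
print(temporal_aware_pooling(feats).shape)                # -> (256,)
```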
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation + of rPPG + + +
+ Remote Photoplethysmography (rPPG) is a technology that utilizes the light +absorption properties of hemoglobin, captured via camera, to analyze and +measure blood volume pulse (BVP). By analyzing the measured BVP, various +physiological signals such as heart rate, stress levels, and blood pressure can +be derived, enabling applications such as the early prediction of +cardiovascular diseases. rPPG is a rapidly evolving field as it allows the +measurement of vital signals using camera-equipped devices without the need for +additional devices such as blood pressure monitors or pulse oximeters, and +without the assistance of medical experts. Despite extensive efforts and +advances in this field, serious challenges remain, including issues related to +skin color, camera characteristics, ambient lighting, and other sources of +noise, which degrade performance accuracy. We argue that fair and evaluable +benchmarking is urgently required to overcome these challenges and make any +meaningful progress from both academic and commercial perspectives. In most +existing work, models are trained, tested, and validated only on limited +datasets. Worse still, some studies lack available code or reproducibility, +making it difficult to fairly evaluate and compare performance. Therefore, the +purpose of this study is to provide a benchmarking framework to evaluate +various rPPG techniques across a wide range of datasets for fair evaluation and +comparison, including both conventional non-deep neural network (non-DNN) and +deep neural network (DNN) methods. GitHub URL: +https://github.com/remotebiosensing/rppg. + +
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ Fake News Detection Through Graph-based Neural Networks: A Survey + + +
+ The popularity of online social networks has enabled rapid dissemination of +information. People now can share and consume information much more rapidly +than ever before. However, low-quality and/or accidentally/deliberately fake +information can also spread rapidly. This can lead to considerable and negative +impacts on society. Identifying, labelling and debunking online misinformation +as early as possible has become an increasingly urgent problem. Many methods +have been proposed to detect fake news including many deep learning and +graph-based approaches. In recent years, graph-based methods have yielded +strong results, as they can closely model the social context and propagation +process of online news. In this paper, we present a systematic review of fake +news detection studies based on graph-based and deep learning-based techniques. +We classify existing graph-based methods into knowledge-driven methods, +propagation-based methods, and heterogeneous social context-based methods, +depending on how a graph structure is constructed to model news related +information flows. We further discuss the challenges and open problems in +graph-based fake news detection and identify future research directions. + +
+
+ comment: 18 pages, 3 tables, 7 figures +
+
+
+
+
+ + ☆ Identifying drivers and mitigators for congestion and redispatch in the + German electric power system with explainable AI + + +
+ The transition to a sustainable energy supply challenges the operation of +electric power systems in manifold ways. Transmission grid loads increase as +wind and solar power are often installed far away from the consumers. In +extreme cases, system operators must intervene via countertrading or redispatch +to ensure grid stability. In this article, we provide a data-driven analysis of +congestion in the German transmission grid. We develop an explainable machine +learning model to predict the volume of redispatch and countertrade on an +hourly basis. The model reveals factors that drive or mitigate grid congestion +and quantifies their impact. We show that, as expected, wind power generation +is the main driver, but hydropower and cross-border electricity trading also +play an essential role. Solar power, on the other hand, has no mitigating +effect. Our results suggest that a change to the market design would alleviate +congestion. + +
+
+
+
+
+ + ☆ De-confounding Representation Learning for Counterfactual Inference on + Continuous Treatment via Generative Adversarial Network + + +
+ Counterfactual inference for continuous rather than binary treatment +variables is more common in real-world causal inference tasks. While there are +already some sample reweighting methods based on Marginal Structural Model for +eliminating the confounding bias, they generally focus on removing the +treatment's linear dependence on confounders and rely on the accuracy of the +assumed parametric models, which are usually unverifiable. In this paper, we +propose a de-confounding representation learning (DRL) framework for +counterfactual outcome estimation of continuous treatment by generating the +representations of covariates disentangled with the treatment variables. The +DRL is a non-parametric model that eliminates both linear and nonlinear +dependence between treatment and covariates. Specifically, we train the +correlations between the de-confounded representations and the treatment +variables against the correlations between the covariate representations and +the treatment variables to eliminate confounding bias. Further, a +counterfactual inference network is embedded into the framework to make the +learned representations serve both de-confounding and trusted inference. +Extensive experiments on synthetic datasets show that the DRL model performs +superiorly in learning de-confounding representations and outperforms +state-of-the-art counterfactual inference models for continuous treatment +variables. In addition, we apply the DRL model to a real-world medical dataset +MIMIC and demonstrate a detailed causal relationship between red cell width +distribution and mortality. + +
+
+ comment: 15 pages,4 figures +
+
+
+
+
+ + ☆ Predicting Ordinary Differential Equations with Transformers ICML 2023 + + +
+ We develop a transformer-based sequence-to-sequence model that recovers +scalar ordinary differential equations (ODEs) in symbolic form from irregularly +sampled and noisy observations of a single solution trajectory. We demonstrate +in extensive empirical evaluations that our model performs better or on par +with existing methods in terms of accurate recovery across various settings. +Moreover, our method is efficiently scalable: after one-time pretraining on a +large set of ODEs, we can infer the governing law of a new observed solution in +a few forward passes of the model. + +
+
+ comment: Published at ICML 2023 +
+
+
+
+
+ + ☆ ExWarp: Extrapolation and Warping-based Temporal Supersampling for + High-frequency Displays + + +
+ High-frequency displays are gaining immense popularity because of their +increasing use in video games and virtual reality applications. However, the +issue is that the underlying GPUs cannot continuously generate frames at this +high rate -- this results in a less smooth and responsive experience. +Furthermore, if the frame rate is not synchronized with the refresh rate, the +user may experience screen tearing and stuttering. Previous works propose +increasing the frame rate to provide a smooth experience on modern displays by +predicting new frames based on past or future frames. Interpolation and +extrapolation are two widely used algorithms that predict new frames. +Interpolation requires waiting for the future frame to make a prediction, which +adds additional latency. On the other hand, extrapolation provides a better +quality of experience because it relies solely on past frames -- it does not +incur any additional latency. The simplest method to extrapolate a frame is to +warp the previous frame using motion vectors; however, the warped frame may +contain improperly rendered visual artifacts due to dynamic objects -- this +makes it very challenging to design such a scheme. Past work has used DNNs to +get good accuracy, however, these approaches are slow. This paper proposes +Exwarp -- an approach based on reinforcement learning (RL) to intelligently +choose between the slower DNN-based extrapolation and faster warping-based +methods to increase the frame rate by 4x with an almost negligible reduction in +the perceived image quality. + +
+
+
+
+
+ + ☆ Concept backpropagation: An Explainable AI approach for visualising + learned concepts in neural network models + + +
+ Neural network models are widely used in a variety of domains, often as +black-box solutions, since they are not directly interpretable for humans. The +field of explainable artificial intelligence aims at developing explanation +methods to address this challenge, and several approaches have been developed +over recent years, including methods for investigating what type of +knowledge these models internalise during the training process. Among these, +the method of concept detection investigates which \emph{concepts} neural +network models learn to represent in order to complete their tasks. In this +work, we present an extension to the method of concept detection, named +\emph{concept backpropagation}, which provides a way of analysing how the +information representing a given concept is internalised in a given neural +network model. In this approach, the model input is perturbed in a manner +guided by a trained concept probe for the described model, such that the +concept of interest is maximised. This allows for the visualisation of the +detected concept directly in the input space of the model, which in turn makes +it possible to see what information the model depends on for representing the +described concept. We present results for this method applied to various +input modalities, and discuss how our proposed method can be used to +visualise what information trained concept probes use, and the degree to +which the representation of the probed concept is entangled within the neural +network model itself. + +
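A hedged Python sketch of the mechanism described: gradient ascent on the model input so that a trained linear concept probe on an intermediate layer is maximised, making the detected concept visible in input space. The model, probe, and hyperparameters below are stand-ins, not the authors' setup.

```python
# Concept-probe-guided input perturbation (illustrative stand-in model and probe).
import torch
import torch.nn as nn

torch.manual_seed(0)
features = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 128), nn.ReLU())  # stand-in feature extractor
probe = nn.Linear(128, 1)            # stand-in for a concept probe trained beforehand

x = torch.rand(1, 1, 28, 28, requires_grad=True)   # input to be perturbed
optimizer = torch.optim.SGD([x], lr=0.5)

for _ in range(100):
    optimizer.zero_grad()
    concept_score = probe(features(x)).squeeze()
    (-concept_score).backward()      # ascend on the probe's concept score
    optimizer.step()
    with torch.no_grad():
        x.clamp_(0.0, 1.0)           # keep the perturbed input in a valid range

print("final concept score:", float(probe(features(x))))
```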
+
+
+
+
+ + ☆ Optimized data collection and analysis process for studying + solar-thermal desalination by machine learning + + +
+ An effective interdisciplinary study between machine learning and +solar-thermal desalination requires a sufficiently large and well-analyzed +experimental dataset. This study develops a modified dataset collection and +analysis process for studying solar-thermal desalination by machine learning. +Based on the optimized water condensation and collection process, the proposed +experimental method collects over one thousand datasets, which is ten times +more than the average number of datasets in previous works, by accelerating +data collection and reducing the time by 83.3%. On the other hand, the effects +of dataset features are investigated by using three different algorithms, +including artificial neural networks, multiple linear regressions, and random +forests. The investigation focuses on the effects of dataset size and range on +prediction accuracy, factor importance ranking, and the model's generalization +ability. The results demonstrate that a larger dataset can significantly +improve prediction accuracy when using artificial neural networks and random +forests. Additionally, the study highlights the significant impact of dataset +size and range on ranking the importance of influence factors. Furthermore, the +study reveals that the extrapolation data range significantly affects the +extrapolation accuracy of artificial neural networks. Based on the results, +massive dataset collection and analysis of dataset feature effects are +important steps in an effective and consistent machine learning process flow +for solar-thermal desalination, which can promote machine learning as a more +general tool in the field of solar-thermal desalination. + +
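A small Python sketch of the kind of comparison the abstract describes: fit an artificial neural network, multiple linear regression, and a random forest on datasets of increasing size and track prediction accuracy. The synthetic data below merely stands in for the desalination measurements, which are not available here.

```python
# Model comparison vs. dataset size on synthetic stand-in data (illustrative only).
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)

def make_data(n):
    X = rng.uniform(size=(n, 4))      # hypothetical inputs, e.g. irradiance, temperature, ...
    y = 2 * X[:, 0] + np.sin(3 * X[:, 1]) + X[:, 2] * X[:, 3] + 0.05 * rng.normal(size=n)
    return X, y

for n in (100, 1000):
    X, y = make_data(n)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    models = {
        "ANN": MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=2000, random_state=0),
        "MLR": LinearRegression(),
        "RF": RandomForestRegressor(n_estimators=200, random_state=0),
    }
    scores = {name: r2_score(y_te, m.fit(X_tr, y_tr).predict(X_te)) for name, m in models.items()}
    print(n, {k: round(v, 3) for k, v in scores.items()})
```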
+
+
+
+
+ + ☆ InVAErt networks: a data-driven framework for emulation, inference and + identifiability analysis + + +
+ Use of generative models and deep learning for physics-based systems is +currently dominated by the task of emulation. However, the remarkable +flexibility offered by data-driven architectures would suggest to extend this +representation to other aspects of system synthesis including model inversion +and identifiability. We introduce inVAErt (pronounced \emph{invert}) networks, +a comprehensive framework for data-driven analysis and synthesis of parametric +physical systems which uses a deterministic encoder and decoder to represent +the forward and inverse solution maps, normalizing flow to capture the +probabilistic distribution of system outputs, and a variational encoder +designed to learn a compact latent representation for the lack of bijectivity +between inputs and outputs. We formally investigate the selection of penalty +coefficients in the loss function and strategies for latent space sampling, +since we find that these significantly affect both training and testing +performance. We validate our framework through extensive numerical examples, +including simple linear, nonlinear, and periodic maps, dynamical systems, and +spatio-temporal PDEs. + +
+
+
+
+
+ + ☆ Self-refining of Pseudo Labels for Music Source Separation with Noisy + Labeled Data + + +
+ Music source separation (MSS) faces challenges due to the limited +availability of correctly-labeled individual instrument tracks. With the push +to acquire larger datasets to improve MSS performance, the inevitability of +encountering mislabeled individual instrument tracks becomes a significant +challenge to address. This paper introduces an automated technique for refining +the labels in a partially mislabeled dataset. Our proposed self-refining +technique, employed with a noisy-labeled dataset, results in only a 1% accuracy +degradation in multi-label instrument recognition compared to a classifier +trained on a clean-labeled dataset. The study demonstrates the importance of +refining noisy-labeled data in MSS model training and shows that utilizing the +refined dataset leads to results comparable to those derived from a clean-labeled +dataset. Notably, when only a noisy dataset is available, MSS models trained on a +self-refined dataset even outperform those trained on a dataset refined with a +classifier trained on clean labels. + +
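A toy Python sketch of one way such self-refining of labels could look: cross-validated predictions from a classifier flip labels on which the model confidently disagrees, over a few rounds. The problem is reduced here to single-label classification, and the thresholds and model are assumptions, not the paper's recipe.

```python
# Iterative label refinement on a noisy-labeled toy dataset (illustrative assumptions).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
X, y_clean = make_classification(n_samples=2000, n_features=20, random_state=0)

y_noisy = y_clean.copy()
flip = rng.random(len(y_noisy)) < 0.15            # simulate 15% mislabeled tracks
y_noisy[flip] = 1 - y_noisy[flip]

y = y_noisy.copy()
for round_ in range(3):                            # a few refinement rounds
    proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                              cv=5, method="predict_proba")[:, 1]
    confident_disagree = ((proba > 0.9) & (y == 0)) | ((proba < 0.1) & (y == 1))
    y[confident_disagree] = (proba[confident_disagree] > 0.5).astype(int)
    print(f"round {round_}: agreement with clean labels = {(y == y_clean).mean():.3f}")
```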
+
+ comment: 24th International Society for Music Information Retrieval Conference + (ISMIR 2023) +
+
+
+
+
+ + ☆ Towards Generalising Neural Topical Representations + + +
+ Topic models have evolved from conventional Bayesian probabilistic models to +Neural Topic Models (NTMs) over the last two decades. Although NTMs have +achieved promising performance when trained and tested on a specific corpus, +their generalisation ability across corpora is rarely studied. In practice, we +often expect that an NTM trained on a source corpus can still produce quality +topical representations for documents in a different target corpus without +retraining. In this work, we aim to improve NTMs further so that their benefits +generalise reliably across corpora and tasks. To do so, we propose to model +similar documents by minimising their semantic distance when training NTMs. +Specifically, similar documents are created by data augmentation during +training; the semantic distance between documents is measured by the +Hierarchical Topic Transport Distance (HOTT), which computes the Optimal +Transport (OT) distance between the topical representations. Our framework can +be readily applied to most NTMs as a plug-and-play module. Extensive +experiments show that our framework significantly improves the generalisation +of neural topical representations across corpora. + +
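The framework measures the distance between topical representations with an optimal-transport distance (via HOTT). The sketch below computes a generic entropic OT (Sinkhorn) distance between two documents' topic distributions under a topic-embedding cost matrix; it is a building block for illustration, not the exact HOTT computation.

```python
# Entropic OT distance between two topic distributions (generic illustration).
import numpy as np

def sinkhorn_ot(p, q, cost, reg=0.1, n_iter=500):
    """Entropic OT distance between histograms p, q under a cost matrix."""
    K = np.exp(-cost / reg)
    u = np.ones_like(p)
    for _ in range(n_iter):
        v = q / (K.T @ u)
        u = p / (K @ v)
    plan = u[:, None] * K * v[None, :]
    return float((plan * cost).sum())

rng = np.random.default_rng(0)
n_topics = 10
topic_emb = rng.normal(size=(n_topics, 32))                       # hypothetical topic embeddings
cost = np.linalg.norm(topic_emb[:, None] - topic_emb[None, :], axis=-1)
cost /= cost.max()                                                # keep exp(-cost/reg) well scaled

doc_a = rng.dirichlet(np.ones(n_topics))                          # topical representation of doc A
doc_b = rng.dirichlet(np.ones(n_topics))                          # topical representation of doc B
print("OT distance between topical representations:", sinkhorn_ot(doc_a, doc_b, cost))
```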
+
+
+
+
+ + ☆ Homophily-Driven Sanitation View for Robust Graph Contrastive Learning + + +
+ We investigate adversarial robustness of unsupervised Graph Contrastive +Learning (GCL) against structural attacks. First, we provide a comprehensive +empirical and theoretical analysis of existing attacks, revealing how and why +they downgrade the performance of GCL. Inspired by our analytic results, we +present a robust GCL framework that integrates a homophily-driven sanitation +view, which can be learned jointly with contrastive learning. A key challenge +this poses, however, is the non-differentiable nature of the sanitation +objective. To address this challenge, we propose a series of techniques to +enable gradient-based end-to-end robust GCL. Moreover, we develop a fully +unsupervised hyperparameter tuning method which, unlike prior approaches, does +not require knowledge of node labels. We conduct extensive experiments to +evaluate the performance of our proposed model, GCHS (Graph Contrastive +Learning with Homophily-driven Sanitation View), against two state of the art +structural attacks on GCL. Our results demonstrate that GCHS consistently +outperforms all state of the art baselines in terms of the quality of generated +node embeddings as well as performance on two important downstream tasks. + +
+
+
+
+
+ + ☆ Continuation Path Learning for Homotopy Optimization ICML 2023 + + +
+ Homotopy optimization is a traditional method to deal with a complicated
+optimization problem by solving a sequence of easy-to-hard surrogate
+subproblems. However, this method can be very sensitive to the continuation
+schedule design and might lead to a suboptimal solution to the original
+problem. In addition, the intermediate solutions, often ignored by classic
+homotopy optimization, could be useful for many real-world applications. In
+this work, we propose a novel model-based approach to learn the whole
+continuation path for homotopy optimization, which contains infinitely many
+intermediate solutions for the surrogate subproblems. Rather than the classic
+unidirectional easy-to-hard optimization, our method can simultaneously
+optimize the original problem and all surrogate subproblems in a collaborative
+manner. The proposed model also supports real-time generation of any
+intermediate solution, which could be desirable for many applications.
+Experimental studies on different problems show that our proposed method can
+significantly improve the performance of homotopy optimization and provide
+extra helpful information to support better decision-making.
+
+
+
+ comment: Accepted by the 40th International Conference on Machine Learning + (ICML 2023) +
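+
+ As a rough illustration of the idea, a single model can be trained to map a
+ continuation parameter t in [0, 1] to a candidate solution of the
+ corresponding subproblem; the toy homotopy objective and the architecture
+ below are illustrative assumptions, not the paper's formulation:
+
+```python
+import torch
+import torch.nn as nn
+
+def homotopy_objective(x, t):
+    """Toy homotopy f_t(x): easy convex surrogate blended into a harder target."""
+    smooth = (x - 1.0) ** 2
+    hard = (x - 1.0) ** 2 + 0.5 * torch.sin(8.0 * x)
+    return (1.0 - t) * smooth + t * hard
+
+# Model of the whole continuation path: t in [0, 1] -> candidate solution x(t).
+path_model = nn.Sequential(nn.Linear(1, 32), nn.Tanh(), nn.Linear(32, 1))
+opt = torch.optim.Adam(path_model.parameters(), lr=1e-2)
+
+for step in range(500):
+    t = torch.rand(64, 1)                        # sample continuation levels
+    loss = homotopy_objective(path_model(t), t).mean()   # all subproblems jointly
+    opt.zero_grad(); loss.backward(); opt.step()
+
+with torch.no_grad():
+    print("solution at t=1:", path_model(torch.tensor([[1.0]])).item())
+```
+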
+
+
+
+
+ + ☆ On the Connection between Pre-training Data Diversity and Fine-tuning + Robustness + + +
+ Pre-training has been widely adopted in deep learning to improve model +performance, especially when the training data for a target task is limited. In +our work, we seek to understand the implications of this training strategy on +the generalization properties of downstream models. More specifically, we ask +the following question: how do properties of the pre-training distribution +affect the robustness of a fine-tuned model? The properties we explore include +the label space, label semantics, image diversity, data domains, and data +quantity of the pre-training distribution. We find that the primary factor +influencing downstream effective robustness (Taori et al., 2020) is data +quantity, while other factors have limited significance. For example, reducing +the number of ImageNet pre-training classes by 4x while increasing the number +of images per class by 4x (that is, keeping total data quantity fixed) does not +impact the robustness of fine-tuned models. We demonstrate our findings on +pre-training distributions drawn from various natural and synthetic data +sources, primarily using the iWildCam-WILDS distribution shift as a test for +downstream robustness. + +
+
+
+
+
+ + ☆ Rethinking Medical Report Generation: Disease Revealing Enhancement with + Knowledge Graph + + +
+ Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)
+because it reveals the relations among diseases and thus can be utilized to
+guide the generation process. However, constructing a comprehensive KG is
+labor-intensive and its applications on the MRG process are under-explored. In
+this study, we establish a complete KG on chest X-ray imaging that includes 137
+types of diseases and abnormalities. Based on this KG, we find that the current
+MRG data sets exhibit a long-tailed problem in disease distribution. To
+mitigate this problem, we introduce a novel augmentation strategy that enhances
+the representation of disease types in the tail-end of the distribution. We
+further design a two-stage MRG approach, where a classifier is first trained to
+detect whether the input images exhibit any abnormalities. The classified
+images are then independently fed into two transformer-based generators,
+namely, ``disease-specific generator'' and ``disease-free generator'' to
+generate the corresponding reports. To enhance the clinical evaluation of
+whether the generated reports correctly describe the diseases appearing in the
+input image, we propose diverse sensitivity (DS), a new metric that checks
+whether generated diseases match ground truth and measures the diversity of all
+generated diseases. Results show that the proposed two-stage generation
+framework and augmentation strategies improve DS by a considerable margin,
+indicating a notable reduction in the long-tailed problem associated with
+under-represented diseases.
+
+
+
+
+
+
+ + ☆ Landslide Surface Displacement Prediction Based on VSXC-LSTM Algorithm + + +
+ Landslides are natural disasters that can easily threaten local ecology,
+people's lives, and property. In this paper, we conduct modelling research on
+real unidirectional surface displacement data of recent landslides in the
+research area and propose a time series prediction framework named
+VMD-SegSigmoid-XGBoost-ClusterLSTM (VSXC-LSTM) based on variational mode
+decomposition, which can predict the landslide surface displacement more
+accurately. The model performs well on the test set. Except for the random item
+subsequence that is hard to fit, the root mean square error (RMSE) and the mean
+absolute percentage error (MAPE) of the trend item subsequence and the periodic
+item subsequence are both less than 0.1, and the RMSE is as low as 0.006 for
+the periodic item prediction module based on XGBoost\footnote{Accepted in
+ICANN2023}.
+
+
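+
+ For reference, the two reported error metrics can be computed as follows (a
+ generic sketch on placeholder arrays, not the paper's displacement data):
+
+```python
+import numpy as np
+
+def rmse(y_true, y_pred):
+    """Root mean square error."""
+    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
+
+def mape(y_true, y_pred):
+    """Mean absolute percentage error, reported here as a fraction."""
+    return float(np.mean(np.abs((y_true - y_pred) / y_true)))
+
+y_true = np.array([1.02, 1.05, 1.11, 1.08])   # placeholder displacement series
+y_pred = np.array([1.01, 1.06, 1.10, 1.09])
+print(rmse(y_true, y_pred), mape(y_true, y_pred))
+```
+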
+
+
+
+
+ + ☆ Lost In Translation: Generating Adversarial Examples Robust to + Round-Trip Translation ICASSP + + +
+ Language models today provide high accuracy across a large number of
+downstream tasks. However, they remain susceptible to adversarial attacks,
+particularly against those where the adversarial examples maintain considerable
+similarity to the original text. Given the multilingual nature of text, the
+effectiveness of adversarial examples across translations and how machine
+translations can improve the robustness of adversarial examples remain largely
+unexplored. In this paper, we present a comprehensive study on the robustness
+of current text adversarial attacks to round-trip translation. We demonstrate
+that 6 state-of-the-art text-based adversarial attacks do not maintain their
+efficacy after round-trip translation. Furthermore, we introduce an
+intervention-based solution to this problem, by integrating Machine Translation
+into the process of adversarial example generation and demonstrating increased
+robustness to round-trip translation. Our results indicate that finding
+adversarial examples robust to translation can help identify insufficiencies
+of language models that are common across languages, and motivate further
+research into multilingual adversarial attacks.
+
+
+
+ comment: Published at International Conference on Acoustics, Speech, and + Signal Processing (ICASSP) 2023 +
+
+
+
+
+ + ☆ DEPHN: Different Expression Parallel Heterogeneous Network using virtual + gradient optimization for Multi-task Learning + + +
+ Recommendation system algorithms based on multi-task learning (MTL) are the
+major method for Internet operators to understand users and predict their
+behaviors in multi-behavior platform scenarios. Task correlation is an
+important consideration for MTL goals; traditional models use shared-bottom
+architectures and gating experts to realize shared representation learning and
+information differentiation. However, the relationships between real-world
+tasks are often more complex than existing methods can properly capture when
+sharing information. In this paper, we propose a Different Expression Parallel
+Heterogeneous Network (DEPHN) to model multiple tasks simultaneously. DEPHN
+constructs the experts at the bottom of the model by using different feature
+interaction methods to improve the generalization ability of the shared
+information flow. In view of the model's differentiating ability for different
+task information flows, DEPHN uses explicit feature mapping and virtual
+gradient coefficients for expert gating during the training process, and
+adaptively adjusts the learning intensity of the gated unit by considering the
+difference of gating values and task correlation. Extensive experiments on
+artificial and real-world datasets demonstrate that our proposed method can
+capture task correlation in complex situations and achieve better performance
+than baseline models\footnote{Accepted in IJCNN2023}.
+
+
+
+
+
+
+ + ☆ FaFCNN: A General Disease Classification Framework Based on Feature + Fusion Neural Networks + + +
+ There are two fundamental problems in applying deep learning/machine learning
+methods to disease classification tasks: one is the insufficient number and
+poor quality of training samples; the other is how to effectively fuse features
+from multiple sources and thus train robust classification models. To address
+these problems, inspired by the process of human learning knowledge, we propose
+the Feature-aware Fusion Correlation Neural Network (FaFCNN), which introduces
+a feature-aware interaction module and a feature alignment module based on
+domain adversarial learning. This is a general framework for disease
+classification, and FaFCNN improves the way existing methods obtain sample
+correlation features. The experimental results show that training using
+augmented features obtained by pre-training a gradient boosting decision tree
+yields more performance gains than random-forest-based methods. On the
+low-quality dataset with a large amount of missing data in our setup, FaFCNN
+obtains consistently optimal performance compared to competitive baselines. In
+addition, extensive experiments demonstrate the robustness of the proposed
+method and the effectiveness of each component of the model\footnote{Accepted
+in IEEE SMC2023}.
+
+
+
+
+
+
+ + ☆ An Empirical Evaluation of Temporal Graph Benchmark + + +
+ In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark +(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with +TGB, we include eleven popular dynamic graph learning methods for more +exhaustive comparisons. Through the experiments, we find that (1) some issues +need to be addressed in the current version of TGB, including mismatched data +statistics, inaccurate evaluation metric computation, and so on; (2) different +models depict varying performance across various datasets, which is in line +with previous observations; (3) the performance of some baselines can be +significantly improved over the reported results in TGB when using DyGLib. This +work aims to ease the researchers' efforts in evaluating various dynamic graph +learning methods on TGB and attempts to offer results that can be directly +referenced in the follow-up research. All the used resources in this project +are publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is +in progress, and feedback from the community is welcomed for improvements. + +
+
+ comment: preprint, in progress +
+
+
+
+
+ + ☆ AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion + Models + + +
+ Unrestricted adversarial attacks present a serious threat to deep learning
+models and adversarial defense techniques. They pose severe security problems
+for deep learning applications because they can effectively bypass defense
+mechanisms. However, previous attack methods often utilize Generative
+Adversarial Networks (GANs), which are not theoretically provable and thus
+generate unrealistic examples by incorporating adversarial objectives,
+especially for large-scale datasets like ImageNet. In this paper, we propose a
+new method, called AdvDiff, to generate unrestricted adversarial examples with
+diffusion models. We design two novel adversarial guidance techniques to
+conduct adversarial sampling in the reverse generation process of diffusion
+models. These two techniques are effective and stable in generating
+high-quality, realistic adversarial examples by integrating gradients of the
+target classifier interpretably. Experimental results on MNIST and ImageNet
+datasets demonstrate that AdvDiff is effective in generating unrestricted
+adversarial examples, which outperforms GAN-based methods in terms of attack
+performance and generation quality.
+
+
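+
+ A schematic sketch of how classifier-gradient guidance can be injected when
+ estimating the clean sample inside a reverse diffusion step; the denoiser,
+ schedule value, guidance scale, and dummy components are hypothetical
+ placeholders, and the paper's two guidance techniques differ in their details:
+
+```python
+import torch
+import torch.nn as nn
+
+def guided_estimate(x_t, t, denoiser, classifier, target, alpha_bar_t, scale=1.0):
+    """Estimate x0 from x_t and nudge it toward an attacker-chosen class."""
+    x = x_t.detach().requires_grad_(True)
+    log_p = torch.log_softmax(classifier(x), dim=-1)[:, target].sum()
+    grad = torch.autograd.grad(log_p, x)[0]          # adversarial guidance signal
+    eps = denoiser(x_t, t)                           # predicted noise
+    x0_hat = (x_t - (1 - alpha_bar_t) ** 0.5 * eps) / alpha_bar_t ** 0.5
+    return x0_hat + scale * grad
+
+# Dummy components just to make the sketch executable.
+denoiser = lambda x, t: torch.zeros_like(x)
+classifier = nn.Linear(8, 10)
+x_t = torch.randn(4, 8)
+out = guided_estimate(x_t, t=0.5, denoiser=denoiser, classifier=classifier,
+                      target=3, alpha_bar_t=torch.tensor(0.9))
+print(out.shape)
+```
+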
+
+
+
+
+ + ☆ A faster and simpler algorithm for learning shallow networks + + +
+ We revisit the well-studied problem of learning a linear combination of $k$ +ReLU activations given labeled examples drawn from the standard $d$-dimensional +Gaussian measure. Chen et al. [CDG+23] recently gave the first algorithm for +this problem to run in $\text{poly}(d,1/\varepsilon)$ time when $k = O(1)$, +where $\varepsilon$ is the target error. More precisely, their algorithm runs +in time $(d/\varepsilon)^{\mathrm{quasipoly}(k)}$ and learns over multiple +stages. Here we show that a much simpler one-stage version of their algorithm +suffices, and moreover its runtime is only $(d/\varepsilon)^{O(k^2)}$. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Learning Universal and Robust 3D Molecular Representations with Graph + Convolutional Networks + + +
+ To learn accurate representations of molecules, it is essential to consider
+both chemical and geometric features. To encode geometric information, many
+descriptors have been proposed in constrained circumstances for specific types
+of molecules and do not have the properties to be ``robust'': 1. Invariant to
+rotations and translations; 2. Injective when embedding molecular structures.
+In this work, we propose a universal and robust Directional Node Pair (DNP)
+descriptor based on the graph representations of 3D molecules. Our DNP
+descriptor is robust compared to previous ones and can be applied to multiple
+molecular types. To combine the DNP descriptor and chemical features in
+molecules, we construct the Robust Molecular Graph Convolutional Network
+(RoM-GCN) which is capable of taking both node and edge features into
+consideration when generating molecule representations. We evaluate our model
+on protein and small molecule datasets. Our results validate the superiority of
+the DNP descriptor in incorporating 3D geometric information of molecules.
+RoM-GCN outperforms all compared baselines.
+
+
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Learning Resource Allocation Policy: Vertex-GNN or Edge-GNN? + + +
+ Graph neural networks (GNNs) update the hidden representations of vertices
+(called Vertex-GNNs) or hidden representations of edges (called Edge-GNNs) by
+processing and pooling the information of neighboring vertices and edges and
+combining to incorporate graph topology. When learning resource allocation
+policies, GNNs cannot perform well if their expressive power is weak, i.e., if
+they cannot differentiate all input features such as channel matrices. In this
+paper, we analyze the expressive power of the Vertex-GNNs and Edge-GNNs for
+learning three representative wireless policies: link scheduling, power
+control, and precoding policies. We find that the expressive power of the GNNs
+depends on the linearity and output dimensions of the processing and
+combination functions. When linear processors are used, the Vertex-GNNs cannot
+differentiate all channel matrices due to the loss of channel information,
+while the Edge-GNNs can. When learning the precoding policy, even the
+Vertex-GNNs with non-linear processors may not have strong expressive ability
+due to the dimension compression. We proceed to provide necessary conditions
+for the GNNs to well learn the precoding policy. Simulation results validate
+the analyses and show that the Edge-GNNs can achieve the same performance as
+the Vertex-GNNs with much lower training and inference time.
+
+
+
+
+
+
+ + ☆ Model-free generalized fiducial inference + + +
+ Motivated by the need for the development of safe and reliable methods for +uncertainty quantification in machine learning, I propose and develop ideas for +a model-free statistical framework for imprecise probabilistic prediction +inference. This framework facilitates uncertainty quantification in the form of +prediction sets that offer finite sample control of type 1 errors, a property +shared with conformal prediction sets, but this new approach also offers more +versatile tools for imprecise probabilistic reasoning. Furthermore, I propose +and consider the theoretical and empirical properties of a precise +probabilistic approximation to the model-free imprecise framework. +Approximating a belief/plausibility measure pair by an [optimal in some sense] +probability measure in the credal set is a critical resolution needed for the +broader adoption of imprecise probabilistic approaches to inference in +statistical and machine learning communities. It is largely undetermined in the +statistical and machine learning literatures, more generally, how to properly +quantify uncertainty in that there is no generally accepted standard of +accountability of stated uncertainties. The research I present in this +manuscript is aimed at motivating a framework for statistical inference with +reliability and accountability as the guiding principles. + +
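+
+ For orientation, the conformal-prediction baseline whose finite-sample
+ type-1-error control the proposed framework shares can be sketched in a few
+ lines (split conformal intervals for regression; the calibration residuals
+ and miscoverage level below are placeholders, and this is not the paper's
+ imprecise-probabilistic construction):
+
+```python
+import numpy as np
+
+def split_conformal_interval(residuals_cal, y_pred_new, alpha=0.1):
+    """Prediction interval with finite-sample coverage at least 1 - alpha."""
+    n = len(residuals_cal)
+    k = int(np.ceil((n + 1) * (1 - alpha)))        # conformal quantile index
+    q = np.sort(residuals_cal)[min(k, n) - 1]      # k-th smallest residual
+    return y_pred_new - q, y_pred_new + q
+
+rng = np.random.default_rng(0)
+residuals_cal = np.abs(rng.normal(size=200))       # |y - yhat| on a calibration split
+lo, hi = split_conformal_interval(residuals_cal, y_pred_new=3.2)
+print(lo, hi)
+```
+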
+
+
+
+
+ + ☆ Rethinking Data Distillation: Do Not Overlook Calibration ICCV 2023 + + +
+ Neural networks trained on distilled data often produce over-confident output +and require correction by calibration methods. Existing calibration methods +such as temperature scaling and mixup work well for networks trained on +original large-scale data. However, we find that these methods fail to +calibrate networks trained on data distilled from large source datasets. In +this paper, we show that distilled data lead to networks that are not +calibratable due to (i) a more concentrated distribution of the maximum logits +and (ii) the loss of information that is semantically meaningful but unrelated +to classification tasks. To address this problem, we propose Masked Temperature +Scaling (MTS) and Masked Distillation Training (MDT) which mitigate the +limitations of distilled data and achieve better calibration results while +maintaining the efficiency of dataset distillation. + +
+
+ comment: ICCV 2023 +
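+
+ For context, plain temperature scaling (which the paper finds insufficient on
+ distilled data and extends with masking) can be fit on held-out logits as
+ follows; the logits and labels here are placeholders:
+
+```python
+import torch
+
+def fit_temperature(logits, labels, n_steps=200, lr=0.01):
+    """Standard temperature scaling: fit a single scalar T > 0 on a validation set."""
+    log_t = torch.zeros(1, requires_grad=True)          # T = exp(log_t)
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(n_steps):
+        loss = torch.nn.functional.cross_entropy(logits / log_t.exp(), labels)
+        opt.zero_grad(); loss.backward(); opt.step()
+    return log_t.exp().item()
+
+logits = torch.randn(128, 10) * 3.0       # placeholder over-confident logits
+labels = torch.randint(0, 10, (128,))
+T = fit_temperature(logits, labels)
+calibrated_probs = torch.softmax(logits / T, dim=-1)
+print("fitted temperature:", T)
+```
+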
+
+
+
+
+ + ☆ Rates of Approximation by ReLU Shallow Neural Networks + + +
+ Neural networks activated by the rectified linear unit (ReLU) play a central +role in the recent development of deep learning. The topic of approximating +functions from H\"older spaces by these networks is crucial for understanding +the efficiency of the induced learning algorithms. Although the topic has been +well investigated in the setting of deep neural networks with many layers of +hidden neurons, it is still open for shallow networks having only one hidden +layer. In this paper, we provide rates of uniform approximation by these +networks. We show that ReLU shallow neural networks with $m$ hidden neurons can +uniformly approximate functions from the H\"older space $W_\infty^r([-1, 1]^d)$ +with rates $O((\log m)^{\frac{1}{2} +d}m^{-\frac{r}{d}\frac{d+2}{d+4}})$ when +$r +
+
+
+
+
+ + ♻ ☆ Exphormer: Sparse Transformers for Graphs + + +
+ Graph transformers have emerged as a promising architecture for a variety of
+graph learning and representation tasks. Despite their successes, though, it
+remains challenging to scale graph transformers to large graphs while
+maintaining accuracy competitive with message-passing networks. In this paper,
+we introduce Exphormer, a framework for building powerful and scalable graph
+transformers. Exphormer consists of a sparse attention mechanism based on two
+mechanisms: virtual global nodes and expander graphs, whose mathematical
+characteristics, such as spectral expansion, pseudorandomness, and sparsity,
+yield graph transformers with complexity only linear in the size of the graph,
+while allowing us to prove desirable theoretical properties of the resulting
+transformer models. We show that incorporating Exphormer into the
+recently-proposed GraphGPS framework produces models with competitive empirical
+results on a wide variety of graph datasets, including state-of-the-art results
+on three datasets. We also show that Exphormer can scale to datasets on larger
+graphs than shown in previous graph transformer architectures. Code can be
+found at \url{https://github.com/hamed1375/Exphormer}.
+
+
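+
+ A rough sketch of how such a sparse attention pattern might be assembled: a
+ random sparse graph stands in for the expander and one virtual node attends
+ globally. The construction below is an illustrative assumption, not
+ Exphormer's actual expander sampling:
+
+```python
+import numpy as np
+
+def sparse_attention_mask(n_nodes, degree=4, seed=0):
+    """Boolean mask: random sparse neighbourhoods plus one virtual global node."""
+    rng = np.random.default_rng(seed)
+    n = n_nodes + 1                                   # last index is the global node
+    mask = np.eye(n, dtype=bool)                      # self-attention always allowed
+    for i in range(n_nodes):
+        neighbours = rng.choice(n_nodes, size=degree, replace=False)
+        mask[i, neighbours] = True                    # expander-like sparse edges
+        mask[neighbours, i] = True
+    mask[:, n_nodes] = True                           # everyone attends to the global node
+    mask[n_nodes, :] = True                           # the global node attends to everyone
+    return mask
+
+mask = sparse_attention_mask(8)
+print(int(mask.sum()), "allowed attention pairs out of", mask.size)
+```
+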
+
+
+
+
+ + ♻ ☆ Segmenting Known Objects and Unseen Unknowns without Prior Knowledge ICCV 2023 + + +
+ Panoptic segmentation methods assign a known class to each pixel in the
+input. Even for state-of-the-art approaches, this inevitably enforces decisions
+that systematically lead to wrong predictions for objects outside the training
+categories. However, robustness against out-of-distribution samples and corner
+cases is crucial in safety-critical settings to avoid dangerous consequences.
+Since real-world datasets cannot contain enough data points to adequately
+sample the long tail of the underlying distribution, models must be able to
+deal with unseen and unknown scenarios as well. Previous methods targeted this
+by re-identifying already-seen unlabeled objects. In this work, we propose the
+necessary step to extend segmentation with a new setting which we term holistic
+segmentation. Holistic segmentation aims to identify and separate objects of
+unseen unknown categories into instances, without any prior knowledge about
+them, while performing panoptic segmentation of known classes. We tackle this
+new problem with U3HS, which finds unknowns as highly uncertain regions and
+clusters their corresponding instance-aware embeddings into individual objects.
+By doing so, for the first time in panoptic segmentation with unknown objects,
+our U3HS is trained without unknown categories, reducing assumptions and
+leaving the settings as unconstrained as in real-life scenarios. Extensive
+experiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate
+the effectiveness of U3HS for this new, challenging, and assumptions-free
+setting called holistic segmentation.
+
+
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ How Do Transformers Learn Topic Structure: Towards a Mechanistic + Understanding + + +
+ While the successes of transformers across many domains are indisputable, +accurate understanding of the learning mechanics is still largely lacking. +Their capabilities have been probed on benchmarks which include a variety of +structured and reasoning tasks -- but mathematical understanding is lagging +substantially behind. Recent lines of work have begun studying representational +aspects of this question: that is, the size/depth/complexity of attention-based +networks to perform certain tasks. However, there is no guarantee the learning +dynamics will converge to the constructions proposed. In our paper, we provide +fine-grained mechanistic understanding of how transformers learn "semantic +structure", understood as capturing co-occurrence structure of words. +Precisely, we show, through a combination of mathematical analysis and +experiments on Wikipedia data and synthetic data modeled by Latent Dirichlet +Allocation (LDA), that the embedding layer and the self-attention layer encode +the topical structure. In the former case, this manifests as higher average +inner product of embeddings between same-topic words. In the latter, it +manifests as higher average pairwise attention between same-topic words. The +mathematical results involve several assumptions to make the analysis +tractable, which we verify on data, and might be of independent interest as +well. + +
+
+
+
+
+ + ♻ ☆ Revisiting the Robustness of the Minimum Error Entropy Criterion: A + Transfer Learning Case Study ECAI-23 + + +
+ Coping with distributional shifts is an important part of transfer learning +methods in order to perform well in real-life tasks. However, most of the +existing approaches in this area either focus on an ideal scenario in which the +data does not contain noises or employ a complicated training paradigm or model +design to deal with distributional shifts. In this paper, we revisit the +robustness of the minimum error entropy (MEE) criterion, a widely used +objective in statistical signal processing to deal with non-Gaussian noises, +and investigate its feasibility and usefulness in real-life transfer learning +regression tasks, where distributional shifts are common. Specifically, we put +forward a new theoretical result showing the robustness of MEE against +covariate shift. We also show that by simply replacing the mean squared error +(MSE) loss with the MEE on basic transfer learning algorithms such as +fine-tuning and linear probing, we can achieve competitive performance with +respect to state-of-the-art transfer learning algorithms. We justify our +arguments on both synthetic data and 5 real-world time-series data. + +
+
+ comment: Manuscript accepted at ECAI-23. Code available at + https://github.com/lpsilvestrin/mee-finetune +
+
+
+
+
+ + ♻ ☆ An Approximation Theory for Metric Space-Valued Functions With A View + Towards Deep Learning + + +
+ Motivated by the developing mathematics of deep learning, we build universal
+function approximators of continuous maps between arbitrary Polish metric
+spaces $\mathcal{X}$ and $\mathcal{Y}$ using elementary functions between
+Euclidean spaces as building blocks. Earlier results assume that the target
+space $\mathcal{Y}$ is a topological vector space. We overcome this limitation
+by ``randomization'': our approximators output discrete probability measures
+over $\mathcal{Y}$. When $\mathcal{X}$ and $\mathcal{Y}$ are Polish without
+additional structure, we prove very general qualitative guarantees; when they
+have suitable combinatorial structure, we prove quantitative guarantees for
+H\"{o}lder-like maps, including maps between finite graphs, solution operators
+to rough differential equations between certain Carnot groups, and continuous
+non-linear operators between Banach spaces arising in inverse problems. In
+particular, we show that the required number of Dirac measures is determined by
+the combinatorial structure of $\mathcal{X}$ and $\mathcal{Y}$. For barycentric
+$\mathcal{Y}$, including Banach spaces, $\mathbb{R}$-trees, Hadamard manifolds,
+or Wasserstein spaces on Polish metric spaces, our approximators reduce to
+$\mathcal{Y}$-valued functions. When the Euclidean approximators are neural
+networks, our constructions generalize transformer networks, providing a new
+probabilistic viewpoint of geometric deep learning.
+
+
+
+ comment: 14 Figures, 3 Tables, 78 Pages (Main 40, Proofs 26, Acknowledgments + and References 12) +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning for Human Activity Recognition Using 700,000 + Person-days of Wearable Data + + +
+ Advances in deep learning for human activity recognition have been relatively +limited due to the lack of large labelled datasets. In this study, we leverage +self-supervised learning techniques on the UK-Biobank activity tracker +dataset--the largest of its kind to date--containing more than 700,000 +person-days of unlabelled wearable sensor data. Our resulting activity +recognition model consistently outperformed strong baselines across seven +benchmark datasets, with an F1 relative improvement of 2.5%-100% (median +18.4%), the largest improvements occurring in the smaller datasets. In contrast +to previous studies, our results generalise across external datasets, devices, +and environments. Our open-source model will help researchers and developers to +build customisable and generalisable activity classifiers with high +performance. + +
+
+
+
+
+ + ♻ ☆ Classification of US Supreme Court Cases using BERT-Based Techniques + + +
+ Models based on bidirectional encoder representations from transformers +(BERT) produce state of the art (SOTA) results on many natural language +processing (NLP) tasks such as named entity recognition (NER), part-of-speech +(POS) tagging etc. An interesting phenomenon occurs when classifying long +documents such as those from the US supreme court where BERT-based models can +be considered difficult to use on a first-pass or out-of-the-box basis. In this +paper, we experiment with several BERT-based classification techniques for US +supreme court decisions or supreme court database (SCDB) and compare them with +the previous SOTA results. We then compare our results specifically with SOTA +models for long documents. We compare our results for two classification tasks: +(1) a broad classification task with 15 categories and (2) a fine-grained +classification task with 279 categories. Our best result produces an accuracy +of 80\% on the 15 broad categories and 60\% on the fine-grained 279 categories +which marks an improvement of 8\% and 28\% respectively from previously +reported SOTA results. + +
+
+
+
+
+ + ♻ ☆ Learning Optimal Prescriptive Trees from Observational Data + + +
+ We consider the problem of learning an optimal prescriptive tree (i.e., an +interpretable treatment assignment policy in the form of a binary tree) of +moderate depth, from observational data. This problem arises in numerous +socially important domains such as public health and personalized medicine, +where interpretable and data-driven interventions are sought based on data +gathered in deployment -- through passive collection of data -- rather than +from randomized trials. We propose a method for learning optimal prescriptive +trees using mixed-integer optimization (MIO) technology. We show that under +mild conditions our method is asymptotically exact in the sense that it +converges to an optimal out-of-sample treatment assignment policy as the number +of historical data samples tends to infinity. Contrary to existing literature, +our approach: 1) does not require data to be randomized, 2) does not impose +stringent assumptions on the learned trees, and 3) has the ability to model +domain specific constraints. Through extensive computational experiments, we +demonstrate that our asymptotic guarantees translate to significant performance +improvements in finite samples, as well as showcase our uniquely flexible +modeling power by incorporating budget and fairness constraints. + +
+
+
+
+
+ + ♻ ☆ Approximate blocked Gibbs sampling for Bayesian neural networks + + +
+ In this work, minibatch MCMC sampling for feedforward neural networks is made +more feasible. To this end, it is proposed to sample subgroups of parameters +via a blocked Gibbs sampling scheme. By partitioning the parameter space, +sampling is possible irrespective of layer width. It is also possible to +alleviate vanishing acceptance rates for increasing depth by reducing the +proposal variance in deeper layers. Increasing the length of a non-convergent +chain increases the predictive accuracy in classification tasks, so avoiding +vanishing acceptance rates and consequently enabling longer chain runs have +practical benefits. Moreover, non-convergent chain realizations aid in the +quantification of predictive uncertainty. An open problem is how to perform +minibatch MCMC sampling for feedforward neural networks in the presence of +augmented data. + +
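+
+ A minimal Metropolis-within-Gibbs sketch that updates one parameter block at
+ a time for a tiny one-hidden-layer network; the toy data, Gaussian
+ prior/likelihood, block structure, and proposal scale are illustrative
+ assumptions rather than the paper's sampler:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(50, 2))
+y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=50)
+
+def forward(params, X):
+    W1, b1, w2, b2 = params
+    return np.maximum(X @ W1 + b1, 0.0) @ w2 + b2    # one hidden ReLU layer
+
+def log_post(params):
+    resid = y - forward(params, X)
+    log_prior = -0.5 * sum((p ** 2).sum() for p in params)   # N(0, 1) prior
+    return -0.5 * (resid ** 2).sum() / 0.1 ** 2 + log_prior  # Gaussian likelihood
+
+params = [rng.normal(size=(2, 8)), np.zeros(8), rng.normal(size=8), np.zeros(1)]
+for step in range(2000):
+    for i in range(len(params)):                     # blocked updates, one group at a time
+        proposal = [p.copy() for p in params]
+        proposal[i] = proposal[i] + 0.02 * rng.normal(size=proposal[i].shape)
+        if np.log(rng.uniform()) < log_post(proposal) - log_post(params):
+            params = proposal                        # accept the block proposal
+```
+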
+
+
+
+
+ + ♻ ☆ Generalizing similarity in noisy setups: the DIBS phenomenon ECAI 2023 + + +
+ This work uncovers an interplay among data density, noise, and the +generalization ability in similarity learning. We consider Siamese Neural +Networks (SNNs), which are the basic form of contrastive learning, and explore +two types of noise that can impact SNNs, Pair Label Noise (PLN) and Single +Label Noise (SLN). Our investigation reveals that SNNs exhibit double descent +behaviour regardless of the training setup and that it is further exacerbated +by noise. We demonstrate that the density of data pairs is crucial for +generalization. When SNNs are trained on sparse datasets with the same amount +of PLN or SLN, they exhibit comparable generalization properties. However, when +using dense datasets, PLN cases generalize worse than SLN ones in the +overparametrized region, leading to a phenomenon we call Density-Induced Break +of Similarity (DIBS). In this regime, PLN similarity violation becomes +macroscopical, corrupting the dataset to the point where complete interpolation +cannot be achieved, regardless of the number of model parameters. Our analysis +also delves into the correspondence between online optimization and offline +generalization in similarity learning. The results show that this equivalence +fails in the presence of label noise in all the scenarios considered. + +
+
+ comment: v3: version accepted at ECAI 2023 + Supplementary Material +
+
+
+
+
+ + ♻ ☆ (Ab)using Images and Sounds for Indirect Instruction Injection in + Multi-Modal LLMs + + +
+ We demonstrate how images and sounds can be used for indirect prompt and +instruction injection in multi-modal LLMs. An attacker generates an adversarial +perturbation corresponding to the prompt and blends it into an image or audio +recording. When the user asks the (unmodified, benign) model about the +perturbed image or audio, the perturbation steers the model to output the +attacker-chosen text and/or make the subsequent dialog follow the attacker's +instruction. We illustrate this attack with several proof-of-concept examples +targeting LLaVa and PandaGPT. + +
+
+
+
+
+ + ♻ ☆ Transformer Training Strategies for Forecasting Multiple Load Time + Series + + +
+ In the smart grid of the future, accurate load forecasts on the level of +individual clients can help to balance supply and demand locally and to prevent +grid outages. While the number of monitored clients will increase with the +ongoing smart meter rollout, the amount of data per client will always be +limited. We evaluate whether a Transformer load forecasting model benefits from +a transfer learning strategy, where a global univariate model is trained on the +load time series from multiple clients. In experiments with two datasets +containing load time series from several hundred clients, we find that the +global training strategy is superior to the multivariate and local training +strategies used in related work. On average, the global training strategy +results in 21.8% and 12.8% lower forecasting errors than the two other +strategies, measured across forecasting horizons from one day to one month into +the future. A comparison to linear models, multi-layer perceptrons and LSTMs +shows that Transformers are effective for load forecasting when they are +trained with the global training strategy. + +
+
+
+
+
+ + ♻ ☆ Automated patent extraction powers generative modeling in focused + chemical spaces + + +
+ Deep generative models have emerged as an exciting avenue for inverse +molecular design, with progress coming from the interplay between training +algorithms and molecular representations. One of the key challenges in their +applicability to materials science and chemistry has been the lack of access to +sizeable training datasets with property labels. Published patents contain the +first disclosure of new materials prior to their publication in journals, and +are a vast source of scientific knowledge that has remained relatively untapped +in the field of data-driven molecular design. Because patents are filed seeking +to protect specific uses, molecules in patents can be considered to be weakly +labeled into application classes. Furthermore, patents published by the US +Patent and Trademark Office (USPTO) are downloadable and have machine-readable +text and molecular structures. In this work, we train domain-specific +generative models using patent data sources by developing an automated pipeline +to go from USPTO patent digital files to the generation of novel candidates +with minimal human intervention. We test the approach on two in-class extracted +datasets, one in organic electronics and another in tyrosine kinase inhibitors. +We then evaluate the ability of generative models trained on these in-class +datasets on two categories of tasks (distribution learning and property +optimization), identify strengths and limitations, and suggest possible +explanations and remedies that could be used to overcome these in practice. + +
+
+ comment: Digital Discovery (2023) +
+
+
+
+
+ + ♻ ☆ Learning when to observe: A frugal reinforcement learning framework for + a high-cost world ECML-PKDD 2023 + + +
+ Reinforcement learning (RL) has been shown to learn sophisticated control
+policies for complex tasks including games, robotics, heating and cooling
+systems and text generation. The action-perception cycle in RL, however,
+generally assumes that a measurement of the state of the environment is
+available at each time step without a cost. In applications such as materials
+design, deep-sea and planetary robot exploration and medicine, however, there
+can be a high cost associated with measuring, or even approximating, the state
+of the environment. In this paper, we survey the recently growing literature
+that adopts the perspective that an RL agent might not need, or even want, a
+costly measurement at each time step. Within this context, we propose the Deep
+Dynamic Multi-Step Observationless Agent (DMSOA), contrast it with the
+literature and empirically evaluate it on OpenAI gym and Atari Pong
+environments. Our results show that DMSOA learns a better policy with fewer
+decision steps and measurements than the considered alternative from the
+literature. The corresponding code is available at:
+\url{https://github.com/cbellinger27/Learning-when-to-observe-in-RL}
+
+
+
+ comment: Accepted for presentation at ECML-PKDD 2023 workshop track: + Simplification, Compression, Efficiency and Frugality for Artificial + Intelligence (SCEFA) +
+
+
+
+
+ + ♻ ☆ CPDG: A Contrastive Pre-Training Method for Dynamic Graph Neural + Networks + + +
+ Dynamic graph data mining has gained popularity in recent years due to the +rich information contained in dynamic graphs and their widespread use in the +real world. Despite the advances in dynamic graph neural networks (DGNNs), the +rich information and diverse downstream tasks have posed significant +difficulties for the practical application of DGNNs in industrial scenarios. To +this end, in this paper, we propose to address them by pre-training and present +the Contrastive Pre-Training Method for Dynamic Graph Neural Networks (CPDG). +CPDG tackles the challenges of pre-training for DGNNs, including generalization +capability and long-short term modeling capability, through a flexible +structural-temporal subgraph sampler along with structural-temporal contrastive +pre-training schemes. Extensive experiments conducted on both large-scale +research and industrial dynamic graph datasets show that CPDG outperforms +existing methods in dynamic graph pre-training for various downstream tasks +under three transfer settings. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Learning Temporally Extended Skills in Continuous Domains as Symbolic + Actions for Planning + + +
+ Problems which require both long-horizon planning and continuous control +capabilities pose significant challenges to existing reinforcement learning +agents. In this paper we introduce a novel hierarchical reinforcement learning +agent which links temporally extended skills for continuous control with a +forward model in a symbolic discrete abstraction of the environment's state for +planning. We term our agent SEADS for Symbolic Effect-Aware Diverse Skills. We +formulate an objective and corresponding algorithm which leads to unsupervised +learning of a diverse set of skills through intrinsic motivation given a known +state abstraction. The skills are jointly learned with the symbolic forward +model which captures the effect of skill execution in the state abstraction. +After training, we can leverage the skills as symbolic actions using the +forward model for long-horizon planning and subsequently execute the plan using +the learned continuous-action control skills. The proposed algorithm learns +skills and forward models that can be used to solve complex tasks which require +both continuous control and long-horizon planning capabilities with high +success rate. It compares favorably with other flat and hierarchical +reinforcement learning baseline agents and is successfully demonstrated with a +real robot. + +
+
+ comment: Project website (including video) is available at + https://seads.is.tue.mpg.de/. (v2) Accepted for publication at the 6th + Conference on Robot Learning (CoRL) 2022, Auckland, New Zealand. (v3) Added + details on checkpointing (S.8.1), with references on p.7, p.8, p.21 to + clarify number of env. steps of reported results +
+
+
+
+
+ + ♻ ☆ AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias + Estimation ECCV 2022 + + +
+ In Federated Learning (FL), a number of clients or devices collaborate to +train a model without sharing their data. Models are optimized locally at each +client and further communicated to a central hub for aggregation. While FL is +an appealing decentralized training paradigm, heterogeneity among data from +different clients can cause the local optimization to drift away from the +global objective. In order to estimate and therefore remove this drift, +variance reduction techniques have been incorporated into FL optimization +recently. However, these approaches inaccurately estimate the clients' drift +and ultimately fail to remove it properly. In this work, we propose an adaptive +algorithm that accurately estimates drift across clients. In comparison to +previous works, our approach necessitates less storage and communication +bandwidth, as well as lower compute costs. Additionally, our proposed +methodology induces stability by constraining the norm of estimates for client +drift, making it more practical for large scale FL. Experimental findings +demonstrate that the proposed algorithm converges significantly faster and +achieves higher accuracy than the baselines across various FL benchmarks. + +
+
+ comment: Published as a conference paper at ECCV 2022; Corrected some typos in + the text and a baseline algorithm +
+
+
+
+
+ + ♻ ☆ Deployment of Image Analysis Algorithms under Prevalence Shifts + + +
+ Domain gaps are among the most relevant roadblocks in the clinical +translation of machine learning (ML)-based solutions for medical image +analysis. While current research focuses on new training paradigms and network +architectures, little attention is given to the specific effect of prevalence +shifts on an algorithm deployed in practice. Such discrepancies between class +frequencies in the data used for a method's development/validation and that in +its deployment environment(s) are of great importance, for example in the +context of artificial intelligence (AI) democratization, as disease prevalences +may vary widely across time and location. Our contribution is twofold. First, +we empirically demonstrate the potentially severe consequences of missing +prevalence handling by analyzing (i) the extent of miscalibration, (ii) the +deviation of the decision threshold from the optimum, and (iii) the ability of +validation metrics to reflect neural network performance on the deployment +population as a function of the discrepancy between development and deployment +prevalence. Second, we propose a workflow for prevalence-aware image +classification that uses estimated deployment prevalences to adjust a trained +classifier to a new environment, without requiring additional annotated +deployment data. Comprehensive experiments based on a diverse set of 30 medical +classification tasks showcase the benefit of the proposed workflow in +generating better classifier decisions and more reliable performance estimates +compared to current practice. + +
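+
+ The core prior-shift correction behind such a workflow can be sketched as the
+ standard re-weighting of predicted posteriors by the ratio of deployment to
+ development prevalences; the probabilities and prevalence estimates below are
+ placeholders, and the paper's full workflow also covers threshold and metric
+ adjustment:
+
+```python
+import numpy as np
+
+def adjust_for_prevalence(probs, dev_prev, deploy_prev):
+    """Re-weight predicted posteriors when class prevalences shift at deployment."""
+    w = np.asarray(deploy_prev) / np.asarray(dev_prev)
+    adjusted = probs * w                               # p(y|x) * pi'_y / pi_y
+    return adjusted / adjusted.sum(axis=1, keepdims=True)
+
+probs = np.array([[0.7, 0.3], [0.4, 0.6]])     # classifier outputs on two cases
+dev_prev = [0.5, 0.5]                          # class prevalence in development data
+deploy_prev = [0.9, 0.1]                       # estimated prevalence at deployment
+print(adjust_for_prevalence(probs, dev_prev, deploy_prev))
+```
+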
+
+
+
+
+ + ♻ ☆ Rényi Divergence Deep Mutual Learning + + +
+ This paper revisits Deep Mutual Learning (DML), a simple yet effective +computing paradigm. We propose using R\'{e}nyi divergence instead of the KL +divergence, which is more flexible and tunable, to improve vanilla DML. This +modification is able to consistently improve performance over vanilla DML with +limited additional complexity. The convergence properties of the proposed +paradigm are analyzed theoretically, and Stochastic Gradient Descent with a +constant learning rate is shown to converge with $\mathcal{O}(1)$-bias in the +worst case scenario for nonconvex optimization tasks. That is, learning will +reach nearby local optima but continue searching within a bounded scope, which +may help mitigate overfitting. Finally, our extensive empirical results +demonstrate the advantage of combining DML and R\'{e}nyi divergence, leading to +further improvement in model generalization. + +
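+
+ The Rényi divergence that replaces the KL term can be written down directly;
+ below is a sketch for two peer models' softmax outputs, where the order alpha
+ and the way the term enters each peer's loss are illustrative assumptions:
+
+```python
+import torch
+
+def renyi_divergence(p, q, alpha=1.5, eps=1e-12):
+    """D_alpha(p || q) = 1 / (alpha - 1) * log sum_i p_i^alpha * q_i^(1 - alpha)."""
+    p, q = p.clamp_min(eps), q.clamp_min(eps)
+    return torch.log(torch.sum(p ** alpha * q ** (1.0 - alpha), dim=-1)) / (alpha - 1.0)
+
+# Peer predictions in a two-network mutual-learning setup (placeholders).
+p = torch.softmax(torch.randn(4, 10), dim=-1)
+q = torch.softmax(torch.randn(4, 10), dim=-1)
+mimicry_loss = renyi_divergence(p, q).mean()   # added to each peer's supervised loss
+print(mimicry_loss.item())
+```
+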
+
+
+
+
+ + ♻ ☆ Deep Learning-based Anonymization of Chest Radiographs: A + Utility-preserving Measure for Patient Privacy MICCAI 2023 + + +
+ Robust and reliable anonymization of chest radiographs constitutes an +essential step before publishing large datasets of such for research purposes. +The conventional anonymization process is carried out by obscuring personal +information in the images with black boxes and removing or replacing +meta-information. However, such simple measures retain biometric information in +the chest radiographs, allowing patients to be re-identified by a linkage +attack. Therefore, there is an urgent need to obfuscate the biometric +information appearing in the images. We propose the first deep learning-based +approach (PriCheXy-Net) to targetedly anonymize chest radiographs while +maintaining data utility for diagnostic and machine learning purposes. Our +model architecture is a composition of three independent neural networks that, +when collectively used, allow for learning a deformation field that is able to +impede patient re-identification. Quantitative results on the ChestX-ray14 +dataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC) +after re-training with little impact on the abnormality classification +performance. This indicates the ability to preserve underlying abnormality +patterns while increasing patient privacy. Lastly, we compare our proposed +anonymization approach with two other obfuscation-based methods (Privacy-Net, +DP-Pix) and demonstrate the superiority of our method towards resolving the +privacy-utility trade-off for chest radiographs. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Generalizable Embeddings with Cross-batch Metric Learning + + +
+ Global average pooling (GAP) is a popular component in deep metric learning +(DML) for aggregating features. Its effectiveness is often attributed to +treating each feature vector as a distinct semantic entity and GAP as a +combination of them. Albeit substantiated, such an explanation's algorithmic +implications to learn generalizable entities to represent unseen classes, a +crucial DML goal, remain unclear. To address this, we formulate GAP as a convex +combination of learnable prototypes. We then show that the prototype learning +can be expressed as a recursive process fitting a linear predictor to a batch +of samples. Building on that perspective, we consider two batches of disjoint +classes at each iteration and regularize the learning by expressing the samples +of a batch with the prototypes that are fitted to the other batch. We validate +our approach on 4 popular DML benchmarks. + +
+
+ comment: \c{opyright} 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
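+
+ The reformulation of GAP as a convex combination of learnable prototypes can
+ be sketched roughly as follows; the prototype count and the soft-assignment
+ rule are assumptions for illustration, not the paper's exact formulation:
+
+```python
+import torch
+import torch.nn as nn
+
+class PrototypeGAP(nn.Module):
+    """Pool spatial features as a convex combination of learnable prototypes."""
+    def __init__(self, dim, n_prototypes=16):
+        super().__init__()
+        self.prototypes = nn.Parameter(torch.randn(n_prototypes, dim))
+
+    def forward(self, feats):                      # feats: (batch, hw, dim)
+        attn = torch.softmax(feats @ self.prototypes.t(), dim=-1)   # soft assignment
+        weights = attn.mean(dim=1)                 # convex weights over prototypes
+        return weights @ self.prototypes           # (batch, dim) pooled embedding
+
+pool = PrototypeGAP(dim=128)
+feats = torch.randn(8, 49, 128)                    # e.g. a flattened 7x7 feature map
+print(pool(feats).shape)                           # torch.Size([8, 128])
+```
+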
+
+
+
+
+ + ♻ ☆ Shuffled Multi-Channel Sparse Signal Recovery SP + + +
+ Mismatches between samples and their respective channel or target commonly +arise in several real-world applications. For instance, whole-brain calcium +imaging of freely moving organisms, multiple-target tracking or multi-person +contactless vital sign monitoring may be severely affected by mismatched +sample-channel assignments. To systematically address this fundamental problem, +we pose it as a signal reconstruction problem where we have lost +correspondences between the samples and their respective channels. Assuming +that we have a sensing matrix for the underlying signals, we show that the +problem is equivalent to a structured unlabeled sensing problem, and establish +sufficient conditions for unique recovery. To the best of our knowledge, a +sampling result for the reconstruction of shuffled multi-channel signals has +not been considered in the literature and existing methods for unlabeled +sensing cannot be directly applied. We extend our results to the case where the +signals admit a sparse representation in an overcomplete dictionary (i.e., the +sensing matrix is not precisely known), and derive sufficient conditions for +the reconstruction of shuffled sparse signals. We propose a robust +reconstruction method that combines sparse signal recovery with robust linear +regression for the two-channel case. The performance and robustness of the +proposed approach is illustrated in an application related to whole-brain +calcium imaging. The proposed methodology can be generalized to sparse signal +representations other than the ones considered in this work to be applied in a +variety of real-world problems with imprecise measurement or channel +assignment. + +
+
+ comment: Submitted to TSP +
+
+
+
+
+ + ♻ ☆ Reducing Training Time in Cross-Silo Federated Learning using Multigraph + Topology ICCV 2023 + + +
+ Federated learning is an active research topic since it enables several
+participants to jointly train a model without sharing local data. Currently,
+cross-silo federated learning is a popular training setting that utilizes a few
+hundred reliable data silos with high-speed access links to train a model.
+While this approach has been widely applied in real-world scenarios, designing
+a robust topology to reduce the training time remains an open problem. In this
+paper, we present a new multigraph topology for cross-silo federated learning.
+We first construct the multigraph using the overlay graph. We then parse this
+multigraph into different simple graphs with isolated nodes. The existence of
+isolated nodes allows us to perform model aggregation without waiting for other
+nodes, hence effectively reducing the training time. Intensive experiments on
+three public datasets show that our proposed method significantly reduces the
+training time compared with recent state-of-the-art topologies while
+maintaining the accuracy of the learned model. Our code can be found at
+https://github.com/aioz-ai/MultigraphFL
+
+
+
+ comment: accepted in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ BiofilmScanner: A Computational Intelligence Approach to Obtain + Bacterial Cell Morphological Attributes from Biofilm Image + + +
+ Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for
+sulfate-reducing bacteria (SRB) that are associated with corrosion issues
+caused by microorganisms. SRB-based biofilms are thought to be responsible for
+the billion-dollar-per-year bio-corrosion of metal infrastructure.
+Understanding the extraction of the bacterial cells' shape and size properties
+in the SRB-biofilm at different growth stages will assist with the design of
+anti-corrosion techniques. However, numerous issues affect current approaches,
+including time-consuming geometric property extraction, low efficiency, and
+high error rates. This paper proposes BiofilmScanner, a Yolact-based deep
+learning method integrated with invariant moments to address these problems.
+Our approach efficiently detects and segments bacterial cells in an SRB image
+while invariant moments simultaneously measure the geometric characteristics of
+the segmented cells with low errors. The numerical experiments of the proposed
+method demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our
+earlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring
+the geometric properties of the cell. Furthermore, the BiofilmScanner achieved
+an F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%
+and 75.18%, respectively.
+
+
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ Defining data science: a new field of inquiry + + +
+ Data science is not a science. It is a research paradigm. Its power, scope,
+and scale will surpass science, our most powerful research paradigm, to enable
+knowledge discovery and change our world. We have yet to understand and define
+it, which is vital to realizing its potential and managing its risks. Modern
+data science is in its infancy. Emerging slowly since 1962 and rapidly since
+2000, it is a fundamentally new field of inquiry, one of the most active,
+powerful, and rapidly evolving 21st century innovations. Due to its value,
+power, and applicability, it is emerging in over 40 disciplines, hundreds of
+research areas, and thousands of applications. Millions of data science
+publications contain myriad definitions of data science and data science
+problem solving. Due to its infancy, many definitions are independent,
+application specific, mutually incomplete, redundant, or inconsistent, hence so
+is data science. This research addresses this data science multiple definitions
+challenge by proposing the development of a coherent, unified definition based
+on a data science reference framework using a data science journal for the data
+science community to achieve such a definition. This paper provides candidate
+definitions for essential data science artifacts that are required to discuss
+such a definition. They are based on the classical research paradigm concept
+consisting of a philosophy of data science, the data science problem solving
+paradigm, and the six component data science reference framework (axiology,
+ontology, epistemology, methodology, methods, technology) that is a frequently
+called for unifying framework with which to define, unify, and evolve data
+science. It presents challenges for defining data science, solution approaches,
+i.e., means for defining data science, and their requirements and benefits as
+the basis of a comprehensive solution.
+
+
+
+
+
+
+ + ♻ ☆ NeRF-GAN Distillation for Efficient 3D-Aware Generation with + Convolutions + + +
+ Pose-conditioned convolutional generative models struggle with high-quality +3D-consistent image generation from single-view datasets, due to their lack of +sufficient 3D priors. Recently, the integration of Neural Radiance Fields +(NeRFs) and generative models, such as Generative Adversarial Networks (GANs), +has transformed 3D-aware generation from single-view images. NeRF-GANs exploit +the strong inductive bias of neural 3D representations and volumetric rendering +at the cost of higher computational complexity. This study aims at revisiting +pose-conditioned 2D GANs for efficient 3D-aware generation at inference time by +distilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and +effective method, based on re-using the well-disentangled latent space of a +pre-trained NeRF-GAN in a pose-conditioned convolutional network to directly +generate 3D-consistent images corresponding to the underlying 3D +representations. Experiments on several datasets demonstrate that the proposed +method obtains results comparable with volumetric rendering in terms of quality +and 3D consistency while benefiting from the computational advantage of +convolutional networks. The code will be available at: +https://github.com/mshahbazi72/NeRF-GAN-Distillation + +
+
+
+
+
+ + ♻ ☆ Typical and atypical solutions in non-convex neural networks with + discrete and continuous weights + + +
+ We study the binary and continuous negative-margin perceptrons as simple
+non-convex neural network models learning random rules and associations. We
+analyze the geometry of the landscape of solutions in both models and find
+important similarities and differences. Both models exhibit subdominant
+minimizers which are extremely flat and wide. These minimizers coexist with a
+background of dominant solutions which are composed of an exponential number of
+algorithmically inaccessible small clusters for the binary case (the frozen
+1-RSB phase) or a hierarchical structure of clusters of different sizes for the
+spherical case (the full RSB phase). In both cases, when a certain threshold in
+constraint density is crossed, the local entropy of the wide flat minima
+becomes non-monotonic, indicating a break-up of the space of robust solutions
+into disconnected components. This has a strong impact on the behavior of
+algorithms in binary models, which cannot access the remaining isolated
+clusters. For the spherical case the behaviour is different, since even beyond
+the disappearance of the wide flat minima the remaining solutions are shown to
+always be surrounded by a large number of other solutions at any distance, up
+to capacity. Indeed, we exhibit numerical evidence that algorithms seem to find
+solutions up to the SAT/UNSAT transition, that we compute here using a 1RSB
+approximation. For both models, the generalization performance as a learning
+device is shown to be greatly improved by the existence of wide flat minimizers
+even when trained in the highly underconstrained regime of very negative
+margins.
+
+
+
+ comment: 34 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Lipschitz-regularized gradient flows and generative particle algorithms + for high-dimensional scarce data + + +
+ We build a new class of generative algorithms capable of efficiently learning
+an arbitrary target distribution from possibly scarce, high-dimensional data
+and of subsequently generating new samples. These generative algorithms are
+particle-based and are constructed as gradient flows of Lipschitz-regularized
+Kullback-Leibler or other $f$-divergences, where data from a source
+distribution can be stably transported as particles toward the vicinity of the
+target distribution. As a highlighted result in data integration, we
+demonstrate that the proposed algorithms correctly transport gene expression
+data points with dimension exceeding 54K, while the sample size is typically
+only in the hundreds.
+
+
+
+
+
+ + ♻ ☆ Improving Automated Hemorrhage Detection in Sparse-view Computed + Tomography via Deep Convolutional Neural Network based Artifact Reduction + + +
+ Purpose: Sparse-view computed tomography (CT) is an effective way to reduce
+dose by lowering the total number of views acquired, albeit at the expense of
+image quality, which, in turn, can impact the ability to detect diseases. We
+explore deep learning-based artifact reduction in sparse-view cranial CT scans
+and its impact on automated hemorrhage detection. Methods: We trained a U-Net
+for artifact reduction on simulated sparse-view cranial CT scans from 3000
+patients obtained from a public dataset and reconstructed with varying levels
+of sub-sampling. Additionally, we trained a convolutional neural network on
+fully sampled CT data from 17,545 patients for automated hemorrhage detection.
+We evaluated the classification performance using the area under the receiver
+operating characteristic curves (AUC-ROCs) with corresponding 95% confidence
+intervals (CIs) and the DeLong test, along with confusion matrices. The
+performance of the U-Net was compared to an analytical approach based on total
+variation (TV). Results: The U-Net was superior to unprocessed and TV-processed
+images with respect to image quality and automated hemorrhage diagnosis. With
+U-Net post-processing, the number of views can be reduced from 4096 views
+(AUC-ROC: 0.974; 95% CI: 0.972-0.976) to 512 views (0.973; 0.971-0.975) with
+minimal decrease in hemorrhage detection (P<.001) and to 256 views (0.967;
+0.964-0.969) with a slight performance decrease (P<.001). Conclusion: The
+results suggest that U-Net based artifact reduction substantially enhances
+automated hemorrhage detection in sparse-view cranial CTs. Our findings
+highlight that appropriate post-processing is crucial for optimal image quality
+and diagnostic accuracy while minimizing radiation dose.
+
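The evaluation metric can be made concrete with a small sketch: AUC-ROC with a 95% confidence interval, here computed by bootstrap as a simple stand-in for the DeLong method used in the paper, on synthetic labels and scores rather than the actual study data.

```python
# Minimal sketch: AUC-ROC with a bootstrap 95% CI (a simple stand-in for DeLong).
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=2000)                       # synthetic hemorrhage labels
y_score = np.clip(y_true * 0.6 + rng.normal(0.2, 0.3, 2000), 0, 1)  # synthetic classifier scores

auc = roc_auc_score(y_true, y_score)
boot = []
for _ in range(1000):
    idx = rng.integers(0, y_true.size, y_true.size)          # resample with replacement
    if y_true[idx].min() == y_true[idx].max():
        continue                                             # need both classes in the resample
    boot.append(roc_auc_score(y_true[idx], y_score[idx]))
lo, hi = np.percentile(boot, [2.5, 97.5])
print(f"AUC-ROC: {auc:.3f}; 95% CI: {lo:.3f}-{hi:.3f}")
```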
+
+ comment: 11 pages, 6 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and the Emergence of Attributes Associations + + +
+ In response to an object presentation, supervised learning schemes generally +respond with a parsimonious label. Upon a similar presentation we humans +respond again with a label, but are flooded, in addition, by a myriad of +associations. A significant portion of these consist of the presented object +attributes. Contrastive learning is a semi-supervised learning scheme based on +the application of identity preserving transformations on the object input +representations. It is conjectured in this work that these same applied +transformations preserve, in addition to the identity of the presented object, +also the identity of its semantically meaningful attributes. The corollary of +this is that the output representations of such a contrastive learning scheme +contain valuable information not only for the classification of the presented +object, but also for the presence or absence decision of any attribute of +interest. Simulation results which demonstrate this idea and the feasibility of +this conjecture are presented. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model + Predictive Control + + +
+ Model-based control requires an accurate model of the system dynamics for +precisely and safely controlling the robot in complex and dynamic environments. +Moreover, in the presence of variations in the operating conditions, the model +should be continuously refined to compensate for dynamics changes. In this +paper, we present a self-supervised learning approach that actively models the +dynamics of nonlinear robotic systems. We combine offline learning from past +experience and online learning from current robot interaction with the unknown +environment. These two ingredients enable a highly sample-efficient and +adaptive learning process, capable of accurately inferring model dynamics in +real-time even in operating regimes that greatly differ from the training +distribution. Moreover, we design an uncertainty-aware model predictive +controller that is heuristically conditioned to the aleatoric (data) +uncertainty of the learned dynamics. This controller actively chooses the +optimal control actions that (i) optimize the control performance and (ii) +improve the efficiency of online learning sample collection. We demonstrate the +effectiveness of our method through a series of challenging real-world +experiments using a quadrotor system. Our approach showcases high resilience +and generalization capabilities by consistently adapting to unseen flight +conditions, while it significantly outperforms classical and adaptive control +baselines. + +
+
+
+
+
+ + ♻ ☆ Predicting protein variants with equivariant graph neural networks ICML + + +
+ Pre-trained models have been successful in many protein engineering tasks. +Most notably, sequence-based models have achieved state-of-the-art performance +on protein fitness prediction while structure-based models have been used +experimentally to develop proteins with enhanced functions. However, there is a +research gap in comparing structure- and sequence-based methods for predicting +protein variants that are better than the wildtype protein. This paper aims to +address this gap by conducting a comparative study between the abilities of +equivariant graph neural networks (EGNNs) and sequence-based approaches to +identify promising amino-acid mutations. The results show that our proposed +structural approach achieves a competitive performance to sequence-based +methods while being trained on significantly fewer molecules. Additionally, we +find that combining assay labelled data with structure pre-trained models +yields similar trends as with sequence pre-trained models. + Our code and trained models can be found at: +https://github.com/semiluna/partIII-amino-acid-prediction. + +
+
+ comment: 4 pages, 2 figures, accepted to the 2023 ICML Workshop on + Computational Biology +
+
+
+
+
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in the real-world implementation +for recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target categories retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies. + +
+
+
+
+
+ + ♻ ☆ Minimax Optimal Kernel Operator Learning via Multilevel Training ICLR 2023 + + +
+ Learning mappings between infinite-dimensional function spaces has achieved +empirical success in many disciplines of machine learning, including generative +modeling, functional data analysis, causal inference, and multi-agent +reinforcement learning. In this paper, we study the statistical limit of +learning a Hilbert-Schmidt operator between two infinite-dimensional Sobolev +reproducing kernel Hilbert spaces. We establish the information-theoretic lower +bound in terms of the Sobolev Hilbert-Schmidt norm and show that a +regularization that learns the spectral components below the bias contour and +ignores the ones that are above the variance contour can achieve the optimal +learning rate. At the same time, the spectral components between the bias and +variance contours give us flexibility in designing computationally feasible +machine learning algorithms. Based on this observation, we develop a multilevel +kernel operator learning algorithm that is optimal when learning linear +operators between infinite-dimensional function spaces. + +
+
+ comment: ICLR 2023 spotlight +
+
+
+
+
+ + ♻ ☆ Does Circuit Analysis Interpretability Scale? Evidence from Multiple + Choice Capabilities in Chinchilla + + +
+ \emph{Circuit analysis} is a promising technique for understanding the +internal mechanisms of language models. However, existing analyses are done in +small models far from the state of the art. To address this, we present a case +study of circuit analysis in the 70B Chinchilla model, aiming to test the +scalability of circuit analysis. In particular, we study multiple-choice +question answering, and investigate Chinchilla's capability to identify the +correct answer \emph{label} given knowledge of the correct answer \emph{text}. +We find that the existing techniques of logit attribution, attention pattern +visualization, and activation patching naturally scale to Chinchilla, allowing +us to identify and categorize a small set of `output nodes' (attention heads +and MLPs). + We further study the `correct letter' category of attention heads aiming to +understand the semantics of their features, with mixed results. For normal +multiple-choice question answers, we significantly compress the query, key and +value subspaces of the head without loss of performance when operating on the +answer labels for multiple-choice questions, and we show that the query and key +subspaces represent an `Nth item in an enumeration' feature to at least some +extent. However, when we attempt to use this explanation to understand the +heads' behaviour on a more general distribution including randomized answer +labels, we find that it is only a partial explanation, suggesting there is more +to learn about the operation of `correct letter' heads on multiple choice +question answering. + +
+
+
+
+
+ + ♻ ☆ S3M: Scalable Statistical Shape Modeling through Unsupervised + Correspondences MICCAI 2023 + + +
+ Statistical shape models (SSMs) are an established way to represent the +anatomy of a population with various clinically relevant applications. However, +they typically require domain expertise, and labor-intensive landmark +annotations to construct. We address these shortcomings by proposing an +unsupervised method that leverages deep geometric features and functional +correspondences to simultaneously learn local and global shape structures +across population anatomies. Our pipeline significantly improves unsupervised +correspondence estimation for SSMs compared to baseline methods, even on highly +irregular surface topologies. We demonstrate this for two different anatomical +structures: the thyroid and a multi-chamber heart dataset. Furthermore, our +method is robust enough to learn from noisy neural network predictions, +potentially enabling scaling SSMs to larger patient populations without manual +segmentation annotation. + +
+
+ comment: Accepted at MICCAI 2023. 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Deep learning based Meta-modeling for Multi-objective Technology + Optimization of Electrical Machines + + +
+ Optimization of rotating electrical machines is both time-consuming and
+computationally expensive. Because of their different parametrizations, design
+optimization is commonly executed separately for each machine technology. In
+this paper, we present the application of a variational auto-encoder (VAE) to
+optimize two different machine technologies simultaneously, namely an
+asynchronous machine and a permanent magnet synchronous machine. After
+training, we employ a deep neural network and a decoder as meta-models to
+predict global key performance indicators (KPIs) and to generate associated new
+designs, respectively, through a unified latent space in the optimization loop.
+Numerical results demonstrate concurrent parametric multi-objective technology
+optimization in the high-dimensional design space. The VAE-based approach is
+quantitatively compared to a classical deep learning-based direct approach for
+KPI prediction.
+
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Deep Unfolded Simulated Bifurcation for Massive MIMO Signal Detection + + +
+ Multiple-input multiple-output (MIMO) is a key ingredient of next-generation +wireless communications. Recently, various MIMO signal detectors based on deep +learning techniques and quantum(-inspired) algorithms have been proposed to +improve the detection performance compared with conventional detectors. This +paper focuses on the simulated bifurcation (SB) algorithm, a quantum-inspired +algorithm. This paper proposes two techniques to improve its detection +performance. The first is modifying the algorithm inspired by the +Levenberg-Marquardt algorithm to eliminate local minima of maximum likelihood +detection. The second is the use of deep unfolding, a deep learning technique +to train the internal parameters of an iterative algorithm. We propose a +deep-unfolded SB by making the update rule of SB differentiable. The numerical +results show that these proposed detectors significantly improve the signal +detection performance in massive MIMO systems. + +
+
comment: 5 pages, 4 figures; code is available at
  https://github.com/s-takabe/unfolded_simbif
+
+
+
+
+ + ♻ ☆ Learning-Augmented B-Trees + + +
+ We study learning-augmented binary search trees (BSTs) and B-Trees via Treaps +with composite priorities. The result is a simple search tree where the depth +of each item is determined by its predicted weight $w_x$. To achieve the +result, each item $x$ has its composite priority +$-\lfloor\log\log(1/w_x)\rfloor + U(0, 1)$ where $U(0, 1)$ is the uniform +random variable. This generalizes the recent learning-augmented BSTs +[Lin-Luo-Woodruff ICML`22], which only work for Zipfian distributions, to +arbitrary inputs and predictions. It also gives the first B-Tree data structure +that can provably take advantage of localities in the access sequence via +online self-reorganization. The data structure is robust to prediction errors +and handles insertions, deletions, as well as prediction updates. + +
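The composite priority above is concrete enough to sketch. The following minimal Python treap is an illustration rather than the authors' implementation: it assigns each key the priority $-\lfloor\log\log(1/w_x)\rfloor + U(0, 1)$ (natural logarithm assumed) so that keys with larger predicted weights sit closer to the root; the weight clamping and the toy Zipf-like weights are assumptions for the sketch.

```python
import math
import random

class Node:
    def __init__(self, key, weight):
        self.key = key
        # Composite priority from the abstract: -floor(log log (1/w_x)) + U(0, 1).
        # Natural log assumed; the clamp keeps log(log(1/w)) defined for any weight.
        w = min(max(weight, 1e-300), 1.0 - 1e-9)
        self.priority = -math.floor(math.log(math.log(1.0 / w))) + random.random()
        self.left = None
        self.right = None

def rotate_right(t):
    l = t.left
    t.left, l.right = l.right, t
    return l

def rotate_left(t):
    r = t.right
    t.right, r.left = r.left, t
    return r

def insert(t, key, weight):
    """Standard treap insertion, max-heap ordered on the composite priority."""
    if t is None:
        return Node(key, weight)
    if key < t.key:
        t.left = insert(t.left, key, weight)
        if t.left.priority > t.priority:
            t = rotate_right(t)
    else:
        t.right = insert(t.right, key, weight)
        if t.right.priority > t.priority:
            t = rotate_left(t)
    return t

def depth(t, key, d=0):
    if t is None:
        return None
    if key == t.key:
        return d
    return depth(t.left if key < t.key else t.right, key, d + 1)

# Zipf-like predicted weights: heavier (more frequent) keys should end up near the root.
root = None
for k in range(1000):
    root = insert(root, k, 1.0 / (k + 2) ** 1.2)
print("depth of hottest key:", depth(root, 0), "| depth of a cold key:", depth(root, 900))
```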
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques + + +
+ In the contemporary digital landscape, online reviews have become an +indispensable tool for promoting products and services across various +businesses. Marketers, advertisers, and online businesses have found incentives +to create deceptive positive reviews for their products and negative reviews +for their competitors' offerings. As a result, the writing of deceptive reviews +has become an unavoidable practice for businesses seeking to promote themselves +or undermine their rivals. Detecting such deceptive reviews has become an +intense and ongoing area of research. This research paper proposes a machine +learning model to identify deceptive reviews, with a particular focus on +restaurants. This study delves into the performance of numerous experiments +conducted on a dataset of restaurant reviews known as the Deceptive Opinion +Spam Corpus. To accomplish this, an n-gram model and max features are developed +to effectively identify deceptive content, particularly focusing on fake +reviews. A benchmark study is undertaken to explore the performance of two +different feature extraction techniques, which are then coupled with five +distinct machine learning classification algorithms. The experimental results +reveal that the passive aggressive classifier stands out among the various +algorithms, showcasing the highest accuracy not only in text classification but +also in identifying fake reviews. Moreover, the research delves into data +augmentation and implements various deep learning techniques to further enhance +the process of detecting deceptive reviews. The findings shed light on the +efficacy of the proposed machine learning approach and offer valuable insights +into dealing with deceptive reviews in the realm of online businesses. + +
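As a rough illustration of the n-gram/max-features pipeline with a passive aggressive classifier described above, a scikit-learn sketch could look like the following; the CSV path, column names, and hyperparameters are assumptions, not the paper's settings.

```python
# Minimal sketch of TF-IDF n-grams + PassiveAggressiveClassifier for deceptive-review detection.
# The file "deceptive_opinion_spam.csv" and its columns (text, deceptive) are hypothetical.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

df = pd.read_csv("deceptive_opinion_spam.csv")
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["deceptive"], test_size=0.2, random_state=42)

pipeline = Pipeline([
    # word uni/bi-grams with a capped vocabulary ("max features" in the abstract)
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=50_000)),
    ("clf", PassiveAggressiveClassifier(max_iter=1000, random_state=42)),
])
pipeline.fit(X_train, y_train)
print("accuracy:", accuracy_score(y_test, pipeline.predict(X_test)))
```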
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Stochastic MPC for energy hubs using data driven demand forecasting + + +
+ Energy hubs convert and distribute energy resources by combining different +energy inputs through multiple conversion and storage components. The optimal +operation of the energy hub exploits its flexibility to increase the energy +efficiency and reduce the operational costs. However, uncertainties in the +demand present challenges to energy hub optimization. In this paper, we propose +a stochastic MPC controller to minimize energy costs using chance constraints +for the uncertain electricity and thermal demands. Historical data is used to +build a demand prediction model based on Gaussian processes to generate a +forecast of the future electricity and heat demands. The stochastic +optimization problem is solved via the Scenario Approach by sampling multi-step +demand trajectories from the derived prediction model. The performance of the +proposed predictor and of the stochastic controller is verified on a simulated +energy hub model and demand data from a real building. + +
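A hedged sketch of the forecasting step is shown below: a Gaussian process is fit to historical demand and multi-step demand scenarios are sampled for the Scenario Approach. The hour-of-day feature, kernel choice, and synthetic data are illustrative assumptions, not the paper's setup.

```python
# Minimal sketch: GP demand model + sampled demand scenarios for scenario-based stochastic MPC.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
hours = np.arange(24 * 14)                                   # two weeks of hourly history
demand = 10 + 3 * np.sin(2 * np.pi * hours / 24) + rng.normal(0, 0.5, hours.size)

X = (hours % 24).reshape(-1, 1).astype(float)                # hour-of-day feature (assumption)
gp = GaussianProcessRegressor(kernel=RBF(length_scale=3.0) + WhiteKernel(0.25),
                              normalize_y=True).fit(X, demand)

horizon = np.arange(24).reshape(-1, 1).astype(float)         # next-day prediction horizon
scenarios = gp.sample_y(horizon, n_samples=100, random_state=1)   # shape (24, 100)
print("scenario matrix:", scenarios.shape)
# Each column is one sampled multi-step demand trajectory that the scenario-approach
# MPC would enforce its chance constraints against.
```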
+
+ comment: 6 pages, 5 figures. Submitted to IFAC World Congress 2023 +
+
+
+
+
+ + ♻ ☆ Style Classification of Rabbinic Literature for Detection of Lost + Midrash Tanhuma Material + + +
+ Midrash collections are complex rabbinic works that consist of text in +multiple languages, which evolved through long processes of unstable oral and +written transmission. Determining the origin of a given passage in such a +compilation is not always straightforward and is often a matter of dispute +among scholars, yet it is essential for scholars' understanding of the passage +and its relationship to other texts in the rabbinic corpus. To help solve this +problem, we propose a system for classification of rabbinic literature based on +its style, leveraging recent advances in natural language processing for Hebrew +texts. Additionally, we demonstrate how this method can be applied to uncover +lost material from a specific midrash genre, Tan\d{h}uma-Yelammedenu, that has +been preserved in later anthologies. + +
+
+
+
+
+ + ♻ ☆ RED-PSM: Regularization by Denoising of Partially Separable Models for + Dynamic Imaging + + +
+ Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at
+each time instant using its undersampled measurements. In particular, in the
+case of dynamic tomography, only a single projection at a single view angle may
+be available at a time, making the problem severely ill-posed. In this work, we
+propose an approach, RED-PSM, which combines for the first time two powerful
+techniques to address this challenging imaging problem. The first is partially
+separable models, which have been used to efficiently introduce a low-rank
+prior for the spatio-temporal object. The second is the recent Regularization
+by Denoising (RED), which provides a flexible framework to exploit the
+impressive performance of state-of-the-art image denoising algorithms for
+various inverse problems. We propose a partially separable objective with RED
+and a computationally efficient and scalable optimization scheme with variable
+splitting and ADMM. Theoretical analysis proves the convergence of our
+objective to a value corresponding to a stationary point satisfying the
+first-order optimality conditions. Convergence is accelerated by a particular
+projection-domain-based initialization. We demonstrate the performance and
+computational improvements of our proposed RED-PSM with a learned image
+denoiser by comparing it to a recent deep-prior-based method known as TD-DIP.
+Although the main focus is on dynamic tomography, we also show the performance
+advantages of RED-PSM in a cardiac dynamic MRI setting.
+
+
+
+
+
+ + ♻ ☆ Nexus sine qua non: Essentially Connected Networks for Traffic + Forecasting + + +
+ Spatial-temporal graph neural networks (STGNNs) have become the de facto +models for learning spatiotemporal representations of traffic flow. However, +modern STGNNs often contain superfluous or obscure components, along with +complex techniques, posing significant challenges in terms of complexity and +scalability. Such concerns prompt us to rethink the design of neural +architectures and to identify the key challenges in traffic forecasting as +spatial-temporal contextualization. Here, we present an essentially connected +model based on an efficient message-passing backbone, powered by learnable node +embedding, without any complex sequential techniques such as TCNs, RNNs, and +Transformers. Intriguingly, empirical results demonstrate how a simple and +elegant model with contextualization capability compares favorably w.r.t. the +state-of-the-art with elaborate structures, while being much more interpretable +and computationally efficient for traffic forecasting. We anticipate that our +findings will open new horizons for further research to explore the possibility +of creating simple but effective neural forecasting architectures. + +
+
+
+
+
+ + ♻ ☆ Choosing Well Your Opponents: How to Guide the Synthesis of Programmatic + Strategies IJCAI + + +
+ This paper introduces Local Learner (2L), an algorithm for providing a set of +reference strategies to guide the search for programmatic strategies in +two-player zero-sum games. Previous learning algorithms, such as Iterated Best +Response (IBR), Fictitious Play (FP), and Double-Oracle (DO), can be +computationally expensive or miss important information for guiding search +algorithms. 2L actively selects a set of reference strategies to improve the +search signal. We empirically demonstrate the advantages of our approach while +guiding a local search algorithm for synthesizing strategies in three games, +including MicroRTS, a challenging real-time strategy game. Results show that 2L +learns reference strategies that provide a stronger search signal than IBR, FP, +and DO. We also simulate a tournament of MicroRTS, where a synthesizer using 2L +outperformed the winners of the two latest MicroRTS competitions, which were +programmatic strategies written by human programmers. + +
+
+ comment: International Joint Conference on Artificial Intelligence (IJCAI) + 2023 +
+
+
+
+
+ + ♻ ☆ Physics-aware Graph Neural Network for Accurate RNA 3D Structure + Prediction NeurIPS 2022 + + +
+ Biological functions of RNAs are determined by their three-dimensional (3D) +structures. Thus, given the limited number of experimentally determined RNA +structures, the prediction of RNA structures will facilitate elucidating RNA +functions and RNA-targeted drug discovery, but remains a challenging task. In +this work, we propose a Graph Neural Network (GNN)-based scoring function +trained only with the atomic types and coordinates on limited solved RNA 3D +structures for distinguishing accurate structural models. The proposed +Physics-aware Multiplex Graph Neural Network (PaxNet) separately models the +local and non-local interactions inspired by molecular mechanics. Furthermore, +PaxNet contains an attention-based fusion module that learns the individual +contribution of each interaction type for the final prediction. We rigorously +evaluate the performance of PaxNet on two benchmarks and compare it with +several state-of-the-art baselines. The results show that PaxNet significantly +outperforms all the baselines overall, and demonstrate the potential of PaxNet +for improving the 3D structure modeling of RNA and other macromolecules. Our +code is available at https://github.com/zetayue/Physics-aware-Multiplex-GNN. + +
+
+ comment: Accepted by the Machine Learning for Structural Biology Workshop + (MLSB) at the 36th Conference on Neural Information Processing Systems + (NeurIPS 2022) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 26 + +
+
+
+ + ☆ On the Effectiveness of Offline RL for Dialogue Response Generation ICML 2023 + + +
+ A common training technique for language models is teacher forcing (TF). TF +attempts to match human language exactly, even though identical meanings can be +expressed in different ways. This motivates use of sequence-level objectives +for dialogue response generation. In this paper, we study the efficacy of +various offline reinforcement learning (RL) methods to maximize such +objectives. We present a comprehensive evaluation across multiple datasets, +models, and metrics. Offline RL shows a clear performance improvement over +teacher forcing while not inducing training instability or sacrificing +practical training budgets. + +
+
+ comment: Accepted at ICML 2023. 18 pages, 12 figures. Code available at + https://github.com/asappresearch/dialogue-offline-rl +
+
+
+
+
+ + ☆ Testing Hateful Speeches against Policies + + +
+ In recent years, many software systems have adopted AI techniques, especially
+deep learning techniques. Due to their black-box nature, AI-based systems have
+brought challenges to traceability, because AI system behaviors are based on
+models and data, whereas the requirements or policies are rules expressed in
+natural or programming language. To the best of our knowledge, there are only a
+limited number of studies on how AI and deep neural network-based systems
+behave with respect to rule-based requirements/policies. This experience paper
+examines deep neural network behaviors against rule-based requirements
+described in natural language policies. In particular, we focus on a case study
+that checks AI-based content moderation software against content moderation
+policies. First, using crowdsourcing, we collect natural language test cases
+that match each moderation policy, and we name this dataset HateModerate;
+second, using the test cases in HateModerate, we test the failure rates of
+state-of-the-art hate speech detection software, and we find that these models
+have high failure rates for certain policies; finally, since manual labeling is
+costly, we further propose an automated approach to augment HateModerate by
+finetuning OpenAI's large language models to automatically match new examples
+to policies. The dataset and code of this work can be found on our anonymous
+website: \url{https://sites.google.com/view/content-moderation-project}.
+
+
+
+
+
+ + ☆ CommonsenseVIS: Visualizing and Understanding Commonsense Reasoning + Capabilities of Natural Language Models IEEE VIS + + +
+ Recently, large pretrained language models have achieved compelling +performance on commonsense benchmarks. Nevertheless, it is unclear what +commonsense knowledge the models learn and whether they solely exploit spurious +patterns. Feature attributions are popular explainability techniques that +identify important input concepts for model outputs. However, commonsense +knowledge tends to be implicit and rarely explicitly presented in inputs. These +methods cannot infer models' implicit reasoning over mentioned concepts. We +present CommonsenseVIS, a visual explanatory system that utilizes external +commonsense knowledge bases to contextualize model behavior for commonsense +question-answering. Specifically, we extract relevant commonsense knowledge in +inputs as references to align model behavior with human knowledge. Our system +features multi-level visualization and interactive model probing and editing +for different concepts and their underlying relations. Through a user study, we +show that CommonsenseVIS helps NLP experts conduct a systematic and scalable +visual analysis of models' relational reasoning over concepts in different +situations. + +
+
+ comment: This paper is accepted by IEEE VIS, 2023. To appear in IEEE + Transactions on Visualization and Computer Graphics (IEEE TVCG). 14 pages, 11 + figures +
+
+
+
+
+ + ☆ In-Context Learning in Large Language Models Learns Label Relationships + but Is Not Conventional Learning + + +
+ The performance of Large Language Models (LLMs) on downstream tasks often
+improves significantly when examples of the input-label relationship are
+included in the context. However, there is currently no consensus about how
+this in-context learning (ICL) ability of LLMs works: for example, while Xie et
+al. (2021) liken ICL to a general-purpose learning algorithm, Min et al.
+(2022b) argue ICL does not even learn label relationships from in-context
+examples. In this paper, we study (1) how labels of in-context examples affect
+predictions, (2) how label relationships learned during pre-training interact
+with input-label examples provided in-context, and (3) how ICL aggregates label
+information across in-context examples. Our findings suggest that LLMs usually
+incorporate information from in-context labels, but that pre-training and
+in-context label relationships are treated differently, and that the model does
+not consider all in-context information equally. Our results give insights into
+understanding and aligning LLM behavior.
+
+
+
+
+
+ + ☆ Evaluating Emotional Nuances in Dialogue Summarization + + +
+ Automatic dialogue summarization is a well-established task that aims to
+identify the most important content from human conversations to create a short
+textual summary. Despite recent progress in the field, we show that most of the
+research has focused on summarizing factual information, leaving aside the
+affective content, which can nevertheless convey useful information for
+analysing, monitoring, or supporting human interactions. In this paper, we
+propose and evaluate a set of measures, $PEmo$, to quantify how much emotion is
+preserved in dialogue summaries. Results show that state-of-the-art
+summarization models do not preserve the emotional content of the summaries
+well. We also show that by reducing the training set to only emotional
+dialogues, the emotional content is better preserved in the generated
+summaries, while the most salient factual information is conserved.
+
+
+
+
+
+ + ☆ Early Prediction of Alzheimers Disease Leveraging Symptom Occurrences + from Longitudinal Electronic Health Records of US Military Veterans + + +
+ Early prediction of Alzheimer's disease (AD) is crucial for timely +intervention and treatment. This study aims to use machine learning approaches +to analyze longitudinal electronic health records (EHRs) of patients with AD +and identify signs and symptoms that can predict AD onset earlier. We used a +case-control design with longitudinal EHRs from the U.S. Department of Veterans +Affairs Veterans Health Administration (VHA) from 2004 to 2021. Cases were VHA +patients with AD diagnosed after 1/1/2016 based on ICD-10-CM codes, matched 1:9 +with controls by age, sex and clinical utilization with replacement. We used a +panel of AD-related keywords and their occurrences over time in a patient's +longitudinal EHRs as predictors for AD prediction with four machine learning +models. We performed subgroup analyses by age, sex, and race/ethnicity, and +validated the model in a hold-out and "unseen" VHA stations group. Model +discrimination, calibration, and other relevant metrics were reported for +predictions up to ten years before ICD-based diagnosis. The study population +included 16,701 cases and 39,097 matched controls. The average number of +AD-related keywords (e.g., "concentration", "speaking") per year increased +rapidly for cases as diagnosis approached, from around 10 to over 40, while +remaining flat at 10 for controls. The best model achieved high discriminative +accuracy (ROCAUC 0.997) for predictions using data from at least ten years +before ICD-based diagnoses. The model was well-calibrated (Hosmer-Lemeshow +goodness-of-fit p-value = 0.99) and consistent across subgroups of age, sex and +race/ethnicity, except for patients younger than 65 (ROCAUC 0.746). Machine +learning models using AD-related keywords identified from EHR notes can predict +future AD diagnoses, suggesting its potential use for identifying AD risk using +EHR notes, offering an affordable way for early screening on large population. + +
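A minimal sketch of the keyword-occurrence features is given below; the keyword list, note format, and choice of logistic regression are illustrative assumptions rather than the study's actual pipeline, which trained four machine learning models on VHA EHR data.

```python
# Minimal sketch: yearly AD-related keyword counts as features for a standard classifier.
# Keywords, note layout, and the tiny toy cohort are illustrative only.
import re
import numpy as np
from sklearn.linear_model import LogisticRegression

KEYWORDS = ["concentration", "speaking", "memory", "confusion"]   # example terms

def yearly_keyword_counts(notes_by_year):
    """notes_by_year: list (oldest year first) of lists of note texts -> flat feature vector."""
    feats = []
    for notes in notes_by_year:
        text = " ".join(notes).lower()
        feats.extend(len(re.findall(rf"\b{kw}\b", text)) for kw in KEYWORDS)
    return np.array(feats, dtype=float)

# Toy example: two patients, three years of notes each.
patients = [
    [["memory lapses noted"], ["trouble speaking"], ["poor concentration, confusion"]],
    [["annual physical, no issues"], ["knee pain"], ["follow-up, stable"]],
]
X = np.stack([yearly_keyword_counts(p) for p in patients])
y = np.array([1, 0])                                    # 1 = later AD diagnosis
clf = LogisticRegression().fit(X, y)
print(clf.predict_proba(X)[:, 1])
```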
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ X-CapsNet For Fake News Detection + + +
+ News consumption has increased significantly with the growing popularity and
+use of web-based forums and social media. This sets the stage for misinforming
+and confusing people. To help reduce the impact of misinformation on users'
+health-related decisions and other intents, machine learning models are needed
+to detect and combat fake news automatically. This paper proposes a novel
+transformer-based model called X-CapsNet that uses Capsule neural Networks
+(CapsNet). The model includes a CapsNet with a dynamic routing algorithm,
+parallelized with a size-based classifier, for detecting short and long fake
+news statements. We use two size-based classifiers: a Deep Convolutional Neural
+Network (DCNN) for detecting long fake news statements and a Multi-Layer
+Perceptron (MLP) for detecting short news statements. To address the difficulty
+of representing short news statements, we use indirect features created by
+concatenating the vector of news speaker profiles with a vector of the
+polarity, sentiment, and word counts of the news statements. To evaluate the
+proposed architecture, we use the Covid-19 and Liar datasets. The results, in
+terms of F1-score for the Covid-19 dataset and accuracy for the Liar dataset,
+show that the models perform better than state-of-the-art baselines.
+
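As a rough sketch of the indirect features for short statements, the snippet below concatenates a speaker-profile vector with polarity, subjectivity, and a word count; TextBlob and the example profile vector are assumptions standing in for whatever sentiment features and profile encoding the paper actually uses.

```python
# Minimal sketch of "indirect features": speaker profile + polarity/subjectivity + word count.
import numpy as np
from textblob import TextBlob

def indirect_features(statement: str, speaker_profile: np.ndarray) -> np.ndarray:
    blob = TextBlob(statement)
    text_feats = np.array([blob.sentiment.polarity,        # [-1, 1]
                           blob.sentiment.subjectivity,    # [0, 1]
                           float(len(statement.split()))]) # word count
    return np.concatenate([speaker_profile, text_feats])

profile = np.array([1.0, 0.0, 0.0, 3.0, 12.0])   # e.g. one-hot affiliation + history counts (illustrative)
print(indirect_features("The new vaccine alters human DNA.", profile))
```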
+
+
+
+
+ + ☆ Milimili. Collecting Parallel Data via Crowdsourcing + + +
+ We present a methodology for gathering a parallel corpus through +crowdsourcing, which is more cost-effective than hiring professional +translators, albeit at the expense of quality. Additionally, we have made +available experimental parallel data collected for Chechen-Russian and +Fula-English language pairs. + +
+
+
+
+
+ + ☆ Towards Automatic Boundary Detection for Human-AI Hybrid Essay in + Education + + +
+ Human-AI collaborative writing has been greatly facilitated with the help of +modern large language models (LLM), e.g., ChatGPT. While admitting the +convenience brought by technology advancement, educators also have concerns +that students might leverage LLM to partially complete their writing assignment +and pass off the human-AI hybrid text as their original work. Driven by such +concerns, in this study, we investigated the automatic detection of Human-AI +hybrid text in education, where we formalized the hybrid text detection as a +boundary detection problem, i.e., identifying the transition points between +human-written content and AI-generated content. We constructed a hybrid essay +dataset by partially removing sentences from the original student-written +essays and then instructing ChatGPT to fill in for the incomplete essays. Then +we proposed a two-step detection approach where we (1) Separated AI-generated +content from human-written content during the embedding learning process; and +(2) Calculated the distances between every two adjacent prototypes (a prototype +is the mean of a set of consecutive sentences from the hybrid text in the +embedding space) and assumed that the boundaries exist between the two +prototypes that have the furthest distance from each other. Through extensive +experiments, we summarized the following main findings: (1) The proposed +approach consistently outperformed the baseline methods across different +experiment settings; (2) The embedding learning process (i.e., step 1) can +significantly boost the performance of the proposed approach; (3) When +detecting boundaries for single-boundary hybrid essays, the performance of the +proposed approach could be enhanced by adopting a relatively large prototype +size, leading to a $22$\% improvement (against the second-best baseline method) +in the in-domain setting and an $18$\% improvement in the out-of-domain +setting. + +
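The prototype-distance step (step 2 above) can be sketched as follows; the off-the-shelf sentence encoder stands in for the embeddings learned in step 1, and the prototype size and example essay are assumptions for illustration.

```python
# Minimal sketch of step 2: prototypes = mean embeddings of consecutive sentences,
# boundary assumed between the two adjacent prototypes that are furthest apart.
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")   # generic encoder, not the step-1 model

def predict_boundary(sentences, proto_size=3):
    emb = encoder.encode(sentences)                                  # (n_sentences, dim)
    protos = [emb[i:i + proto_size].mean(axis=0)
              for i in range(0, len(sentences) - proto_size + 1, proto_size)]
    gaps = [np.linalg.norm(protos[i + 1] - protos[i]) for i in range(len(protos) - 1)]
    k = int(np.argmax(gaps))
    return (k + 1) * proto_size          # index of the first sentence after the predicted boundary

essay = ["I grew up near the coast.", "Storms were a normal part of life.",
         "My grandmother kept a log of every one.",
         "In conclusion, coastal communities demonstrate remarkable resilience.",
         "Furthermore, adaptive infrastructure plays a pivotal role.",
         "Ultimately, preparedness mitigates long-term socioeconomic impacts."]
print("predicted boundary before sentence", predict_boundary(essay))
```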
+
+ comment: 9 pages including references, 2 figures +
+
+
+
+
+ + ☆ Transformer-based Joint Source Channel Coding for Textual Semantic + Communication + + +
+ The Space-Air-Ground-Sea integrated network calls for more robust and secure +transmission techniques against jamming. In this paper, we propose a textual +semantic transmission framework for robust transmission, which utilizes the +advanced natural language processing techniques to model and encode sentences. +Specifically, the textual sentences are firstly split into tokens using +wordpiece algorithm, and are embedded to token vectors for semantic extraction +by Transformer-based encoder. The encoded data are quantized to a fixed length +binary sequence for transmission, where binary erasure, symmetric, and deletion +channels are considered for transmission. The received binary sequences are +further decoded by the transformer decoders into tokens used for sentence +reconstruction. Our proposed approach leverages the power of neural networks +and attention mechanism to provide reliable and efficient communication of +textual data in challenging wireless environments, and simulation results on +semantic similarity and bilingual evaluation understudy prove the superiority +of the proposed model in semantic transmission. + +
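The three channels considered for the quantized binary sequence are easy to simulate; the sketch below uses a random bit sequence rather than an actual Transformer encoding and is only meant to make the channel models concrete.

```python
# Minimal sketch of the binary erasure, symmetric, and deletion channels.
import numpy as np

rng = np.random.default_rng(0)
bits = rng.integers(0, 2, size=32)            # stand-in for a quantized encoder output

def erasure(b, p):            # each bit independently replaced by an erasure marker (-1)
    out = b.copy()
    out[rng.random(b.size) < p] = -1
    return out

def symmetric(b, p):          # each bit independently flipped
    flip = rng.random(b.size) < p
    return np.where(flip, 1 - b, b)

def deletion(b, p):           # each bit independently dropped, shortening the sequence
    return b[rng.random(b.size) >= p]

print(erasure(bits, 0.1))
print(symmetric(bits, 0.1))
print(deletion(bits, 0.1).size, "bits survive the deletion channel")
```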
+
+ comment: 6 pages, 5 figures. Accepted by IEEE/CIC ICCC 2023 +
+
+
+
+
+ + ☆ A meta learning scheme for fast accent domain expansion in Mandarin + speech recognition + + +
+ Spoken Mandarin shows significant variation across accents. Despite the high
+performance of Mandarin automatic speech recognition (ASR), accented ASR
+remains a challenging task. In this paper, we introduce meta-learning
+techniques for fast accent domain expansion in Mandarin speech recognition,
+which extends coverage to new accents without degrading the performance of
+standard Mandarin ASR. Meta-learning, or learning to learn, captures relations
+that generalize across multiple domains rather than overfitting to a single
+one, which is why we adopt it for the domain expansion task and expect it to
+improve performance on accent domain extension. We combine meta-learning with
+freezing of model parameters, which makes the recognition performance more
+stable across different conditions and speeds up training by about 20%. Our
+approach outperforms other methods by about 3% relative in the accent domain
+expansion task. Compared to the baseline model, it improves by 37% relative
+while performance on the Mandarin test set remains unchanged. In addition, the
+method also proves effective on a large amount of data, with a relative
+performance improvement of 4% on the accent test set.
+
+
+
+
+
+ + ☆ Exploring the Integration of Speech Separation and Recognition with + Self-Supervised Learning Representation SP + + +
+ Neural speech separation has made remarkable progress and its integration +with automatic speech recognition (ASR) is an important direction towards +realizing multi-speaker ASR. This work provides an insightful investigation of +speech separation in reverberant and noisy-reverberant scenarios as an ASR +front-end. In detail, we explore multi-channel separation methods, mask-based +beamforming and complex spectral mapping, as well as the best features to use +in the ASR back-end model. We employ the recent self-supervised learning +representation (SSLR) as a feature and improve the recognition performance from +the case with filterbank features. To further improve multi-speaker recognition +performance, we present a carefully designed training strategy for integrating +speech separation and recognition with SSLR. The proposed integration using +TF-GridNet-based complex spectral mapping and WavLM-based SSLR achieves a 2.5% +word error rate in reverberant WHAMR! test set, significantly outperforming an +existing mask-based MVDR beamforming and filterbank integration (28.9%). + +
+
+ comment: Accepted to IEEE WASPAA 2023 +
+
+
+
+
+ + ☆ FATRER: Full-Attention Topic Regularizer for Accurate and Robust + Conversational Emotion Recognition + + +
+ This paper concentrates on the understanding of interlocutors' emotions +evoked in conversational utterances. Previous studies in this literature mainly +focus on more accurate emotional predictions, while ignoring model robustness +when the local context is corrupted by adversarial attacks. To maintain +robustness while ensuring accuracy, we propose an emotion recognizer augmented +by a full-attention topic regularizer, which enables an emotion-related global +view when modeling the local context in a conversation. A joint topic modeling +strategy is introduced to implement regularization from both representation and +loss perspectives. To avoid over-regularization, we drop the constraints on +prior distributions that exist in traditional topic modeling and perform +probabilistic approximations based entirely on attention alignment. Experiments +show that our models obtain more favorable results than state-of-the-art +models, and gain convincing robustness under three types of adversarial +attacks. + +
+
+
+
+
+ + ♻ ☆ MenuCraft: Interactive Menu System Design with Large Language Models + + +
+ Menu system design is a challenging task involving many design options and +various human factors. For example, one crucial factor that designers need to +consider is the semantic and systematic relation of menu commands. However, +capturing these relations can be challenging due to limited available +resources. With the advancement of neural language models, large language +models can utilize their vast pre-existing knowledge in designing and refining +menu systems. In this paper, we propose MenuCraft, an AI-assisted designer for +menu design that enables collaboration between the designer and a dialogue +system to design menus. MenuCraft offers an interactive language-based menu +design tool that simplifies the menu design process and enables easy +customization of design options. MenuCraft supports a variety of interactions +through dialog that allows performing zero/few-shot learning. + +
+
+
+
+
+ + ♻ ☆ Neural Natural Language Processing for Long Texts: A Survey of the + State-of-the-Art + + +
+ The adoption of Deep Neural Networks (DNNs) has greatly benefited Natural +Language Processing (NLP) during the past decade. However, the demands of long +document analysis are quite different from those of shorter texts, while the +ever increasing size of documents uploaded on-line renders automated +understanding of lengthy texts a critical issue. Relevant applications include +automated Web mining, legal document review, medical records analysis, +financial reports analysis, contract management, environmental impact +assessment, news aggregation, etc. Despite the relatively recent development of +efficient algorithms for analyzing long documents, practical tools in this +field are currently flourishing. This article serves as an entry point into +this dynamic domain and aims to achieve two objectives. Firstly, it provides an +overview of the relevant neural building blocks, serving as a concise tutorial +for the field. Secondly, it offers a brief examination of the current +state-of-the-art in long document NLP, with a primary focus on two key tasks: +document classification and document summarization. Sentiment analysis for long +texts is also covered, since it is typically treated as a particular case of +document classification. Consequently, this article presents an introductory +exploration of document-level analysis, addressing the primary challenges, +concerns, and existing solutions. Finally, the article presents publicly +available annotated datasets that can facilitate further research in this area. + +
+
+ comment: 58 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Log-linear Guardedness and its Implications ACL 2023 + + +
+ Methods for erasing human-interpretable concepts from neural representations +that assume linearity have been found to be tractable and useful. However, the +impact of this removal on the behavior of downstream classifiers trained on the +modified representations is not fully understood. In this work, we formally +define the notion of log-linear guardedness as the inability of an adversary to +predict the concept directly from the representation, and study its +implications. We show that, in the binary case, under certain assumptions, a +downstream log-linear model cannot recover the erased concept. However, we +demonstrate that a multiclass log-linear model \emph{can} be constructed that +indirectly recovers the concept in some cases, pointing to the inherent +limitations of log-linear guardedness as a downstream bias mitigation +technique. These findings shed light on the theoretical limitations of linear +erasure methods and highlight the need for further research on the connections +between intrinsic and extrinsic bias in neural models. + +
+
+ comment: Accepted as a long paper in ACL 2023 +
+
+
+
+
+ + ♻ ☆ Comparing Apples to Apples: Generating Aspect-Aware Comparative + Sentences from User Reviews + + +
+ It is time-consuming to find the best product among many similar +alternatives. Comparative sentences can help to contrast one item from others +in a way that highlights important features of an item that stand out. Given +reviews of one or multiple items and relevant item features, we generate +comparative review sentences to aid users to find the best fit. Specifically, +our model consists of three successive components in a transformer: (i) an item +encoding module to encode an item for comparison, (ii) a comparison generation +module that generates comparative sentences in an autoregressive manner, (iii) +a novel decoding method for user personalization. We show that our pipeline +generates fluent and diverse comparative sentences. We run experiments on the +relevance and fidelity of our generated sentences in a human evaluation study +and find that our algorithm creates comparative review sentences that are +relevant and truthful. + +
+
+
+
+
+ + ♻ ☆ Investigating the Factual Knowledge Boundary of Large Language Models + with Retrieval Augmentation + + +
+ Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require
+a substantial amount of factual knowledge and often rely on external
+information for assistance. Recently, large language models (LLMs) (e.g.,
+ChatGPT) have demonstrated impressive prowess in solving a wide range of tasks
+with world knowledge, including knowledge-intensive tasks. However, it remains
+unclear how well LLMs are able to perceive their factual knowledge boundaries,
+particularly how they behave when incorporating retrieval augmentation. In this
+study, we present an initial analysis of the factual knowledge boundaries of
+LLMs and of how retrieval augmentation affects LLMs on open-domain QA.
+Specifically, we focus on three primary research questions and analyze them by
+examining the QA performance, a priori judgement, and a posteriori judgement of
+LLMs. We show evidence that LLMs possess unwavering confidence in their
+capability to respond to questions and in the accuracy of their responses.
+Furthermore, retrieval augmentation proves to be an effective approach for
+enhancing LLMs' awareness of knowledge boundaries, thereby improving their
+judgemental abilities. Additionally, we find that LLMs have a propensity to
+rely on the provided retrieval results when formulating answers, and that the
+quality of these results significantly impacts this reliance. The code to
+reproduce this work is available at
+https://github.com/RUCAIBox/LLM-Knowledge-Boundary.
+
+
+
+
+
+ + ♻ ☆ LAnoBERT: System Log Anomaly Detection based on BERT Masked Language + Model + + +
+ System logs are large-scale data collected continuously from a computer system
+and serve as the basic data for detecting errors, intrusions, and abnormal
+behaviors. The aim of system log anomaly detection is to promptly identify
+anomalies while minimizing human intervention, which is a critical problem in
+industry. Previous studies performed anomaly detection with algorithms applied
+after converting the various forms of log data into a standardized template
+using a parser. In particular, a template corresponding to each specific event
+must be defined in advance for all the log data, a process in which information
+contained in the log key may be lost. In this study, we propose LAnoBERT, a
+parser-free system log anomaly detection method that uses the BERT model, which
+exhibits excellent natural language processing performance. LAnoBERT learns the
+model through masked language modeling, a BERT-based pre-training method, and
+performs unsupervised anomaly detection using the masked language modeling loss
+per log key at test time. In addition, we propose an efficient inference
+process to establish a pipeline that is practically applicable to real systems.
+Experiments on three well-known log datasets, i.e., HDFS, BGL, and Thunderbird,
+show that LAnoBERT not only yields higher anomaly detection performance than
+unsupervised benchmark models, but also achieves performance comparable to
+supervised benchmark models.
+
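A minimal sketch of masked-language-modeling-loss scoring is shown below; it uses an off-the-shelf BERT and scores a whole log line, whereas the paper first adapts the model on normal logs and aggregates the loss per log key, so this only illustrates the scoring idea.

```python
# Minimal sketch: score a log line by its average MLM loss; higher loss = less expected = more anomalous.
import torch
from transformers import BertForMaskedLM, BertTokenizerFast

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
mlm = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()

@torch.no_grad()
def anomaly_score(log_line: str) -> float:
    ids = tok(log_line, return_tensors="pt").input_ids[0]
    losses = []
    for i in range(1, ids.size(0) - 1):                 # skip [CLS]/[SEP]
        masked = ids.clone()
        masked[i] = tok.mask_token_id
        labels = torch.full_like(ids, -100)             # ignore every position except i
        labels[i] = ids[i]
        out = mlm(masked.unsqueeze(0), labels=labels.unsqueeze(0))
        losses.append(out.loss.item())
    return sum(losses) / len(losses)

print(anomaly_score("Received block blk_123 of size 67108864 from 10.250.10.6"))
```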
+
+
+
+
+ + ♻ ☆ ELVIS: Empowering Locality of Vision Language Pre-training with + Intra-modal Similarity + + +
+ Deep learning has shown great potential in assisting radiologists in reading +chest X-ray (CXR) images, but its need for expensive annotations for improving +performance prevents widespread clinical application. Visual language +pre-training (VLP) can alleviate the burden and cost of annotation by +leveraging routinely generated reports for radiographs, which exist in large +quantities as well as in paired form (image-text pairs). Additionally, +extensions to localization-aware VLPs are being proposed to address the needs +for accurate localization of abnormalities for computer-aided diagnosis (CAD) +in CXR. However, we find that the formulation proposed by locality-aware VLP +literature actually leads to a loss in spatial relationships required for +downstream localization tasks. Therefore, we propose Empowering Locality of VLP +with Intra-modal Similarity, ELVIS, a VLP aware of intra-modal locality, to +better preserve the locality within radiographs or reports, which enhances the +ability to comprehend location references in text reports. Our locality-aware +VLP method significantly outperforms state-of-the art baselines in multiple +segmentation tasks and the MS-CXR phrase grounding task. Qualitatively, we show +that ELVIS focuses well on regions of interest described in the report text +compared to prior approaches, allowing for enhanced interpretability. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ DSTEA: Improving Dialogue State Tracking via Entity Adaptive + Pre-training + + +
+ Dialogue State Tracking (DST) is critical for comprehensively interpreting
+user and system utterances, thereby forming the cornerstone of efficient
+dialogue systems. Although past research efforts have focused on enhancing DST
+performance through alterations to the model structure or by integrating
+additional features like graph relations, these approaches often require
+additional pre-training with external dialogue corpora. In this study, we
+propose DSTEA, improving Dialogue State Tracking via Entity Adaptive
+pre-training, which enhances the encoder by intensively training on key
+entities in dialogue utterances. DSTEA identifies these pivotal entities from
+input dialogues using four different methods: ontology information,
+named-entity recognition, the spaCy library, and the flair library.
+Subsequently, it employs selective knowledge masking to train the model
+effectively. Remarkably, DSTEA only requires pre-training, without the direct
+infusion of extra knowledge into the DST model. This approach resulted in
+substantial performance improvements for four robust DST models on MultiWOZ
+2.0, 2.1, and 2.2, with joint goal accuracy increasing by up to 2.69% (from
+52.41% to 55.10%). Further validation of DSTEA's efficacy was provided through
+comparative experiments considering various entity types and different entity
+adaptive pre-training configurations such as masking strategy and masking rate.
+
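A hedged sketch of entity-adaptive masking with spaCy NER is given below; the masking rates and the whole-word masking scheme are assumptions for illustration, not the configurations studied in the paper.

```python
# Minimal sketch: tokens inside named entities are masked more aggressively than other tokens
# when building MLM pre-training examples. Assumes the small English spaCy model is installed.
import random
import spacy
from transformers import BertTokenizerFast

nlp = spacy.load("en_core_web_sm")
tok = BertTokenizerFast.from_pretrained("bert-base-uncased")

def entity_adaptive_mask(utterance, entity_rate=0.5, base_rate=0.15):
    doc = nlp(utterance)
    entity_words = {t.text.lower() for ent in doc.ents for t in ent}
    masked = []
    for w in utterance.split():
        rate = entity_rate if w.lower().strip(".,") in entity_words else base_rate
        masked.append(tok.mask_token if random.random() < rate else w)
    return " ".join(masked)

print(entity_adaptive_mask("I need a cheap hotel in Cambridge for two nights from Friday."))
```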
+
+
+
+
+ + ♻ ☆ SentimentGPT: Exploiting GPT for Advanced Sentiment Analysis and its + Departure from Current Machine Learning + + +
+ This study presents a thorough examination of various Generative Pretrained +Transformer (GPT) methodologies in sentiment analysis, specifically in the +context of Task 4 on the SemEval 2017 dataset. Three primary strategies are +employed: 1) prompt engineering using the advanced GPT-3.5 Turbo, 2) +fine-tuning GPT models, and 3) an inventive approach to embedding +classification. The research yields detailed comparative insights among these +strategies and individual GPT models, revealing their unique strengths and +potential limitations. Additionally, the study compares these GPT-based +methodologies with other current, high-performing models previously used with +the same dataset. The results illustrate the significant superiority of the GPT +approaches in terms of predictive performance, more than 22\% in F1-score +compared to the state-of-the-art. Further, the paper sheds light on common +challenges in sentiment analysis tasks, such as understanding context and +detecting sarcasm. It underscores the enhanced capabilities of the GPT models +to effectively handle these complexities. Taken together, these findings +highlight the promising potential of GPT models in sentiment analysis, setting +the stage for future research in this field. The code can be found at +https://github.com/DSAatUSU/SentimentGPT + +
+
+
+
+
+ + ♻ ☆ LaunchpadGPT: Language Model as Music Visualization Designer on + Launchpad + + +
+ Launchpad is a musical instrument that allows users to create and perform +music by pressing illuminated buttons. To assist and inspire the design of the +Launchpad light effect, and provide a more accessible approach for beginners to +create music visualization with this instrument, we proposed the LaunchpadGPT +model to generate music visualization designs on Launchpad automatically. Based +on the language model with excellent generation ability, our proposed +LaunchpadGPT takes an audio piece of music as input and outputs the lighting +effects of Launchpad-playing in the form of a video (Launchpad-playing video). +We collect Launchpad-playing videos and process them to obtain music and +corresponding video frame of Launchpad-playing as prompt-completion pairs, to +train the language model. The experiment result shows the proposed method can +create better music visualization than random generation methods and hold the +potential for a broader range of music visualization applications. Our code is +available at https://github.com/yunlong10/LaunchpadGPT/. + +
+
+ comment: Accepted by International Computer Music Conference (ICMC) 2023 +
+
+
+
+
+ + ♻ ☆ MGR: Multi-generator Based Rationalization ACL 2023 + + +
+ Rationalization is to employ a generator and a predictor to construct a +self-explaining NLP model in which the generator selects a subset of +human-intelligible pieces of the input text to the following predictor. +However, rationalization suffers from two key challenges, i.e., spurious +correlation and degeneration, where the predictor overfits the spurious or +meaningless pieces solely selected by the not-yet well-trained generator and in +turn deteriorates the generator. Although many studies have been proposed to +address the two challenges, they are usually designed separately and do not +take both of them into account. In this paper, we propose a simple yet +effective method named MGR to simultaneously solve the two problems. The key +idea of MGR is to employ multiple generators such that the occurrence stability +of real pieces is improved and more meaningful pieces are delivered to the +predictor. Empirically, we show that MGR improves the F1 score by up to 20.9% +as compared to state-of-the-art methods. Codes are available at +https://github.com/jugechengzi/Rationalization-MGR . + +
+
+ comment: ACL 2023, oral presentation. Fixed some typos and clarified some + implementation details. arXiv admin note: text overlap with arXiv:2209.08285 +
+
+
+
+
+ + ♻ ☆ Syllable Discovery and Cross-Lingual Generalization in a Visually + Grounded, Self-Supervised Speech Model + + +
+ In this paper, we show that representations capturing syllabic units emerge +when training a self-supervised speech model with a visually-grounded training +objective. We demonstrate that a nearly identical model architecture (HuBERT) +trained with a masked language modeling loss does not exhibit this same +ability, suggesting that the visual grounding objective is responsible for the +emergence of this phenomenon. We propose the use of a minimum cut algorithm to +automatically predict syllable boundaries in speech, followed by a 2-stage +clustering method to group identical syllables together. We show that our model +not only outperforms a state-of-the-art syllabic segmentation method on the +language it was trained on (English), but also generalizes in a zero-shot +fashion to Estonian. Finally, we show that the same model is capable of +zero-shot generalization for a word segmentation task on 4 other languages from +the Zerospeech Challenge, in some cases beating the previous state-of-the-art. + +
+
+ comment: Interspeech 2023. Code & Model: + https://github.com/jasonppy/syllable-discovery +
+
+
+
+
+ + ♻ ☆ DetectGPT: Zero-Shot Machine-Generated Text Detection using Probability + Curvature ICML 2023 + + +
+ The increasing fluency and widespread usage of large language models (LLMs) +highlight the desirability of corresponding tools aiding detection of +LLM-generated text. In this paper, we identify a property of the structure of +an LLM's probability function that is useful for such detection. Specifically, +we demonstrate that text sampled from an LLM tends to occupy negative curvature +regions of the model's log probability function. Leveraging this observation, +we then define a new curvature-based criterion for judging if a passage is +generated from a given LLM. This approach, which we call DetectGPT, does not +require training a separate classifier, collecting a dataset of real or +generated passages, or explicitly watermarking generated text. It uses only log +probabilities computed by the model of interest and random perturbations of the +passage from another generic pre-trained language model (e.g., T5). We find +DetectGPT is more discriminative than existing zero-shot methods for model +sample detection, notably improving detection of fake news articles generated +by 20B parameter GPT-NeoX from 0.81 AUROC for the strongest zero-shot baseline +to 0.95 AUROC for DetectGPT. See https://ericmitchell.ai/detectgpt for code, +data, and other project information. + +
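+
+ The curvature criterion described above lends itself to a compact sketch: score
+a passage by its log-likelihood under the model of interest minus the mean
+log-likelihood of perturbed copies. The perturbation function below is a crude
+word-dropout placeholder (the paper perturbs with T5 mask-filling) and the model
+name is illustrative, not the authors' implementation:
+
+   # Sketch of a DetectGPT-style curvature score: log p(x) minus the average
+   # log p of perturbed copies of x. Word dropout stands in for T5 mask-filling.
+   import random
+   import torch
+   from transformers import AutoModelForCausalLM, AutoTokenizer
+
+   tok = AutoTokenizer.from_pretrained("gpt2")            # stand-in for the model of interest
+   lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+   @torch.no_grad()
+   def log_likelihood(text: str) -> float:
+       ids = tok(text, return_tensors="pt").input_ids
+       out = lm(ids, labels=ids)                          # out.loss is the mean token NLL
+       return -out.loss.item() * (ids.shape[1] - 1)       # total log-likelihood
+
+   def perturb(text: str, drop: float = 0.15) -> str:     # placeholder for T5 mask-filling
+       words = text.split()
+       return " ".join(w for w in words if random.random() > drop) or text
+
+   def detectgpt_score(text: str, n_perturb: int = 20) -> float:
+       base = log_likelihood(text)
+       perturbed = [log_likelihood(perturb(text)) for _ in range(n_perturb)]
+       return base - sum(perturbed) / len(perturbed)      # large score -> likely machine text
+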
+
+ comment: ICML 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 46 + +
+
+
+ + ☆ ProtoFL: Unsupervised Federated Learning via Prototypical Distillation ICCV 2023 + + +
+ Federated learning (FL) is a promising approach for enhancing data privacy +preservation, particularly for authentication systems. However, limited round +communications, scarce representation, and scalability pose significant +challenges to its deployment, hindering its full potential. In this paper, we +propose 'ProtoFL', Prototypical Representation Distillation based unsupervised +Federated Learning to enhance the representation power of a global model and +reduce round communication costs. Additionally, we introduce a local one-class +classifier based on normalizing flows to improve performance with limited data. +Our study represents the first investigation of using FL to improve one-class +classification performance. We conduct extensive experiments on five widely +used benchmarks, namely MNIST, CIFAR-10, CIFAR-100, ImageNet-30, and +Keystroke-Dynamics, to demonstrate the superior performance of our proposed +framework over previous methods in the literature. + +
+
+ comment: Accepted by ICCV 2023. Hansol Kim and Youngjun Kwak contributed + equally to this work +
+
+
+
+
+ + ☆ EnTri: Ensemble Learning with Tri-level Representations for Explainable + Scene Recognition + + +
+ Scene recognition based on deep-learning has made significant progress, but +there are still limitations in its performance due to challenges posed by +inter-class similarities and intra-class dissimilarities. Furthermore, prior +research has primarily focused on improving classification accuracy, yet it has +given less attention to achieving interpretable, precise scene classification. +Therefore, we are motivated to propose EnTri, an ensemble scene recognition +framework that employs ensemble learning using a hierarchy of visual features. +EnTri represents features at three distinct levels of detail: pixel-level, +semantic segmentation-level, and object class and frequency level. By +incorporating distinct feature encoding schemes of differing complexity and +leveraging ensemble strategies, our approach aims to improve classification +accuracy while enhancing transparency and interpretability via visual and +textual explanations. To achieve interpretability, we devised an extension +algorithm that generates both visual and textual explanations highlighting +various properties of a given scene that contribute to the final prediction of +its category. This includes information about objects, statistics, spatial +layout, and textural details. Through experiments on benchmark scene +classification datasets, EnTri has demonstrated superiority in terms of +recognition accuracy, achieving competitive performance compared to +state-of-the-art approaches, with an accuracy of 87.69%, 75.56%, and 99.17% on +the MIT67, SUN397, and UIUC8 datasets, respectively. + +
+
+ comment: Submitted to Pattern Recognition journal +
+
+
+
+
+ + ☆ SwIPE: Efficient and Robust Medical Image Segmentation with Implicit + Patch Embeddings MICCAI'23 + + +
+ Modern medical image segmentation methods primarily use discrete +representations in the form of rasterized masks to learn features and generate +predictions. Although effective, this paradigm is spatially inflexible, scales +poorly to higher-resolution images, and lacks direct understanding of object +shapes. To address these limitations, some recent works utilized implicit +neural representations (INRs) to learn continuous representations for +segmentation. However, these methods often directly adopted components designed +for 3D shape reconstruction. More importantly, these formulations were also +constrained to either point-based or global contexts, lacking contextual +understanding or local fine-grained details, respectively--both critical for +accurate segmentation. To remedy this, we propose a novel approach, SwIPE +(Segmentation with Implicit Patch Embeddings), that leverages the advantages of +INRs and predicts shapes at the patch level--rather than at the point level or +image level--to enable both accurate local boundary delineation and global +shape coherence. Extensive evaluations on two tasks (2D polyp segmentation and +3D abdominal organ segmentation) show that SwIPE significantly improves over +recent implicit approaches and outperforms state-of-the-art discrete methods +with over 10x fewer parameters. Our method also demonstrates superior data +efficiency and improved robustness to data shifts across image resolutions and +datasets. Code is available on Github. + +
+
+ comment: Accepted to 2023 International Conference on Medical Image Computing + and Computer Assisted Intervention (MICCAI'23) +
+
+
+
+
+ + ☆ Augmented Box Replay: Overcoming Foreground Shift for Incremental Object + Detection + + +
+ In incremental learning, replaying stored samples from previous tasks +together with current task samples is one of the most efficient approaches to +address catastrophic forgetting. However, unlike incremental classification, +image replay has not been successfully applied to incremental object detection +(IOD). In this paper, we identify the overlooked problem of foreground shift as +the main reason for this. Foreground shift only occurs when replaying images of +previous tasks and refers to the fact that their background might contain +foreground objects of the current task. To overcome this problem, a novel and +efficient Augmented Box Replay (ABR) method is developed that only stores and +replays foreground objects and thereby circumvents the foreground shift +problem. In addition, we propose an innovative Attentive RoI Distillation loss +that uses spatial attention from region-of-interest (RoI) features to constrain +current model to focus on the most important information from old model. ABR +significantly reduces forgetting of previous classes while maintaining high +plasticity in current classes. Moreover, it considerably reduces the storage +requirements when compared to standard image replay. Comprehensive experiments +on Pascal-VOC and COCO datasets support the state-of-the-art performance of our +model. + +
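+
+ The core of the box-replay idea can be sketched as a paste augmentation: only
+foreground crops from previous tasks are stored and pasted onto current-task
+images, so the replayed content cannot reintroduce backgrounds that contain
+unlabeled current-task objects. The uniform placement policy and the lack of
+blending below are simplifying assumptions, not the authors' ABR implementation:
+
+   # Sketch of replaying stored foreground boxes by pasting them into a
+   # current-task image; placement and blending are simplified assumptions.
+   import numpy as np
+
+   def paste_replay_boxes(image, gt_boxes, gt_labels, stored_objects, rng=None):
+       """image: HxWx3 current-task sample; gt_boxes/gt_labels: its annotations;
+       stored_objects: list of (crop, label) foreground patches from old tasks."""
+       rng = rng or np.random.default_rng()
+       h, w = image.shape[:2]
+       out = image.copy()
+       boxes, labels = list(gt_boxes), list(gt_labels)
+       for crop, label in stored_objects:
+           ch, cw = crop.shape[:2]
+           if ch >= h or cw >= w:
+               continue                               # skip crops that do not fit
+           y, x = rng.integers(0, h - ch), rng.integers(0, w - cw)
+           out[y:y + ch, x:x + cw] = crop             # paste an old-task foreground object
+           boxes.append([x, y, x + cw, y + ch])
+           labels.append(label)
+       return out, boxes, labels
+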
+
+
+
+
+ + ☆ TransNet: Transparent Object Manipulation Through Category-Level Pose + Estimation + + +
+ Transparent objects present multiple distinct challenges to visual perception +systems. First, their lack of distinguishing visual features makes transparent +objects harder to detect and localize than opaque objects. Even humans find +certain transparent surfaces with little specular reflection or refraction, +like glass doors, difficult to perceive. A second challenge is that depth +sensors typically used for opaque object perception cannot obtain accurate +depth measurements on transparent surfaces due to their unique reflective +properties. Stemming from these challenges, we observe that transparent object +instances within the same category, such as cups, look more similar to each +other than to ordinary opaque objects of that same category. Given this +observation, the present paper explores the possibility of category-level +transparent object pose estimation rather than instance-level pose estimation. +We propose \textit{\textbf{TransNet}}, a two-stage pipeline that estimates +category-level transparent object pose using localized depth completion and +surface normal estimation. TransNet is evaluated in terms of pose estimation +accuracy on a large-scale transparent object dataset and compared to a +state-of-the-art category-level pose estimation approach. Results from this +comparison demonstrate that TransNet achieves improved pose estimation accuracy +on transparent objects. Moreover, we use TransNet to build an autonomous +transparent object manipulation system for robotic pick-and-place and pouring +tasks. + +
+
+
+
+
+ + ☆ Iterative Robust Visual Grounding with Masked Reference based + Centerpoint Supervision + + +
+ Visual Grounding (VG) aims at localizing target objects from an image based
+on given expressions and has made significant progress with the development of
+detection and vision transformer. However, existing VG methods tend to generate
+false-alarm objects when presented with inaccurate or irrelevant descriptions,
+which commonly occur in practical applications. Moreover, existing methods fail
+to capture fine-grained features, accurate localization, and sufficient context
+comprehension from the whole image and textual descriptions. To address both
+issues, we propose an Iterative Robust Visual Grounding (IR-VG) framework with
+Masked Reference based Centerpoint Supervision (MRCS). The framework introduces
+iterative multi-level vision-language fusion (IMVF) for better alignment. We
+use MRCS to achieve more accurate localization with point-wise feature
+supervision. Then, to improve the robustness of VG, we also present a
+multi-stage false-alarm sensitive decoder (MFSD) to prevent the generation of
+false-alarm objects when presented with inaccurate expressions. The proposed
+framework is evaluated on five regular VG datasets and two newly constructed
+robust VG datasets. Extensive experiments demonstrate that IR-VG achieves new
+state-of-the-art (SOTA) results, with improvements of 25\% and 10\% compared to
+existing SOTA approaches on the two newly proposed robust VG datasets.
+Moreover, the proposed framework is also verified effective on five regular VG
+datasets. Codes and models will be made publicly available at
+https://github.com/cv516Buaa/IR-VG.
+
+
+
+
+
+ + ☆ ComPtr: Towards Diverse Bi-source Dense Prediction Tasks via A Simple + yet General Complementary Transformer + + +
+ Deep learning (DL) has advanced the field of dense prediction, while +gradually dissolving the inherent barriers between different tasks. However, +most existing works focus on designing architectures and constructing visual +cues only for the specific task, which ignores the potential uniformity +introduced by the DL paradigm. In this paper, we attempt to construct a novel +\underline{ComP}lementary \underline{tr}ansformer, \textbf{ComPtr}, for diverse +bi-source dense prediction tasks. Specifically, unlike existing methods that +over-specialize in a single task or a subset of tasks, ComPtr starts from the +more general concept of bi-source dense prediction. Based on the basic +dependence on information complementarity, we propose consistency enhancement +and difference awareness components with which ComPtr can evacuate and collect +important visual semantic cues from different image sources for diverse tasks, +respectively. ComPtr treats different inputs equally and builds an efficient +dense interaction model in the form of sequence-to-sequence on top of the +transformer. This task-generic design provides a smooth foundation for +constructing the unified model that can simultaneously deal with various +bi-source information. In extensive experiments across several representative +vision tasks, i.e. remote sensing change detection, RGB-T crowd counting, +RGB-D/T salient object detection, and RGB-D semantic segmentation, the proposed +method consistently obtains favorable performance. The code will be available +at \url{https://github.com/lartpang/ComPtr}. + +
+
+
+
+
+ + ☆ ResShift: Efficient Diffusion Model for Image Super-resolution by + Residual Shifting + + +
+ Diffusion-based image super-resolution (SR) methods are mainly limited by the +low inference speed due to the requirements of hundreds or even thousands of +sampling steps. Existing acceleration sampling techniques inevitably sacrifice +performance to some extent, leading to over-blurry SR results. To address this +issue, we propose a novel and efficient diffusion model for SR that +significantly reduces the number of diffusion steps, thereby eliminating the +need for post-acceleration during inference and its associated performance +deterioration. Our method constructs a Markov chain that transfers between the +high-resolution image and the low-resolution image by shifting the residual +between them, substantially improving the transition efficiency. Additionally, +an elaborate noise schedule is developed to flexibly control the shifting speed +and the noise strength during the diffusion process. Extensive experiments +demonstrate that the proposed method obtains superior or at least comparable +performance to current state-of-the-art methods on both synthetic and +real-world datasets, even only with 15 sampling steps. Our code and model are +available at https://github.com/zsyOAOA/ResShift. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Right for the Wrong Reason: Can Interpretable ML Techniques Detect + Spurious Correlations? + + +
+ While deep neural network models offer unmatched classification performance, +they are prone to learning spurious correlations in the data. Such dependencies +on confounding information can be difficult to detect using performance metrics +if the test data comes from the same distribution as the training data. +Interpretable ML methods such as post-hoc explanations or inherently +interpretable classifiers promise to identify faulty model reasoning. However, +there is mixed evidence whether many of these techniques are actually able to +do so. In this paper, we propose a rigorous evaluation strategy to assess an +explanation technique's ability to correctly identify spurious correlations. +Using this strategy, we evaluate five post-hoc explanation techniques and one +inherently interpretable method for their ability to detect three types of +artificially added confounders in a chest x-ray diagnosis task. We find that +the post-hoc technique SHAP, as well as the inherently interpretable Attri-Net +provide the best performance and can be used to reliably identify faulty model +behavior. + +
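+
+ One way to set up such an evaluation is to plant an artificial confounder that
+is perfectly correlated with the label and then check whether the explanation
+attributes the prediction to it. The corner-tag confounder below is only a
+generic illustration; the abstract does not spell out the paper's three
+confounder types, and the tag size and intensity are assumptions:
+
+   # Sketch of planting one artificial confounder: a bright tag added to every
+   # positive-class image, so a faithful explanation should highlight that corner.
+   import numpy as np
+
+   def add_corner_tag(image: np.ndarray, label: int, size: int = 12, value: float = 1.0):
+       out = image.copy()
+       if label == 1:                      # confounder perfectly correlated with the label
+           out[:size, :size] = value       # top-left square tag
+       return out
+
+   def build_confounded_dataset(images, labels):
+       return (np.stack([add_corner_tag(x, y) for x, y in zip(images, labels)]),
+               np.asarray(labels))
+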
+
+
+
+
+ + ☆ Towards Generic and Controllable Attacks Against Object Detection + + +
+ Existing adversarial attacks against Object Detectors (ODs) suffer from two +inherent limitations. Firstly, ODs have complicated meta-structure designs, +hence most advanced attacks for ODs concentrate on attacking specific +detector-intrinsic structures, which makes it hard for them to work on other +detectors and motivates us to design a generic attack against ODs. Secondly, +most works against ODs make Adversarial Examples (AEs) by generalizing +image-level attacks from classification to detection, which brings redundant +computations and perturbations in semantically meaningless areas (e.g., +backgrounds) and leads to an emergency for seeking controllable attacks for +ODs. To this end, we propose a generic white-box attack, LGP (local +perturbations with adaptively global attacks), to blind mainstream object +detectors with controllable perturbations. For a detector-agnostic attack, LGP +tracks high-quality proposals and optimizes three heterogeneous losses +simultaneously. In this way, we can fool the crucial components of ODs with a +part of their outputs without the limitations of specific structures. Regarding +controllability, we establish an object-wise constraint that exploits +foreground-background separation adaptively to induce the attachment of +perturbations to foregrounds. Experimentally, the proposed LGP successfully +attacked sixteen state-of-the-art object detectors on MS-COCO and DOTA +datasets, with promising imperceptibility and transferability obtained. Codes +are publicly released in https://github.com/liguopeng0923/LGP.git + +
+
+
+
+
+ + ☆ Rapid detection of soil carbonates by means of NIR spectroscopy, deep + learning methods and phase quantification by powder Xray diffraction + + +
+ Soil NIR spectral absorbance/reflectance libraries are utilized towards
+improving agricultural production and analysis of soil properties which are a key
+prerequisite for agroecological balance and environmental sustainability.
+Carbonates, in particular, represent a soil property which is mostly affected
+even by mild, let alone extreme, changes of environmental conditions during
+climate change. In this study we propose a rapid and efficient way to predict
+carbonates content in soil by means of FT NIR reflectance spectroscopy and by
+use of deep learning methods. We exploited multiple machine learning methods,
+such as: 1) a MLP Regressor and 2) a CNN and compare their performance with
+other traditional ML algorithms such as PLSR, Cubist and SVM on the combined
+dataset of two NIR spectral libraries: KSSL (USDA), a dataset of soil samples
+reflectance spectra collected nationwide, and LUCAS TopSoil (European Soil
+Library) which contains soil sample absorbance spectra from all over the
+European Union, and use them to predict carbonate content on never before seen
+soil samples. Soil samples in KSSL and in TopSoil spectral libraries were
+acquired in the spectral region of visNIR, however in this study, only the NIR
+spectral region was utilized. Quantification of carbonates by means of X-ray
+diffraction is in good agreement with the volumetric method and the MLP
+prediction. Our work contributes to rapid carbonates content prediction in soil
+samples in cases where: 1) no volumetric method is available and 2) only NIR
+spectra absorbance data are available. Up till now and to the best of our
+knowledge, there exists no other study that presents a prediction model
+trained on such an extensive dataset with such promising results on unseen
+data, undoubtedly supporting the notion that deep learning models present
+excellent prediction tools for soil carbonates content.
+
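+
+ As a rough illustration of the model comparison described above, an MLP
+regressor and PLSR can be fitted to NIR absorbance spectra with scikit-learn;
+the file names, hyperparameters, and preprocessing below are placeholders, not
+the study's actual configuration:
+
+   # Sketch comparing an MLP regressor with PLSR for carbonate-content prediction
+   # from NIR spectra; data files and hyperparameters are illustrative only.
+   import numpy as np
+   from sklearn.cross_decomposition import PLSRegression
+   from sklearn.metrics import r2_score
+   from sklearn.model_selection import train_test_split
+   from sklearn.neural_network import MLPRegressor
+   from sklearn.pipeline import make_pipeline
+   from sklearn.preprocessing import StandardScaler
+
+   # X: (n_samples, n_wavelengths) NIR absorbance spectra, y: carbonate content
+   X, y = np.load("nir_spectra.npy"), np.load("carbonates.npy")   # hypothetical files
+   X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+   models = {
+       "PLSR": PLSRegression(n_components=20),
+       "MLP": make_pipeline(StandardScaler(),
+                            MLPRegressor(hidden_layer_sizes=(256, 128), max_iter=500)),
+   }
+   for name, model in models.items():
+       model.fit(X_tr, y_tr)
+       print(name, "R2 =", r2_score(y_te, np.ravel(model.predict(X_te))))
+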
+
+ comment: 39 pages, 5 figures +
+
+
+
+
+ + ☆ Learning Navigational Visual Representations with Semantic Map + Supervision + + +
+ Being able to perceive the semantics and the spatial structure of the +environment is essential for visual navigation of a household robot. However, +most existing works only employ visual backbones pre-trained either with +independent images for classification or with self-supervised learning methods +to adapt to the indoor navigation domain, neglecting the spatial relationships +that are essential to the learning of navigation. Inspired by the behavior that +humans naturally build semantically and spatially meaningful cognitive maps in +their brains during navigation, in this paper, we propose a novel +navigational-specific visual representation learning method by contrasting the +agent's egocentric views and semantic maps (Ego$^2$-Map). We apply the visual +transformer as the backbone encoder and train the model with data collected +from the large-scale Habitat-Matterport3D environments. Ego$^2$-Map learning +transfers the compact and rich information from a map, such as objects, +structure and transition, to the agent's egocentric representations for +navigation. Experiments show that agents using our learned representations on +object-goal navigation outperform recent visual pre-training methods. Moreover, +our representations significantly improve vision-and-language navigation in +continuous environments for both high-level and low-level action spaces, +achieving new state-of-the-art results of 47% SR and 41% SPL on the test +server. + +
+
+
+
+
+ + ☆ ES2Net: An Efficient Spectral-Spatial Network for Hyperspectral Image + Change Detection + + +
+ Hyperspectral image change detection (HSI-CD) aims to identify the
+differences in bitemporal HSIs. To mitigate spectral redundancy and improve the
+discriminativeness of changing features, some methods introduced band selection
+technology to select bands conducive for CD. However, these methods are limited
+by the inability to be trained end-to-end with the deep learning-based feature
+extractor and do not consider the complex nonlinear relationships among bands.
+In this paper, we propose an end-to-end efficient spectral-spatial change
+detection network (ES2Net) to address these issues. Specifically, we devised a
+learnable band selection module to automatically select bands conducive to CD.
+It can be jointly optimized with a feature extraction network and capture the
+complex nonlinear relationships among bands. Moreover, considering the large
+spatial feature distribution differences among different bands, we design the
+cluster-wise spatial attention mechanism that assigns a spatial attention
+factor to each individual band to individually improve the feature
+discriminativeness for each band. Experiments on three widely used HSI-CD
+datasets demonstrate the effectiveness and superiority of this method compared
+with other state-of-the-art methods.
+
+
+
+
+
+ + ☆ Development of pericardial fat count images using a combination of three + different deep-learning models + + +
+ Rationale and Objectives: Pericardial fat (PF), the thoracic visceral fat +surrounding the heart, promotes the development of coronary artery disease by +inducing inflammation of the coronary arteries. For evaluating PF, this study +aimed to generate pericardial fat count images (PFCIs) from chest radiographs +(CXRs) using a dedicated deep-learning model. + Materials and Methods: The data of 269 consecutive patients who underwent +coronary computed tomography (CT) were reviewed. Patients with metal implants, +pleural effusion, history of thoracic surgery, or that of malignancy were +excluded. Thus, the data of 191 patients were used. PFCIs were generated from +the projection of three-dimensional CT images, where fat accumulation was +represented by a high pixel value. Three different deep-learning models, +including CycleGAN, were combined in the proposed method to generate PFCIs from +CXRs. A single CycleGAN-based model was used to generate PFCIs from CXRs for +comparison with the proposed method. To evaluate the image quality of the +generated PFCIs, structural similarity index measure (SSIM), mean squared error +(MSE), and mean absolute error (MAE) of (i) the PFCI generated using the +proposed method and (ii) the PFCI generated using the single model were +compared. + Results: The mean SSIM, MSE, and MAE were as follows: 0.856, 0.0128, and +0.0357, respectively, for the proposed model; and 0.762, 0.0198, and 0.0504, +respectively, for the single CycleGAN-based model. + Conclusion: PFCIs generated from CXRs with the proposed model showed better +performance than those with the single model. PFCI evaluation without CT may be +possible with the proposed method. + +
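+
+ The three image-quality metrics reported in this comparison (SSIM, MSE, MAE)
+can be computed as in the sketch below; treating the generated and reference
+PFCIs as grayscale arrays normalized to [0, 1] is an assumption for
+illustration, not the study's preprocessing:
+
+   # Sketch of the SSIM / MSE / MAE comparison for a generated PFCI against its
+   # CT-derived reference; images assumed to be grayscale arrays in [0, 1].
+   import numpy as np
+   from skimage.metrics import mean_squared_error, structural_similarity
+
+   def pfci_metrics(generated: np.ndarray, reference: np.ndarray) -> dict:
+       return {
+           "SSIM": structural_similarity(reference, generated, data_range=1.0),
+           "MSE": mean_squared_error(reference, generated),
+           "MAE": float(np.mean(np.abs(reference - generated))),
+       }
+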
+
+
+
+
+ + ☆ Building Extraction from Remote Sensing Images via an Uncertainty-Aware + Network + + +
+ Building extraction aims to segment building pixels from remote sensing +images and plays an essential role in many applications, such as city planning +and urban dynamic monitoring. Over the past few years, deep learning methods +with encoder-decoder architectures have achieved remarkable performance due to +their powerful feature representation capability. Nevertheless, due to the +varying scales and styles of buildings, conventional deep learning models +always suffer from uncertain predictions and cannot accurately distinguish the +complete footprints of the building from the complex distribution of ground +objects, leading to a large degree of omission and commission. In this paper, +we realize the importance of uncertain prediction and propose a novel and +straightforward Uncertainty-Aware Network (UANet) to alleviate this problem. To +verify the performance of our proposed UANet, we conduct extensive experiments +on three public building datasets, including the WHU building dataset, the +Massachusetts building dataset, and the Inria aerial image dataset. Results +demonstrate that the proposed UANet outperforms other state-of-the-art +algorithms by a large margin. + +
+
+
+
+
+ + ☆ RANSAC-NN: Unsupervised Image Outlier Detection using RANSAC + + +
+ Image outlier detection (OD) is crucial for ensuring the quality and accuracy +of image datasets used in computer vision tasks. The majority of OD algorithms, +however, have not been targeted toward image data. Consequently, the results of +applying such algorithms to images are often suboptimal. In this work, we +propose RANSAC-NN, a novel unsupervised OD algorithm specifically designed for +images. By comparing images in a RANSAC-based approach, our algorithm +automatically predicts the outlier score of each image without additional +training or label information. We evaluate RANSAC-NN against state-of-the-art +OD algorithms on 15 diverse datasets. Without any hyperparameter tuning, +RANSAC-NN consistently performs favorably in contrast to other algorithms in +almost every dataset category. Furthermore, we provide a detailed analysis to +understand each RANSAC-NN component, and we demonstrate its potential +applications in image mislabeled detection. Code for RANSAC-NN is provided at +https://github.com/mxtsai/ransac-nn + +
+
+ comment: 19 pages, 18 figures +
+
+
+
+
+ + ☆ Hybrid-CSR: Coupling Explicit and Implicit Shape Representation for + Cortical Surface Reconstruction + + +
+ We present Hybrid-CSR, a geometric deep-learning model that combines explicit +and implicit shape representations for cortical surface reconstruction. +Specifically, Hybrid-CSR begins with explicit deformations of template meshes +to obtain coarsely reconstructed cortical surfaces, based on which the oriented +point clouds are estimated for the subsequent differentiable poisson surface +reconstruction. By doing so, our method unifies explicit (oriented point +clouds) and implicit (indicator function) cortical surface reconstruction. +Compared to explicit representation-based methods, our hybrid approach is more +friendly to capture detailed structures, and when compared with implicit +representation-based methods, our method can be topology aware because of +end-to-end training with a mesh-based deformation module. In order to address +topology defects, we propose a new topology correction pipeline that relies on +optimization-based diffeomorphic surface registration. Experimental results on +three brain datasets show that our approach surpasses existing implicit and +explicit cortical surface reconstruction methods in numeric metrics in terms of +accuracy, regularity, and consistency. + +
+
+
+
+
+ + ☆ Simultaneous temperature estimation and nonuniformity correction from + multiple frames + + +
+ Infrared (IR) cameras are widely used for temperature measurements in various
+applications, including agriculture, medicine, and security. Low-cost IR cameras
+have immense potential to replace expensive radiometric cameras in these
+applications; however, low-cost microbolometer-based IR cameras are prone to
+spatially-variant nonuniformity and to drift in temperature measurements, which
+limits their usability in practical scenarios.
+ To address these limitations, we propose a novel approach for simultaneous
+temperature estimation and nonuniformity correction from multiple frames
+captured by low-cost microbolometer-based IR cameras. We leverage the physical
+image acquisition model of the camera and incorporate it into a deep learning
+architecture called kernel estimation networks (KPN), which enables us to
+combine multiple frames despite imperfect registration between them. We also
+propose a novel offset block that incorporates the ambient temperature into the
+model and enables us to estimate the offset of the camera, which is a key
+factor in temperature estimation.
+ Our findings demonstrate that the number of frames has a significant impact
+on the accuracy of temperature estimation and nonuniformity correction.
+Moreover, our approach achieves a significant improvement in performance
+compared to vanilla KPN, thanks to the offset block. The method was tested on
+real data collected by a low-cost IR camera mounted on a UAV, showing only a
+small average error of $0.27^\circ C-0.54^\circ C$ relative to costly
+scientific-grade radiometric cameras.
+ Our method provides an accurate and efficient solution for simultaneous
+temperature estimation and nonuniformity correction, which has important
+implications for a wide range of practical applications.
+
+
+
+
+
+ + ☆ TransHuman: A Transformer-based Human Representation for Generalizable + Neural Human Rendering ICCV 2023 + + +
+ In this paper, we focus on the task of generalizable neural human rendering +which trains conditional Neural Radiance Fields (NeRF) from multi-view videos +of different characters. To handle the dynamic human motion, previous methods +have primarily used a SparseConvNet (SPC)-based human representation to process +the painted SMPL. However, such SPC-based representation i) optimizes under the +volatile observation space which leads to the pose-misalignment between +training and inference stages, and ii) lacks the global relationships among +human parts that is critical for handling the incomplete painted SMPL. Tackling +these issues, we present a brand-new framework named TransHuman, which learns +the painted SMPL under the canonical space and captures the global +relationships between human parts with transformers. Specifically, TransHuman +is mainly composed of Transformer-based Human Encoding (TransHE), Deformable +Partial Radiance Fields (DPaRF), and Fine-grained Detail Integration (FDI). +TransHE first processes the painted SMPL under the canonical space via +transformers for capturing the global relationships between human parts. Then, +DPaRF binds each output token with a deformable radiance field for encoding the +query point under the observation space. Finally, the FDI is employed to +further integrate fine-grained information from reference images. Extensive +experiments on ZJU-MoCap and H36M show that our TransHuman achieves a +significantly new state-of-the-art performance with high efficiency. Project +page: https://pansanity666.github.io/TransHuman/ + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Downstream-agnostic Adversarial Examples ICCV '23 + + +
+ Self-supervised learning usually uses a large amount of unlabeled data to +pre-train an encoder which can be used as a general-purpose feature extractor, +such that downstream users only need to perform fine-tuning operations to enjoy +the benefit of "large model". Despite this promising prospect, the security of +pre-trained encoder has not been thoroughly investigated yet, especially when +the pre-trained encoder is publicly available for commercial use. + In this paper, we propose AdvEncoder, the first framework for generating +downstream-agnostic universal adversarial examples based on the pre-trained +encoder. AdvEncoder aims to construct a universal adversarial perturbation or +patch for a set of natural images that can fool all the downstream tasks +inheriting the victim pre-trained encoder. Unlike traditional adversarial +example works, the pre-trained encoder only outputs feature vectors rather than +classification labels. Therefore, we first exploit the high frequency component +information of the image to guide the generation of adversarial examples. Then +we design a generative attack framework to construct adversarial +perturbations/patches by learning the distribution of the attack surrogate +dataset to improve their attack success rates and transferability. Our results +show that an attacker can successfully attack downstream tasks without knowing +either the pre-training dataset or the downstream dataset. We also tailor four +defenses for pre-trained encoders, the results of which further prove the +attack ability of AdvEncoder. + +
+
+ comment: This paper has been accepted by the International Conference on + Computer Vision (ICCV '23, October 2--6, 2023, Paris, France) +
+
+
+
+
+ + ☆ FDCT: Fast Depth Completion for Transparent Objects + + +
+ Depth completion is crucial for many robotic tasks such as autonomous +driving, 3-D reconstruction, and manipulation. Despite the significant +progress, existing methods remain computationally intensive and often fail to +meet the real-time requirements of low-power robotic platforms. Additionally, +most methods are designed for opaque objects and struggle with transparent +objects due to the special properties of reflection and refraction. To address +these challenges, we propose a Fast Depth Completion framework for Transparent +objects (FDCT), which also benefits downstream tasks like object pose +estimation. To leverage local information and avoid overfitting issues when +integrating it with global information, we design a new fusion branch and +shortcuts to exploit low-level features and a loss function to suppress +overfitting. This results in an accurate and user-friendly depth rectification +framework which can recover dense depth estimation from RGB-D images alone. +Extensive experiments demonstrate that FDCT can run about 70 FPS with a higher +accuracy than the state-of-the-art methods. We also demonstrate that FDCT can +improve pose estimation in object grasping tasks. The source code is available +at https://github.com/Nonmy/FDCT + +
+
+ comment: 9 pages, 7 figures
+
+
+
+
+
+ + ☆ Context Perception Parallel Decoder for Scene Text Recognition + + +
+ Scene text recognition (STR) methods have struggled to attain high accuracy
+and fast inference speed. Autoregressive (AR)-based STR models use the
+previously recognized characters to decode the next character iteratively. They
+show superiority in terms of accuracy; however, the inference speed is slow due
+to this iteration. Alternatively, parallel decoding (PD)-based STR models infer
+all the characters in a single decoding pass. They have advantages in terms of
+inference speed but lower accuracy, as it is difficult to build a robust
+recognition context in such a pass. In this paper, we first present an
+empirical study of AR decoding in STR. In addition to constructing a new AR
+model with the top accuracy, we find that the success of the AR decoder also
+lies in providing guidance on visual context perception rather than language
+modeling as claimed in existing studies. As a consequence, we propose the
+Context Perception Parallel Decoder (CPPD) to decode the character sequence in
+a single PD pass. CPPD devises a character counting module and a character
+ordering module. Given a text instance, the former infers the occurrence count
+of each character, while the latter deduces the character reading order and
+placeholders. Together with the character prediction task, they construct a
+context that robustly tells what the character sequence is and where the
+characters appear, well mimicking the context conveyed by AR decoding.
+Experiments on both English and Chinese benchmarks demonstrate that CPPD models
+achieve highly competitive accuracy. Moreover, they run approximately 7x faster
+than their AR counterparts, and are also among the fastest recognizers. The
+code will be released soon.
+
+
+
+
+
+ + ☆ Building-road Collaborative Extraction from Remotely Sensed Images via + Cross-Interaction SP + + +
+ Buildings are the basic carrier of social production and human life; roads
+are the links that interconnect social networks. Building and road information
+has important application value in the frontier fields of regional coordinated
+development, disaster prevention, auto-driving, etc. Mapping buildings and
+roads from very high-resolution (VHR) remote sensing images has become a hot
+research topic. However, the existing methods often ignore the strong spatial
+correlation between roads and buildings and extract them in isolation. To fully
+utilize the complementary advantages between buildings and roads, we propose a
+building-road collaborative extraction method based on multi-task and
+cross-scale feature interaction to improve the accuracy of both tasks in a
+complementary way. A multi-task interaction module is proposed to interact
+information across tasks and preserve the unique information of each task,
+which tackles the seesaw phenomenon in multitask learning. By considering the
+variation in appearance and structure between buildings and roads, a
+cross-scale interaction module is designed to automatically learn the optimal
+receptive field for different tasks. Compared with many existing methods that
+train each task individually, the proposed collaborative extraction method can
+utilize the complementary advantages between buildings and roads by the
+proposed inter-task and inter-scale feature interactions, and automatically
+select the optimal receptive field for different tasks. Experiments on a wide
+range of urban and rural scenarios show that the proposed algorithm can achieve
+building-road extraction with outstanding performance and efficiency.
+
+
+ comment: 34 pages,9 figures, submitted to ISPRS Journal of Photogrammetry and + Remote Sensing +
+
+
+
+
+ + ☆ ResWCAE: Biometric Pattern Image Denoising Using Residual + Wavelet-Conditioned Autoencoder + + +
+ The utilization of biometric authentication with pattern images is
+increasingly popular in compact Internet of Things (IoT) devices. However, the
+reliability of such systems can be compromised by image quality issues,
+particularly in the presence of high levels of noise. While state-of-the-art
+deep learning algorithms designed for generic image denoising have shown
+promise, their large number of parameters and lack of optimization for unique
+biometric pattern retrieval make them unsuitable for these devices and
+scenarios. In response to these challenges, this paper proposes a lightweight
+and robust deep learning architecture, the Residual Wavelet-Conditioned
+Convolutional Autoencoder (Res-WCAE) with a Kullback-Leibler divergence (KLD)
+regularization, designed specifically for fingerprint image denoising. Res-WCAE
+comprises two encoders - an image encoder and a wavelet encoder - and one
+decoder. Residual connections between the image encoder and decoder are
+leveraged to preserve fine-grained spatial features, while the bottleneck layer
+is conditioned on the compressed representation of features obtained from the
+wavelet encoder using approximation and detail subimages in the
+wavelet-transform domain. The effectiveness of Res-WCAE is evaluated against
+several state-of-the-art denoising methods, and the experimental results
+demonstrate that Res-WCAE outperforms these methods, particularly for heavily
+degraded fingerprint images in the presence of high levels of noise. Overall,
+Res-WCAE shows promise as a solution to the challenges faced by biometric
+authentication systems in compact IoT devices.
+
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Explainable Depression Detection via Head Motion Patterns + + +
+ While depression has been studied via multimodal non-verbal behavioural cues, +head motion behaviour has not received much attention as a biomarker. This +study demonstrates the utility of fundamental head-motion units, termed +\emph{kinemes}, for depression detection by adopting two distinct approaches, +and employing distinctive features: (a) discovering kinemes from head motion +data corresponding to both depressed patients and healthy controls, and (b) +learning kineme patterns only from healthy controls, and computing statistics +derived from reconstruction errors for both the patient and control classes. +Employing machine learning methods, we evaluate depression classification +performance on the \emph{BlackDog} and \emph{AVEC2013} datasets. Our findings +indicate that: (1) head motion patterns are effective biomarkers for detecting +depressive symptoms, and (2) explanatory kineme patterns consistent with prior +findings can be observed for the two classes. Overall, we achieve peak F1 +scores of 0.79 and 0.82, respectively, over BlackDog and AVEC2013 for binary +classification over episodic \emph{thin-slices}, and a peak F1 of 0.72 over +videos for AVEC2013. + +
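+
+ One simple instantiation of the second approach above is to learn prototypical
+head-motion units from control subjects by clustering short pose windows and
+then use reconstruction-error statistics as features. The clustering choice,
+window length, number of kinemes, and feature set below are assumptions; the
+paper's actual kineme-discovery procedure may differ:
+
+   # Sketch: k-means "kinemes" fitted on control subjects' head-pose windows,
+   # with reconstruction-error statistics as per-subject features (assumed setup).
+   import numpy as np
+   from sklearn.cluster import KMeans
+
+   def windows(pose_seq, length=30):           # pose_seq: (T, 3) pitch/yaw/roll per frame
+       return np.stack([pose_seq[i:i + length].ravel()
+                        for i in range(0, len(pose_seq) - length + 1, length)])
+
+   def fit_kinemes(control_sequences, n_kinemes=16):
+       data = np.concatenate([windows(s) for s in control_sequences])
+       return KMeans(n_clusters=n_kinemes, n_init=10).fit(data)
+
+   def reconstruction_features(pose_seq, kinemes):
+       err = kinemes.transform(windows(pose_seq)).min(axis=1)   # distance to nearest kineme
+       return np.array([err.mean(), err.std(), err.max()])      # simple summary statistics
+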
+
+
+
+
+ + ☆ DQ-Det: Learning Dynamic Query Combinations for Transformer-based Object + Detection and Segmentation ICML 2023 + + +
+ Transformer-based detection and segmentation methods use a list of learned +detection queries to retrieve information from the transformer network and +learn to predict the location and category of one specific object from each +query. We empirically find that random convex combinations of the learned +queries are still good for the corresponding models. We then propose to learn a +convex combination with dynamic coefficients based on the high-level semantics +of the image. The generated dynamic queries, named modulated queries, better +capture the prior of object locations and categories in the different images. +Equipped with our modulated queries, a wide range of DETR-based models achieve +consistent and superior performance across multiple tasks including object +detection, instance segmentation, panoptic segmentation, and video instance +segmentation. + +
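+
+ The modulated queries described above are convex combinations of the learned
+queries with image-dependent coefficients; the schematic module below uses mean
+pooling and a single linear coefficient head, which are assumptions for
+illustration rather than the paper's exact design:
+
+   # Schematic sketch of image-conditioned convex combinations of learned
+   # detection queries; pooling and coefficient head are illustrative choices.
+   import torch
+   import torch.nn as nn
+
+   class ModulatedQueries(nn.Module):
+       def __init__(self, num_queries=100, dim=256, num_mixes=100):
+           super().__init__()
+           self.queries = nn.Embedding(num_queries, dim)        # standard learned queries
+           self.coef_head = nn.Linear(dim, num_mixes * num_queries)
+           self.num_mixes, self.num_queries = num_mixes, num_queries
+
+       def forward(self, image_feats):                          # image_feats: (B, N, dim)
+           pooled = image_feats.mean(dim=1)                     # global image descriptor
+           logits = self.coef_head(pooled).view(-1, self.num_mixes, self.num_queries)
+           coefs = logits.softmax(dim=-1)                       # convex: non-negative, sums to 1
+           return coefs @ self.queries.weight                   # (B, num_mixes, dim) modulated queries
+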
+
+ comment: 12 pages, 4 figures, ICML 2023 +
+
+
+
+
+ + ☆ Multi-Modal Machine Learning for Assessing Gaming Skills in Online + Streaming: A Case Study with CS:GO + + +
+ Online streaming is an emerging market that attracts much attention. Assessing
+gaming skills from videos is an important task for streaming service providers
+to discover talented gamers. Service providers require this information to offer
+customized recommendations and service promotions to their customers. Meanwhile,
+this is also an important multi-modal machine learning task since online
+streaming combines vision, audio and text modalities. In this study we begin by
+identifying flaws in the dataset and proceed to clean it manually. Then we
+propose several variants of the latest end-to-end models to learn joint
+representations of multiple modalities. Through our extensive experimentation,
+we demonstrate the efficacy of our proposals. Moreover, we identify that our
+proposed models are prone to identifying users instead of learning meaningful
+representations. Finally, we propose future work to address this issue.
+
+
+
+
+
+ + ☆ EchoGLAD: Hierarchical Graph Neural Networks for Left Ventricle Landmark + Detection on Echocardiograms MICCAI 2023 + + +
+ The functional assessment of the left ventricle chamber of the heart requires +detecting four landmark locations and measuring the internal dimension of the +left ventricle and the approximate mass of the surrounding muscle. The key +challenge of automating this task with machine learning is the sparsity of +clinical labels, i.e., only a few landmark pixels in a high-dimensional image +are annotated, leading many prior works to heavily rely on isotropic label +smoothing. However, such a label smoothing strategy ignores the anatomical +information of the image and induces some bias. To address this challenge, we +introduce an echocardiogram-based, hierarchical graph neural network (GNN) for +left ventricle landmark detection (EchoGLAD). Our main contributions are: 1) a +hierarchical graph representation learning framework for multi-resolution +landmark detection via GNNs; 2) induced hierarchical supervision at different +levels of granularity using a multi-level loss. We evaluate our model on a +public and a private dataset under the in-distribution (ID) and +out-of-distribution (OOD) settings. For the ID setting, we achieve the +state-of-the-art mean absolute errors (MAEs) of 1.46 mm and 1.86 mm on the two +datasets. Our model also shows better OOD generalization than prior works with +a testing MAE of 4.3 mm. + +
+
+ comment: To be published in MICCAI 2023 +
+
+
+
+
+ + ☆ ASCON: Anatomy-aware Supervised Contrastive Learning Framework for + Low-dose CT Denoising MICCAI 2023 + + +
+ While various deep learning methods have been proposed for low-dose computed +tomography (CT) denoising, most of them leverage the normal-dose CT images as +the ground-truth to supervise the denoising process. These methods typically +ignore the inherent correlation within a single CT image, especially the +anatomical semantics of human tissues, and lack the interpretability on the +denoising process. In this paper, we propose a novel Anatomy-aware Supervised +CONtrastive learning framework, termed ASCON, which can explore the anatomical +semantics for low-dose CT denoising while providing anatomical +interpretability. The proposed ASCON consists of two novel designs: an +efficient self-attention-based U-Net (ESAU-Net) and a multi-scale anatomical +contrastive network (MAC-Net). First, to better capture global-local +interactions and adapt to the high-resolution input, an efficient ESAU-Net is +introduced by using a channel-wise self-attention mechanism. Second, MAC-Net +incorporates a patch-wise non-contrastive module to capture inherent anatomical +information and a pixel-wise contrastive module to maintain intrinsic +anatomical consistency. Extensive experimental results on two public low-dose +CT denoising datasets demonstrate superior performance of ASCON over +state-of-the-art models. Remarkably, our ASCON provides anatomical +interpretability for low-dose CT denoising for the first time. Source code is +available at https://github.com/hao1635/ASCON. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ☆ Expediting Building Footprint Segmentation from High-resolution Remote + Sensing Images via progressive lenient supervision + + +
+ The efficacy of building footprint segmentation from remotely sensed images +has been hindered by model transfer effectiveness. Many existing building +segmentation methods were developed upon the encoder-decoder architecture of +U-Net, in which the encoder is finetuned from the newly developed backbone +networks that are pre-trained on ImageNet. However, the heavy computational +burden of the existing decoder designs hampers the successful transfer of these +modern encoder networks to remote sensing tasks. Even the widely-adopted deep +supervision strategy fails to mitigate these challenges due to its invalid loss +in hybrid regions where foreground and background pixels are intermixed. In +this paper, we conduct a comprehensive evaluation of existing decoder network +designs for building footprint segmentation and propose an efficient framework +denoted as BFSeg to enhance learning efficiency and effectiveness. +Specifically, a densely-connected coarse-to-fine feature fusion decoder network +that facilitates easy and fast feature fusion across scales is proposed. +Moreover, considering the invalidity of hybrid regions in the down-sampled +ground truth during the deep supervision process, we present a lenient deep +supervision and distillation strategy that enables the network to learn proper +knowledge from deep supervision. Building upon these advancements, we have +developed a new family of building segmentation networks, which consistently +surpass prior works with outstanding performance and efficiency across a wide +range of newly developed encoder networks. The code will be released on +https://github.com/HaonanGuo/BFSeg-Efficient-Building-Footprint-Segmentation-Framework. + +
+
+ comment: 13 pages,8 figures. Submitted to IEEE Transactions on Neural Networks + and Learning Systems +
+
+
+
+
+ + ☆ LoLep: Single-View View Synthesis with Locally-Learned Planes and + Self-Attention Occlusion Inference ICCV 2023 + + +
+ We propose a novel method, LoLep, which regresses Locally-Learned planes from +a single RGB image to represent scenes accurately, thus generating better novel +views. Without the depth information, regressing appropriate plane locations is +a challenging problem. To solve this issue, we pre-partition the disparity +space into bins and design a disparity sampler to regress local offsets for +multiple planes in each bin. However, only using such a sampler makes the +network not convergent; we further propose two optimizing strategies that +combine with different disparity distributions of datasets and propose an +occlusion-aware reprojection loss as a simple yet effective geometric +supervision technique. We also introduce a self-attention mechanism to improve +occlusion inference and present a Block-Sampling Self-Attention (BS-SA) module +to address the problem of applying self-attention to large feature maps. We +demonstrate the effectiveness of our approach and generate state-of-the-art +results on different datasets. Compared to MINE, our approach has an LPIPS +reduction of 4.8%-9.0% and an RV reduction of 83.1%-84.7%. We also evaluate the +performance on real-world images and demonstrate the benefits. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ DeepCL: Deep Change Feature Learning on Remote Sensing Images in the + Metric Space + + +
+ Change detection (CD) is an important yet challenging task in the Earth +observation field for monitoring Earth surface dynamics. The advent of deep +learning techniques has recently propelled automatic CD into a technological +revolution. Nevertheless, deep learning-based CD methods are still plagued by +two primary issues: 1) insufficient temporal relationship modeling and 2) +pseudo-change misclassification. To address these issues, we complement the +strong temporal modeling ability of metric learning with the prominent fitting +ability of segmentation and propose a deep change feature learning (DeepCL) +framework for robust and explainable CD. Firstly, we designed a hard +sample-aware contrastive loss, which reweights the importance of hard and +simple samples. This loss allows for explicit modeling of the temporal +correlation between bi-temporal remote sensing images. Furthermore, the modeled +temporal relations are utilized as knowledge prior to guide the segmentation +process for detecting change regions. The DeepCL framework is thoroughly +evaluated both theoretically and experimentally, demonstrating its superior +feature discriminability, resilience against pseudo changes, and adaptability +to a variety of CD algorithms. Extensive comparative experiments substantiate +the quantitative and qualitative superiority of DeepCL over state-of-the-art CD +approaches. + +
+
+ comment: 12 pages,7 figures, submitted to IEEE Transactions on Image + Processing +
+
+
+
+
+ + ♻ ☆ Magic123: One Image to High-Quality 3D Object Generation Using Both 2D + and 3D Diffusion Priors + + +
+ We present Magic123, a two-stage coarse-to-fine approach for high-quality,
+textured 3D mesh generation from a single unposed image in the wild using
+both 2D and 3D priors. In the first stage, we optimize a neural radiance field
+to produce a coarse geometry. In the second stage, we adopt a memory-efficient
+differentiable mesh representation to yield a high-resolution mesh with a
+visually appealing texture. In both stages, the 3D content is learned through
+reference view supervision and novel views guided by a combination of 2D and 3D
+diffusion priors. We introduce a single trade-off parameter between the 2D and
+3D priors to control exploration (more imaginative) and exploitation (more
+precise) of the generated geometry. Additionally, we employ textual inversion
+and monocular depth regularization to encourage consistent appearances across
+views and to prevent degenerate solutions, respectively. Magic123 demonstrates
+a significant improvement over previous image-to-3D techniques, as validated
+through extensive experiments on synthetic benchmarks and diverse real-world
+images. Our code, models, and generated 3D assets are available at
+https://github.com/guochengqian/Magic123.
+
+
+ comment: webpage: https://guochengqian.github.io/project/magic123/ +
+
+
+
+
+ + ♻ ☆ TokenFlow: Consistent Diffusion Features for Consistent Video Editing + + +
+ The generative AI revolution has recently expanded to videos. Nevertheless, +current state-of-the-art video models are still lagging behind image models in +terms of visual quality and user control over the generated content. In this +work, we present a framework that harnesses the power of a text-to-image +diffusion model for the task of text-driven video editing. Specifically, given +a source video and a target text-prompt, our method generates a high-quality +video that adheres to the target text, while preserving the spatial layout and +motion of the input video. Our method is based on a key observation that +consistency in the edited video can be obtained by enforcing consistency in the +diffusion feature space. We achieve this by explicitly propagating diffusion +features based on inter-frame correspondences, readily available in the model. +Thus, our framework does not require any training or fine-tuning, and can work +in conjunction with any off-the-shelf text-to-image editing method. We +demonstrate state-of-the-art editing results on a variety of real-world videos. +Webpage: https://diffusion-tokenflow.github.io/ + +
+
+
+
+
+ + ♻ ☆ What's Wrong with the Absolute Trajectory Error? + + +
+ One of the limitations of the commonly used Absolute Trajectory Error (ATE) +is that it is highly sensitive to outliers. As a result, in the presence of +just a few outliers, it often fails to reflect the varying accuracy as the +inlier trajectory error or the number of outliers varies. In this work, we +propose an alternative error metric for evaluating the accuracy of the +reconstructed camera trajectory. Our metric, named Discernible Trajectory Error +(DTE), is computed in five steps: (1) Shift the ground-truth and estimated +trajectories such that both of their geometric medians are located at the +origin. (2) Rotate the estimated trajectory such that it minimizes the sum of +geodesic distances between the corresponding camera orientations. (3) Scale the +estimated trajectory such that the median distance of the cameras to their +geometric median is the same as that of the ground truth. (4) Compute, +winsorize and normalize the distances between the corresponding cameras. (5) +Obtain the DTE by taking the average of the mean and the root-mean-square (RMS) +of the resulting distances. This metric is an attractive alternative to the +ATE, in that it is capable of discerning the varying trajectory accuracy as the +inlier trajectory error or the number of outliers varies. Using the similar +idea, we also propose a novel rotation error metric, named Discernible Rotation +Error (DRE), which has similar advantages to the DTE. Furthermore, we propose a +simple yet effective method for calibrating the camera-to-marker rotation, +which is needed for the computation of our metrics. Our methods are verified +through extensive simulations. + +
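+
+ The five-step DTE computation can be sketched directly from the description
+above. In the sketch below the rotation step is approximated with a chordal
+(SVD-based) average rather than the exact geodesic minimizer, and the
+winsorization level and normalization constant are assumptions, since the
+abstract leaves them unspecified:
+
+   # Sketch of the Discernible Trajectory Error (DTE); rotation alignment uses a
+   # chordal approximation and the winsorization limit (5%) is an assumption.
+   import numpy as np
+   from scipy.stats.mstats import winsorize
+
+   def geometric_median(pts, iters=100):
+       m = pts.mean(axis=0)
+       for _ in range(iters):                                  # Weiszfeld iterations
+           d = np.linalg.norm(pts - m, axis=1)
+           w = 1.0 / np.clip(d, 1e-9, None)
+           m = (w[:, None] * pts).sum(axis=0) / w.sum()
+       return m
+
+   def dte(gt_pos, est_pos, gt_rot, est_rot):
+       """gt_pos, est_pos: (N,3) camera centers; gt_rot, est_rot: (N,3,3) orientations."""
+       gt = gt_pos - geometric_median(gt_pos)                  # 1) shift medians to the origin
+       est = est_pos - geometric_median(est_pos)
+       M = sum(Rg @ Re.T for Rg, Re in zip(gt_rot, est_rot))   # 2) align orientations (chordal approx.)
+       U, _, Vt = np.linalg.svd(M)
+       R = U @ np.diag([1, 1, np.linalg.det(U @ Vt)]) @ Vt
+       est = est @ R.T
+       s = np.median(np.linalg.norm(gt, axis=1)) / \
+           np.median(np.linalg.norm(est, axis=1))              # 3) match the median distance
+       est = s * est
+       d = np.linalg.norm(gt - est, axis=1)                    # 4) distances, winsorized
+       d = np.asarray(winsorize(d, limits=(0, 0.05)))
+       return 0.5 * (d.mean() + np.sqrt((d ** 2).mean()))      # 5) average of mean and RMS
+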
+
+
+
+
+ + ♻ ☆ ELVIS: Empowering Locality of Vision Language Pre-training with + Intra-modal Similarity + + +
+ Deep learning has shown great potential in assisting radiologists in reading +chest X-ray (CXR) images, but its need for expensive annotations for improving +performance prevents widespread clinical application. Visual language +pre-training (VLP) can alleviate the burden and cost of annotation by +leveraging routinely generated reports for radiographs, which exist in large +quantities as well as in paired form (image-text pairs). Additionally, +extensions to localization-aware VLPs are being proposed to address the need +for accurate localization of abnormalities for computer-aided diagnosis (CAD) +in CXR. However, we find that the formulation proposed in the locality-aware VLP +literature actually leads to a loss in spatial relationships required for +downstream localization tasks. Therefore, we propose Empowering Locality of VLP +with Intra-modal Similarity, ELVIS, a VLP aware of intra-modal locality, to +better preserve the locality within radiographs or reports, which enhances the +ability to comprehend location references in text reports. Our locality-aware +VLP method significantly outperforms state-of-the-art baselines in multiple +segmentation tasks and the MS-CXR phrase grounding task. Qualitatively, we show +that ELVIS focuses well on regions of interest described in the report text +compared to prior approaches, allowing for enhanced interpretability. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ CLIPTER: Looking at the Bigger Picture in Scene Text Recognition ICCV 2023 + + +
+ Reading text in real-world scenarios often requires understanding the context +surrounding it, especially when dealing with poor-quality text. However, +current scene text recognizers are unaware of the bigger picture as they +operate on cropped text images. In this study, we harness the representative +capabilities of modern vision-language models, such as CLIP, to provide +scene-level information to the crop-based recognizer. We achieve this by fusing +a rich representation of the entire image, obtained from the vision-language +model, with the recognizer word-level features via a gated cross-attention +mechanism. This component gradually shifts to the context-enhanced +representation, allowing for stable fine-tuning of a pretrained recognizer. We +demonstrate the effectiveness of our model-agnostic framework, CLIPTER (CLIP +TExt Recognition), on leading text recognition architectures and achieve +state-of-the-art results across multiple benchmarks. Furthermore, our analysis +highlights improved robustness to out-of-vocabulary words and enhanced +generalization in low-data regimes. + +
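+
+ A hedged PyTorch-style sketch of a gated cross-attention fusion of this kind (module and argument names are my own; the paper's exact architecture, gating function, and initialization may differ):
+
+ import torch
+ import torch.nn as nn
+
+ class GatedCrossAttention(nn.Module):
+     def __init__(self, dim, num_heads=4):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.gate = nn.Parameter(torch.zeros(1))  # zero-init: start at the original features
+
+     def forward(self, word_feats, scene_feat):
+         # word_feats: (B, T, dim) recognizer word-level features (queries)
+         # scene_feat: (B, dim) global image embedding from a vision-language model
+         context = scene_feat.unsqueeze(1)              # (B, 1, dim) key/value
+         fused, _ = self.attn(word_feats, context, context)
+         # a learned gate gradually shifts toward the context-enhanced representation
+         return word_feats + torch.tanh(self.gate) * fused
+
+ A zero-initialized gate keeps the pretrained recognizer's behaviour at the start of fine-tuning, which is one way to realize the gradual shift to the context-enhanced representation described above.
+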
+
+ comment: Accepted for publication by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ RepViT: Revisiting Mobile CNN From ViT Perspective + + +
+ Recently, lightweight Vision Transformers (ViTs) have demonstrated superior +performance and lower latency compared with lightweight Convolutional Neural +Networks (CNNs) on resource-constrained mobile devices. This improvement is +usually attributed to the multi-head self-attention module, which enables the +model to learn global representations. However, the architectural disparities +between lightweight ViTs and lightweight CNNs have not been adequately +examined. In this study, we revisit the efficient design of lightweight CNNs +and emphasize their potential for mobile devices. We incrementally enhance the +mobile-friendliness of a standard lightweight CNN, specifically MobileNetV3, by +integrating the efficient architectural choices of lightweight ViTs. This results +in a new family of pure lightweight CNNs, namely RepViT. Extensive +experiments show that RepViT outperforms existing state-of-the-art lightweight +ViTs and exhibits favorable latency in various vision tasks. On ImageNet, +RepViT achieves over 80\% top-1 accuracy with nearly 1ms latency on an iPhone +12, which, to the best of our knowledge, is a first for a lightweight model. +Our largest model, RepViT-M3, obtains 81.4\% accuracy with only +1.3ms latency. The code and trained models are available at +\url{https://github.com/jameslahm/RepViT}. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ From Sparse to Precise: A Practical Editing Approach for Intracardiac + Echocardiography Segmentation MICCAI 2023 + + +
+ Accurate and safe catheter ablation procedures for patients with atrial +fibrillation require precise segmentation of cardiac structures in Intracardiac +Echocardiography (ICE) imaging. Prior studies have suggested methods that +employ 3D geometry information from the ICE transducer to create a sparse ICE +volume by placing 2D frames in a 3D grid, enabling training of 3D segmentation +models. However, the resulting 3D masks from these models can be inaccurate and +may lead to serious clinical complications due to the sparse sampling in ICE +data, frame misalignment, and cardiac motion. To address this issue, we +propose an interactive editing framework that allows users to edit segmentation +output by drawing scribbles on a 2D frame. The user interaction is mapped to +the 3D grid and utilized to execute an editing step that modifies the +segmentation in the vicinity of the interaction while preserving the previous +segmentation away from the interaction. Furthermore, our framework accommodates +multiple edits to the segmentation output in a sequential manner without +compromising previous edits. This paper presents a novel loss function and a +novel evaluation metric specifically designed for editing. Results from +cross-validation and testing indicate that our proposed loss function +outperforms standard losses and training strategies in terms of segmentation +quality and adherence to user input. Additionally, we show quantitatively and +qualitatively that subsequent edits do not compromise previous edits when using +our method, as opposed to standard segmentation losses. Overall, our approach +enhances the accuracy of the segmentation while avoiding undesired changes away +from user interactions and without compromising the quality of previously +edited regions, leading to better patient outcomes. + +
+
+ comment: Accepted to MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Towards Open Vocabulary Learning: A Survey + + +
+ In the field of visual scene understanding, deep neural networks have made +impressive advancements in various core tasks like segmentation, tracking, and +detection. However, most approaches operate on the close-set assumption, +meaning that the model can only identify pre-defined categories that are +present in the training set. Recently, open vocabulary settings were proposed +due to the rapid progress of vision language pre-training. These new approaches +seek to locate and recognize categories beyond the annotated label space. The +open vocabulary approach is more general, practical, and effective compared to +weakly supervised and zero-shot settings. This paper provides a thorough review +of open vocabulary learning, summarizing and analyzing recent developments in +the field. In particular, we begin by comparing it to related concepts such as +zero-shot learning, open-set recognition, and out-of-distribution detection. +Then, we review several closely related tasks in the case of segmentation and +detection, including long-tail problems, few-shot, and zero-shot settings. For +the method survey, we first present the basic knowledge of detection and +segmentation in close-set as the preliminary knowledge. Next, we examine +various scenarios in which open vocabulary learning is used, identifying common +design elements and core ideas. Then, we compare the recent detection and +segmentation approaches in commonly used datasets and benchmarks. Finally, we +conclude with insights, issues, and discussions regarding future research +directions. To our knowledge, this is the first comprehensive literature review +of open vocabulary learning. We keep tracing related works at +https://github.com/jianzongwu/Awesome-Open-Vocabulary. + +
+
+ comment: Project page: https://github.com/jianzongwu/Awesome-Open-Vocabulary +
+
+
+
+
+ + ♻ ☆ Betrayed by Captions: Joint Caption Grounding and Generation for Open + Vocabulary Instance Segmentation ICCV-2023 + + +
+ In this work, we focus on open vocabulary instance segmentation to expand a +segmentation model to classify and segment instance-level novel categories. +Previous approaches have relied on massive caption datasets and complex +pipelines to establish one-to-one mappings between image regions and words in +captions. However, such methods build noisy supervision by matching non-visual +words, such as adjectives and verbs, to image regions. Meanwhile, context words +are also important for inferring the existence of novel objects as they show +high inter-correlations with novel categories. To overcome these limitations, +we devise a joint \textbf{Caption Grounding and Generation (CGG)} framework, +which incorporates a novel grounding loss that only focuses on matching object +nouns to improve learning efficiency. We also introduce a caption generation +head that enables additional supervision and contextual modeling as a +complement to the grounding loss. Our analysis and results demonstrate +that grounding and generation components complement each other, significantly +enhancing the segmentation performance for novel classes. Experiments on the +COCO dataset under two settings, Open Vocabulary Instance Segmentation (OVIS) +and Open Set Panoptic Segmentation (OSPS), demonstrate the superiority of +CGG. Specifically, CGG achieves a substantial improvement of 6.8% mAP for novel +classes without extra data on the OVIS task and 15% PQ improvements for novel +classes on the OSPS benchmark. + +
+
+ comment: ICCV-2023 +
+
+
+
+
+ + ♻ ☆ Towards Robust Referring Image Segmentation + + +
+ Referring Image Segmentation (RIS) is a fundamental vision-language task that +outputs object masks based on text descriptions. Many works have achieved +considerable progress for RIS, including different fusion method designs. In +this work, we explore an essential question, ``What if the text description is +wrong or misleading?'' For example, the described objects are not in the image. +We term such a sentence as a negative sentence. However, existing solutions for +RIS cannot handle such a setting. To this end, we propose a new formulation of +RIS, named Robust Referring Image Segmentation (R-RIS). It considers the +negative sentence inputs besides the regular positive text inputs. To +facilitate this new task, we create three R-RIS datasets by augmenting existing +RIS datasets with negative sentences and propose new metrics to evaluate both +types of inputs in a unified manner. Furthermore, we propose a new +transformer-based model, called RefSegformer, with a token-based vision and +language fusion module. Our design can be easily extended to our R-RIS setting +by adding extra blank tokens. Our proposed RefSegformer achieves +state-of-the-art results on both RIS and R-RIS datasets, establishing a solid +baseline for both settings. Our project page is at +\url{https://github.com/jianzongwu/robust-ref-seg}. + +
+
+ comment: update more results +
+
+
+
+
+ + ♻ ☆ PDPP:Projected Diffusion for Procedure Planning in Instructional Videos CVPR 2023 + + +
+ In this paper, we study the problem of procedure planning in instructional +videos, which aims to make goal-directed plans given the current visual +observations in unstructured real-life videos. Previous works cast this problem +as a sequence planning problem and leverage either heavy intermediate visual +observations or natural language instructions as supervision, resulting in +complex learning schemes and expensive annotation costs. In contrast, we treat +this problem as a distribution fitting problem. In this sense, we model the +whole intermediate action sequence distribution with a diffusion model (PDPP), +and thus transform the planning problem to a sampling process from this +distribution. In addition, we remove the expensive intermediate supervision, +and simply use task labels from instructional videos as supervision instead. +Our model is a U-Net based diffusion model, which directly samples action +sequences from the learned distribution with the given start and end +observations. Furthermore, we apply an efficient projection method to provide +accurate conditional guides for our model during the learning and sampling +process. Experiments on three datasets with different scales show that our PDPP +model can achieve the state-of-the-art performance on multiple metrics, even +without the task supervision. Code and trained models are available at +https://github.com/MCG-NJU/PDPP. + +
+
+ comment: Accepted as a highlight paper at CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Affordance Detection in 3D Point Clouds IROS 2023 + + +
+ Affordance detection is a challenging problem with a wide variety of robotic +applications. Traditional affordance detection methods are limited to a +predefined set of affordance labels, hence potentially restricting the +adaptability of intelligent robots in complex and dynamic environments. In this +paper, we present the Open-Vocabulary Affordance Detection (OpenAD) method, +which is capable of detecting an unbounded number of affordances in 3D point +clouds. By simultaneously learning the affordance text and the point feature, +OpenAD successfully exploits the semantic relationships between affordances. +Therefore, our proposed method enables zero-shot detection and can +detect previously unseen affordances without a single annotated example. +Intensive experimental results show that OpenAD works effectively on a wide +range of affordance detection setups and outperforms other baselines by a large +margin. Additionally, we demonstrate the practicality of the proposed OpenAD in +real-world robotic applications with a fast inference speed (~100ms). Our +project is available at https://openad2023.github.io. + +
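+
+ The zero-shot step boils down to scoring point features against affordance text embeddings. A minimal sketch (my own illustration, assuming per-point features and text embeddings that live in the same space; the actual training objective is described in the paper):
+
+ import numpy as np
+
+ def zero_shot_affordances(point_feats, text_embs, labels):
+     # point_feats: (N, D) per-point features; text_embs: (K, D) affordance-text embeddings
+     p = point_feats / np.linalg.norm(point_feats, axis=1, keepdims=True)
+     t = text_embs / np.linalg.norm(text_embs, axis=1, keepdims=True)
+     sims = p @ t.T                                   # (N, K) cosine similarities
+     return [labels[k] for k in sims.argmax(axis=1)]  # open-vocabulary label per point
+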
+
+ comment: Accepted at The 2023 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2023) +
+
+
+
+
+ + ♻ ☆ Boosting Novel Category Discovery Over Domains with Soft Contrastive + Learning and All-in-One Classifier ICCV + + +
+ Unsupervised domain adaptation (UDA) has proven to be highly effective in +transferring knowledge from a label-rich source domain to a label-scarce target +domain. However, the presence of additional novel categories in the target +domain has led to the development of open-set domain adaptation (ODA) and +universal domain adaptation (UNDA). Existing ODA and UNDA methods treat all +novel categories as a single, unified unknown class and attempt to detect it +during training. However, we found that domain variance can lead to more +significant view-noise in unsupervised data augmentation, which affects the +effectiveness of contrastive learning (CL) and causes the model to be +overconfident in novel category discovery. To address these issues, a framework +named Soft-contrastive All-in-one Network (SAN) is proposed for ODA and UNDA +tasks. SAN includes a novel data-augmentation-based soft contrastive learning +(SCL) loss to fine-tune the backbone for feature transfer and a more +human-intuitive classifier to improve new class discovery capability. The SCL +loss weakens the adverse effects of the data augmentation view-noise problem +which is amplified in domain transfer tasks. The All-in-One (AIO) classifier +overcomes the overconfidence problem of current mainstream closed-set and +open-set classifiers. Visualization and ablation experiments demonstrate the +effectiveness of the proposed innovations. Furthermore, extensive experiment +results on ODA and UNDA show that SAN outperforms existing state-of-the-art +methods. + +
+
+ comment: Accepted by ICCV +
+
+
+
+
+ + ♻ ☆ GMA3D: Local-Global Attention Learning to Estimate Occluded Motions of + Scene Flow + + +
+ Scene flow represents the motion information of each point in a 3D point +cloud. It is vital to many downstream tasks, such as motion +segmentation and object tracking. However, there are always occluded points +between two consecutive point clouds, whether from sparse data sampling +or real-world occlusion. In this paper, we focus on addressing occlusion issues +in scene flow by exploiting the semantic self-similarity and motion consistency of the +moving objects. We propose a GMA3D module based on the transformer framework, +which utilizes local and global semantic similarity to infer the motion +information of occluded points from the motion information of local and global +non-occluded points, respectively, and then uses an offset aggregator to +aggregate them. Our module is the first to apply a transformer-based +architecture to address the scene flow occlusion problem on point clouds. +Experiments show that our GMA3D can solve the occlusion problem in scene +flow, especially in real scenes. We evaluated the proposed method on +occluded versions of point cloud datasets and achieve state-of-the-art results on +the real-scene KITTI dataset. To verify that GMA3D is still beneficial for +non-occluded scene flow, we also conducted experiments on the non-occluded versions +of these datasets and achieved promising performance on FlyingThings3D and KITTI. The code +is available at https://anonymous.4open.science/r/GMA3D-E100. + +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Interface Design to Mitigate Inflation in Recommender Systems + + +
+ Recommendation systems rely on user-provided data to learn about item quality +and provide personalized recommendations. An implicit assumption when +aggregating ratings into item quality is that ratings are strong indicators of +item quality. In this work, we test this assumption using data collected from a +music discovery application. Our study focuses on two factors that cause rating +inflation: heterogeneous user rating behavior and the dynamics of personalized +recommendations. We show that user rating behavior substantially varies by +user, leading to item quality estimates that reflect the users who rated an +item more than the item quality itself. Additionally, items that are more +likely to be shown via personalized recommendations can experience a +substantial increase in their exposure and potential bias toward them. To +mitigate these effects, we analyze the results of a randomized controlled trial +in which the rating interface was modified. The test resulted in a substantial +improvement in user rating behavior and a reduction in item quality inflation. +These findings highlight the importance of carefully considering the +assumptions underlying recommendation systems and designing interfaces that +encourage accurate rating behavior. + +
+
+
+
+
+ + ☆ RANSAC-NN: Unsupervised Image Outlier Detection using RANSAC + + +
+ Image outlier detection (OD) is crucial for ensuring the quality and accuracy +of image datasets used in computer vision tasks. The majority of OD algorithms, +however, have not been targeted toward image data. Consequently, the results of +applying such algorithms to images are often suboptimal. In this work, we +propose RANSAC-NN, a novel unsupervised OD algorithm specifically designed for +images. By comparing images in a RANSAC-based approach, our algorithm +automatically predicts the outlier score of each image without additional +training or label information. We evaluate RANSAC-NN against state-of-the-art +OD algorithms on 15 diverse datasets. Without any hyperparameter tuning, +RANSAC-NN consistently performs favorably compared with other algorithms in +almost every dataset category. Furthermore, we provide a detailed analysis to +understand each RANSAC-NN component, and we demonstrate its potential +applications in mislabeled image detection. Code for RANSAC-NN is provided at +https://github.com/mxtsai/ransac-nn + +
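+
+ A hedged sketch of what a RANSAC-style outlier score over image features could look like (purely illustrative; the actual sampling, comparison, and scoring rules of RANSAC-NN are defined in the paper and its released code):
+
+ import numpy as np
+
+ def ransac_style_outlier_scores(feats, n_iters=100, sample_size=8, seed=None):
+     # feats: (N, D) image feature vectors; higher score = more outlier-like
+     rng = np.random.default_rng(seed)
+     n = feats.shape[0]
+     scores = np.zeros(n)
+     for _ in range(n_iters):
+         idx = rng.choice(n, size=sample_size, replace=False)   # random reference subset
+         d = np.linalg.norm(feats[:, None, :] - feats[None, idx, :], axis=2)
+         scores += d.min(axis=1)   # distance to the nearest sampled reference image
+     return scores / n_iters       # averaged over random subsets
+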
+
+ comment: 19 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Recall, Robustness, and Lexicographic Evaluation + + +
+ Researchers use recall to evaluate rankings across a variety of retrieval, +recommendation, and machine learning tasks. While there is a colloquial +interpretation of recall in set-based evaluation, the research community is far +from a principled understanding of recall metrics for rankings. The lack of +principled understanding of or motivation for recall has resulted in criticism +within the retrieval community questioning whether recall is useful as a measure at all. In +this light, we reflect on the measurement of recall in rankings from a formal +perspective. Our analysis is composed of three tenets: recall, robustness, and +lexicographic evaluation. First, we formally define `recall-orientation' as +sensitivity to movement of the bottom-ranked relevant item. Second, we analyze +our concept of recall orientation from the perspective of robustness with +respect to possible searchers and content providers. Finally, we extend this +conceptual and theoretical treatment of recall by developing a practical +preference-based evaluation method based on lexicographic comparison. Through +extensive empirical analysis across 17 TREC tracks, we establish that our new +evaluation method, lexirecall, is correlated with existing recall metrics and +exhibits substantially higher discriminative power and stability in the +presence of missing labels. Our conceptual, theoretical, and empirical analysis +substantially deepens our understanding of recall and motivates its adoption +through connections to robustness and fairness. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Comparing Apples to Apples: Generating Aspect-Aware Comparative + Sentences from User Reviews + + +
+ It is time-consuming to find the best product among many similar +alternatives. Comparative sentences can help to contrast one item from others +in a way that highlights important features of an item that stand out. Given +reviews of one or multiple items and relevant item features, we generate +comparative review sentences to aid users to find the best fit. Specifically, +our model consists of three successive components in a transformer: (i) an item +encoding module to encode an item for comparison, (ii) a comparison generation +module that generates comparative sentences in an autoregressive manner, (iii) +a novel decoding method for user personalization. We show that our pipeline +generates fluent and diverse comparative sentences. We run experiments on the +relevance and fidelity of our generated sentences in a human evaluation study +and find that our algorithm creates comparative review sentences that are +relevant and truthful. + +
+
+
+
+
+ + ♻ ☆ Investigating the Factual Knowledge Boundary of Large Language Models + with Retrieval Augmentation + + +
+ Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require +a substantial amount of factual knowledge and often rely on external +information for assistance. Recently, large language models (LLMs) (e.g., +ChatGPT) have demonstrated impressive prowess in solving a wide range of tasks +with world knowledge, including knowledge-intensive tasks. However, it remains +unclear how well LLMs are able to perceive their factual knowledge boundaries, +particularly how they behave when incorporating retrieval augmentation. In this +study, we present an initial analysis of the factual knowledge boundaries of +LLMs and how retrieval augmentation affects LLMs on open-domain QA. Specifically, +we focus on three primary research questions and analyze them by examining the QA +performance, a priori judgement, and a posteriori judgement of LLMs. We show +evidence that LLMs possess unwavering confidence in their capabilities to +respond to questions and the accuracy of their responses. Furthermore, +retrieval augmentation proves to be an effective approach in enhancing LLMs' +awareness of knowledge boundaries, thereby improving their judgemental +abilities. Additionally, we find that LLMs have a propensity to rely on +the provided retrieval results when formulating answers, while the quality of +these results significantly impacts their reliance. The code to reproduce this +work is available at https://github.com/RUCAIBox/LLM-Knowledge-Boundary. + +
+
+
+
+
+ + ♻ ☆ ICPE: An Item Cluster-Wise Pareto-Efficient Framework for Recommendation + Debiasing + + +
+ Recommender systems based on historical user-item interactions are of vital +importance for web-based services. However, the observed data used to train the +recommender model suffers from severe bias issues. Practically, the item +frequency distribution of the dataset is a highly skewed power-law +distribution. Interactions of a small fraction of head items account for almost +the whole training data. The normal training paradigm from such biased data +tends to repetitively generate recommendations from the head items, which +further exacerbates the biases and affects the exploration of potentially +interesting items from the niche set. In this work, we innovatively explore the +central theme of recommendation debiasing from an item cluster-wise +multi-objective optimization perspective. Aiming to balance the learning on +various item clusters that differ in popularity during the training process, we +propose a model-agnostic framework, namely Item Cluster-Wise Pareto-Efficient +Recommendation (ICPE). In detail, our item cluster-wise optimization +target is that the recommender model should balance all item clusters that differ in +popularity; thus, we set the model's learning on each item cluster as a unique +optimization objective. To achieve this goal, we first explore items' +popularity levels from a novel causal reasoning perspective. Then, we devise +popularity discrepancy-based bisecting clustering to separate the item +clusters. Next, we adaptively find the overall harmonious gradient direction +for cluster-wise optimization objectives from a Pareto-efficient solver. +Finally, in the prediction stage, we perform counterfactual inference to +further eliminate the impact of global propensity. Extensive experimental +results verify the superiority of ICPE in overall recommendation performance +and bias elimination. + +
+
+
+
+
+
+
+
+ + Machine Learning 42 + +
+
+
+ + ☆ Information-theoretic Analysis of Test Data Sensitivity in Uncertainty + + +
+ Bayesian inference is often utilized for uncertainty quantification tasks. A +recent analysis by Xu and Raginsky (2022) rigorously decomposed the predictive +uncertainty in Bayesian inference into two uncertainties, called aleatoric and +epistemic uncertainties, which represent the inherent randomness in the +data-generating process and the variability due to insufficient data, +respectively. They analyzed those uncertainties in an information-theoretic +way, assuming that the model is well-specified and treating the model's +parameters as latent variables. However, the existing information-theoretic +analysis of uncertainty cannot explain the widely believed property of +uncertainty, known as the sensitivity between the test and training data. This +property implies that when test data are similar to training data in some sense, the +epistemic uncertainty should become small. In this work, we study such +uncertainty sensitivity using our novel decomposition method for the predictive +uncertainty. Our analysis successfully defines such sensitivity using +information-theoretic quantities. Furthermore, we extend the existing analysis +of Bayesian meta-learning and show the novel sensitivities among tasks for the +first time. + +
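+
+ For intuition, the standard information-theoretic split can be computed from Monte Carlo samples of the posterior predictive. A small sketch of that generic decomposition into total, aleatoric, and epistemic terms (this is not the paper's novel decomposition, which additionally captures test-train sensitivity):
+
+ import numpy as np
+
+ def uncertainty_split(probs):
+     # probs: (S, C) predicted class probabilities from S posterior samples
+     eps = 1e-12
+     mean_p = probs.mean(axis=0)
+     total = -(mean_p * np.log(mean_p + eps)).sum()                 # predictive entropy
+     aleatoric = -(probs * np.log(probs + eps)).sum(axis=1).mean()  # expected entropy
+     epistemic = total - aleatoric                                  # mutual information
+     return total, aleatoric, epistemic
+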
+
+
+
+
+ + ☆ DiAMoNDBack: Diffusion-denoising Autoregressive Model for + Non-Deterministic Backmapping of Cα Protein Traces + + +
+ Coarse-grained molecular models of proteins permit access to length and time +scales unattainable by all-atom models and the simulation of processes that +occur on long-time scales such as aggregation and folding. The reduced +resolution realizes computational accelerations but an atomistic representation +can be vital for a complete understanding of mechanistic details. Backmapping +is the process of restoring all-atom resolution to coarse-grained molecular +models. In this work, we report DiAMoNDBack (Diffusion-denoising Autoregressive +Model for Non-Deterministic Backmapping) as an autoregressive denoising +diffusion probability model to restore all-atom details to coarse-grained +protein representations retaining only C{\alpha} coordinates. The +autoregressive generation process proceeds from the protein N-terminus to +C-terminus in a residue-by-residue fashion conditioned on the C{\alpha} trace +and previously backmapped backbone and side chain atoms within the local +neighborhood. The local and autoregressive nature of our model makes it +transferable between proteins. The stochastic nature of the denoising diffusion +process means that the model generates a realistic ensemble of backbone and +side chain all-atom configurations consistent with the coarse-grained C{\alpha} +trace. We train DiAMoNDBack over 65k+ structures from Protein Data Bank (PDB) +and validate it in applications to a hold-out PDB test set, +intrinsically-disordered protein structures from the Protein Ensemble Database +(PED), molecular dynamics simulations of fast-folding mini-proteins from DE +Shaw Research, and coarse-grained simulation data. We achieve state-of-the-art +reconstruction performance in terms of correct bond formation, avoidance of +side chain clashes, and diversity of the generated side chain configurational +states. We make DiAMoNDBack model publicly available as a free and open source +Python package. + +
+
+
+
+
+ + ☆ ProtoFL: Unsupervised Federated Learning via Prototypical Distillation ICCV 2023 + + +
+ Federated learning (FL) is a promising approach for enhancing data privacy +preservation, particularly for authentication systems. However, limited round +communications, scarce representation, and scalability pose significant +challenges to its deployment, hindering its full potential. In this paper, we +propose 'ProtoFL', Prototypical Representation Distillation based unsupervised +Federated Learning to enhance the representation power of a global model and +reduce round communication costs. Additionally, we introduce a local one-class +classifier based on normalizing flows to improve performance with limited data. +Our study represents the first investigation of using FL to improve one-class +classification performance. We conduct extensive experiments on five widely +used benchmarks, namely MNIST, CIFAR-10, CIFAR-100, ImageNet-30, and +Keystroke-Dynamics, to demonstrate the superior performance of our proposed +framework over previous methods in the literature. + +
+
+ comment: Accepted by ICCV 2023. Hansol Kim and Youngjun Kwak contributed + equally to this work +
+
+
+
+
+ + ☆ WEPRO: Weight Prediction for Efficient Optimization of Hybrid + Quantum-Classical Algorithms + + +
+ The exponential run time of quantum simulators on classical machines and long +queue depths and high costs of real quantum devices present significant +challenges in the effective training of Variational Quantum Algorithms (VQAs) +like Quantum Neural Networks (QNNs), Variational Quantum Eigensolver (VQE) and +Quantum Approximate Optimization Algorithm (QAOA). To address these +limitations, we propose a new approach, WEPRO (Weight Prediction), which +accelerates the convergence of VQAs by exploiting regular trends in the +parameter weights. We introduce two techniques for optimal prediction +performance, namely Naive Prediction (NaP) and Adaptive Prediction (AdaP). +Through extensive experimentation and training of multiple QNN models on +various datasets, we demonstrate that WEPRO offers a speedup of approximately +$2.25\times$ compared to standard training methods, while also providing +improved accuracy (up to $2.3\%$ higher) and loss (up to $6.1\%$ lower) with +low storage and computational overheads. We also evaluate WEPRO's effectiveness +in VQE for molecular ground-state energy estimation and in QAOA for graph +MaxCut. Our results show that WEPRO leads to speed improvements of up to +$3.1\times$ for VQE and $2.91\times$ for QAOA, compared to traditional +optimization techniques, while using up to $3.3\times$ fewer shots +(i.e., repeated circuit executions) per training iteration. + +
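+
+ The weight-prediction idea can be pictured as periodically extrapolating each trainable parameter along its recent trajectory instead of paying for extra circuit evaluations. A minimal sketch (my own illustration; the paper's NaP/AdaP update rules, schedules, and safeguards are more involved):
+
+ import numpy as np
+
+ def naive_weight_prediction(theta_prev, theta_curr, horizon=1):
+     # linear extrapolation of the parameter vector along its recent trend
+     theta_prev, theta_curr = np.asarray(theta_prev), np.asarray(theta_curr)
+     return theta_curr + horizon * (theta_curr - theta_prev)
+
+ # usage: every k optimizer steps, replace one optimizer update by a predicted jump
+ # theta = naive_weight_prediction(theta_history[-2], theta_history[-1])
+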
+
+
+
+
+ + ☆ Multifidelity Covariance Estimation via Regression on the Manifold of + Symmetric Positive Definite Matrices + + +
+ We introduce a multifidelity estimator of covariance matrices formulated as +the solution to a regression problem on the manifold of symmetric positive +definite matrices. The estimator is positive definite by construction, and the +Mahalanobis distance minimized to obtain it possesses properties which enable +practical computation. We show that our manifold regression multifidelity +(MRMF) covariance estimator is a maximum likelihood estimator under a certain +error model on manifold tangent space. More broadly, we show that our +Riemannian regression framework encompasses existing multifidelity covariance +estimators constructed from control variates. We demonstrate via numerical +examples that our estimator can provide significant decreases, up to one order +of magnitude, in squared estimation error relative to both single-fidelity and +other multifidelity covariance estimators. Furthermore, preservation of +positive definiteness ensures that our estimator is compatible with downstream +tasks, such as data assimilation and metric learning, in which this property is +essential. + +
+
+ comment: 30 pages + 15-page supplement +
+
+
+
+
+ + ☆ A Generalized Schwarz-type Non-overlapping Domain Decomposition Method + using Physics-constrained Neural Networks + + +
+ We present a meshless Schwarz-type non-overlapping domain decomposition +method based on artificial neural networks for solving forward and inverse +problems involving partial differential equations (PDEs). To ensure the +consistency of solutions across neighboring subdomains, we adopt a generalized +Robin-type interface condition, assigning unique Robin parameters to each +subdomain. These subdomain-specific Robin parameters are learned to minimize +the mismatch on the Robin interface condition, facilitating efficient +information exchange during training. Our method is applicable to both the +Laplace and Helmholtz equations. It represents each local solution by an +independent neural network model, which is trained to minimize the loss on the +governing PDE while strictly enforcing boundary and interface conditions +through an augmented Lagrangian formalism. A key strength of our method lies in +its ability to learn a Robin parameter for each subdomain, thereby enhancing +information exchange with its neighboring subdomains. We observe that the +learned Robin parameters adapt to the local behavior of the solution, domain +partitioning and subdomain location relative to the overall domain. Extensive +experiments on forward and inverse problems, including one-way and two-way +decompositions with crosspoints, demonstrate the versatility and performance of +our proposed approach. + +
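+
+ A hedged sketch of a generalized Robin-type interface mismatch between two neighboring subdomain networks, evaluated at shared interface points (variable names and the exact residual form are my assumptions; the paper enforces these conditions within an augmented Lagrangian formalism):
+
+ import numpy as np
+
+ def robin_interface_mismatch(u_i, dudn_i, u_j, dudn_j, beta_i):
+     # u_*, dudn_*: subdomain solutions and their normal derivatives at shared
+     # interface points (a common normal direction is assumed for both);
+     # beta_i: the learnable Robin parameter assigned to subdomain i
+     residual = (dudn_i + beta_i * u_i) - (dudn_j + beta_i * u_j)
+     return np.mean(residual ** 2)
+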
+
+
+
+
+ + ☆ Augmented Box Replay: Overcoming Foreground Shift for Incremental Object + Detection + + +
+ In incremental learning, replaying stored samples from previous tasks +together with current task samples is one of the most efficient approaches to +address catastrophic forgetting. However, unlike incremental classification, +image replay has not been successfully applied to incremental object detection +(IOD). In this paper, we identify the overlooked problem of foreground shift as +the main reason for this. Foreground shift only occurs when replaying images of +previous tasks and refers to the fact that their background might contain +foreground objects of the current task. To overcome this problem, a novel and +efficient Augmented Box Replay (ABR) method is developed that only stores and +replays foreground objects and thereby circumvents the foreground shift +problem. In addition, we propose an innovative Attentive RoI Distillation loss +that uses spatial attention from region-of-interest (RoI) features to constrain +the current model to focus on the most important information from the old model. ABR +significantly reduces forgetting of previous classes while maintaining high +plasticity in current classes. Moreover, it considerably reduces the storage +requirements when compared to standard image replay. Comprehensive experiments +on Pascal-VOC and COCO datasets support the state-of-the-art performance of our +model. + +
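+
+ Replaying only foreground objects can be pictured as pasting stored object crops into current-task images and extending their annotations. A rough sketch (illustrative only; ABR's actual placement, mixing, and augmentation policies are described in the paper):
+
+ import numpy as np
+
+ def augmented_box_replay(image, boxes, stored_crops, stored_labels, seed=None):
+     # image: (H, W, 3) current-task image; boxes: list of (x1, y1, x2, y2, label)
+     # stored_crops: list of (h, w, 3) foreground crops saved from previous tasks
+     rng = np.random.default_rng(seed)
+     img, out_boxes = image.copy(), list(boxes)
+     H, W = img.shape[:2]
+     for crop, label in zip(stored_crops, stored_labels):
+         h, w = crop.shape[:2]
+         if h >= H or w >= W:
+             continue  # skip crops that do not fit
+         y, x = rng.integers(0, H - h), rng.integers(0, W - w)
+         img[y:y + h, x:x + w] = crop                       # paste the old-class object
+         out_boxes.append((x, y, x + w, y + h, label))      # extend the annotations
+     return img, out_boxes
+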
+
+
+
+
+ + ☆ Practical Commercial 5G Standalone (SA) Uplink Throughput Prediction + + +
+ While the 5G New Radio (NR) network promises a huge uplift of the uplink +throughput, the improvement can only be seen when the User Equipment (UE) is +connected to the high-frequency millimeter wave (mmWave) band. With the rise of +uplink-intensive smartphone applications such as the real-time transmission of +UHD 4K/8K videos, and Virtual Reality (VR)/Augmented Reality (AR) contents, +uplink throughput prediction plays a huge role in maximizing the users' quality +of experience (QoE). In this paper, we propose using a ConvLSTM-based neural +network to predict the future uplink throughput based on past uplink throughput +and RF parameters. The network is trained using the data from real-world drive +tests on commercial 5G SA networks while riding commuter trains, which +accounted for various frequency bands, handover, and blind spots. To make sure +our model can be practically implemented, we then limited our model to only use +the information available via Android API, then evaluate our model using the +data from both commuter trains and other methods of transportation. The results +show that our model reaches an average prediction accuracy of 98.9\% with an +average RMSE of 1.80 Mbps across all unseen evaluation scenarios. + +
+
+
+
+
+ + ☆ A Machine Learning Approach to Two-Stage Adaptive Robust Optimization + + +
+ We propose an approach based on machine learning to solve two-stage linear +adaptive robust optimization (ARO) problems with binary here-and-now variables +and polyhedral uncertainty sets. We encode the optimal here-and-now decisions, +the worst-case scenarios associated with the optimal here-and-now decisions, +and the optimal wait-and-see decisions into what we denote as the strategy. We +solve multiple similar ARO instances in advance using the column and constraint +generation algorithm and extract the optimal strategies to generate a training +set. We train a machine learning model that predicts high-quality strategies +for the here-and-now decisions, the worst-case scenarios associated with the +optimal here-and-now decisions, and the wait-and-see decisions. We also +introduce an algorithm to reduce the number of different target classes the +machine learning algorithm needs to be trained on. We apply the proposed +approach to the facility location, the multi-item inventory control and the +unit commitment problems. Our approach solves ARO problems drastically faster +than the state-of-the-art algorithms with high accuracy. + +
+
+
+
+
+ + ☆ Optimal Control of Multiclass Fluid Queueing Networks: A Machine + Learning Approach + + +
+ We propose a machine learning approach to the optimal control of multiclass +fluid queueing networks (MFQNETs) that provides explicit and insightful control +policies. We prove that a threshold type optimal policy exists for MFQNET +control problems, where the threshold curves are hyperplanes passing through +the origin. We use Optimal Classification Trees with hyperplane splits (OCT-H) +to learn an optimal control policy for MFQNETs. We use numerical solutions of +MFQNET control problems as a training set and apply OCT-H to learn explicit +control policies. We report experimental results with up to 33 servers and 99 +classes that demonstrate that the learned policies achieve 100\% accuracy on +the test set. While the offline training of OCT-H can take days in large +networks, the online application takes milliseconds. + +
+
+
+
+
+ + ☆ Uncertainty-aware Grounded Action Transformation towards Sim-to-Real + Transfer for Traffic Signal Control + + +
+ Traffic signal control (TSC) is a complex and important task that affects the +daily lives of millions of people. Reinforcement Learning (RL) has shown +promising results in optimizing traffic signal control, but current RL-based +TSC methods are mainly trained in simulation and suffer from the performance +gap between simulation and the real world. In this paper, we propose a +simulation-to-real-world (sim-to-real) transfer approach called UGAT, which +transfers a learned policy trained from a simulated environment to a real-world +environment by dynamically transforming actions in the simulation with +uncertainty to mitigate the domain gap of transition dynamics. We evaluate our +method on a simulated traffic environment and show that it significantly +improves the performance of the transferred RL policy in the real world. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ In-Context Learning in Large Language Models Learns Label Relationships + but Is Not Conventional Learning + + +
+ The performance of Large Language Models (LLMs) on downstream tasks often +improves significantly when including examples of the input-label relationship +in the context. However, there is currently no consensus about how this +in-context learning (ICL) ability of LLMs works: for example, while Xie et al. +(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022b) +argue ICL does not even learn label relationships from in-context examples. In +this paper, we study (1) how labels of in-context examples affect predictions, +(2) how label relationships learned during pre-training interact with +input-label examples provided in-context, and (3) how ICL aggregates label +information across in-context examples. Our findings suggest that LLMs usually +incorporate information from in-context labels, but that pre-training and +in-context label relationships are treated differently, and that the model does +not consider all in-context information equally. Our results give insights into +understanding and aligning LLM behavior. + +
+
+
+
+
+ + ☆ Early Prediction of Alzheimers Disease Leveraging Symptom Occurrences + from Longitudinal Electronic Health Records of US Military Veterans + + +
+ Early prediction of Alzheimer's disease (AD) is crucial for timely +intervention and treatment. This study aims to use machine learning approaches +to analyze longitudinal electronic health records (EHRs) of patients with AD +and identify signs and symptoms that can predict AD onset earlier. We used a +case-control design with longitudinal EHRs from the U.S. Department of Veterans +Affairs Veterans Health Administration (VHA) from 2004 to 2021. Cases were VHA +patients with AD diagnosed after 1/1/2016 based on ICD-10-CM codes, matched 1:9 +with controls by age, sex and clinical utilization with replacement. We used a +panel of AD-related keywords and their occurrences over time in a patient's +longitudinal EHRs as predictors for AD prediction with four machine learning +models. We performed subgroup analyses by age, sex, and race/ethnicity, and +validated the model in a hold-out and "unseen" VHA stations group. Model +discrimination, calibration, and other relevant metrics were reported for +predictions up to ten years before ICD-based diagnosis. The study population +included 16,701 cases and 39,097 matched controls. The average number of +AD-related keywords (e.g., "concentration", "speaking") per year increased +rapidly for cases as diagnosis approached, from around 10 to over 40, while +remaining flat at 10 for controls. The best model achieved high discriminative +accuracy (ROCAUC 0.997) for predictions using data from at least ten years +before ICD-based diagnoses. The model was well-calibrated (Hosmer-Lemeshow +goodness-of-fit p-value = 0.99) and consistent across subgroups of age, sex and +race/ethnicity, except for patients younger than 65 (ROCAUC 0.746). Machine +learning models using AD-related keywords identified from EHR notes can predict +future AD diagnoses, suggesting its potential use for identifying AD risk using +EHR notes, offering an affordable way for early screening on large population. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Right for the Wrong Reason: Can Interpretable ML Techniques Detect + Spurious Correlations? + + +
+ While deep neural network models offer unmatched classification performance, +they are prone to learning spurious correlations in the data. Such dependencies +on confounding information can be difficult to detect using performance metrics +if the test data comes from the same distribution as the training data. +Interpretable ML methods such as post-hoc explanations or inherently +interpretable classifiers promise to identify faulty model reasoning. However, +there is mixed evidence whether many of these techniques are actually able to +do so. In this paper, we propose a rigorous evaluation strategy to assess an +explanation technique's ability to correctly identify spurious correlations. +Using this strategy, we evaluate five post-hoc explanation techniques and one +inherently interpretable method for their ability to detect three types of +artificially added confounders in a chest x-ray diagnosis task. We find that +the post-hoc technique SHAP, as well as the inherently interpretable Attri-Net +provide the best performance and can be used to reliably identify faulty model +behavior. + +
+
+
+
+
+ + ☆ Self-Supervised Learning for Audio-Based Emotion Recognition + + +
+ Emotion recognition models using audio input data can enable the development +of interactive systems with applications in mental healthcare, marketing, +gaming, and social media analysis. While the field of affective computing using +audio data is rich, a major barrier to achieving consistently high-performing +models is the paucity of available training labels. Self-supervised learning +(SSL) is a family of methods which can learn despite a scarcity of supervised +labels by predicting properties of the data itself. To understand the utility +of self-supervised learning for audio-based emotion recognition, we have +applied self-supervised learning pre-training to the classification of emotions +from the CMU-MOSEI's acoustic modality. Unlike prior papers that have +experimented with raw acoustic data, our technique has been applied to encoded +acoustic data. Our model is first pretrained to uncover the randomly masked +timestamps of the acoustic data. The pre-trained model is then fine-tuned using +a small sample of annotated data. The performance of the final model is then +evaluated via several evaluation metrics against a baseline deep learning model +with an identical backbone architecture. We find that self-supervised learning +consistently improves the performance of the model across all metrics. This +work shows the utility of self-supervised learning for affective computing, +demonstrating that self-supervised learning is most useful when the number of +training examples is small, and that the effect is most pronounced for emotions +which are easier to classify, such as happy, sad, and angry. This work further +demonstrates that self-supervised learning works when applied to embedded +feature representations rather than the traditional approach of pre-training on +the raw input space. + +
+
+ comment: 8 pages, 9 figures, submitted to IEEE Transactions on Affective + Computing +
+
+
+
+
+ + ☆ Rapid detection of soil carbonates by means of NIR spectroscopy, deep + learning methods and phase quantification by powder Xray diffraction + + +
+ Soil NIR spectral absorbance/reflectance libraries are utilized towards +improving agricultural production and the analysis of soil properties, which are a key +prerequisite for agroecological balance and environmental sustainability. +Carbonates, in particular, represent a soil property that is strongly affected +even by mild, let alone extreme, changes in environmental conditions during +climate change. In this study we propose a rapid and efficient way to predict +carbonate content in soil by means of FT NIR reflectance spectroscopy and +deep learning methods. We exploited multiple machine learning methods, +namely 1) an MLP regressor and 2) a CNN, and compared their performance with +other traditional ML algorithms such as PLSR, Cubist and SVM on the combined +dataset of two NIR spectral libraries: KSSL (USDA), a dataset of soil sample +reflectance spectra collected nationwide, and LUCAS TopSoil (European Soil +Library), which contains soil sample absorbance spectra from all over the +European Union, and used them to predict carbonate content on never before seen +soil samples. Spectra in the KSSL and TopSoil libraries were +acquired in the visNIR spectral region; however, in this study only the NIR +spectral region was utilized. Quantification of carbonates by means of X-ray +diffraction is in good agreement with the volumetric method and the MLP +prediction. Our work contributes to rapid carbonate content prediction in soil +samples in cases where 1) no volumetric method is available and 2) only NIR +absorbance spectra are available. To the best of our +knowledge, no other study presents a prediction model +trained on such an extensive dataset with such promising results on unseen +data, supporting the notion that deep learning models are +excellent prediction tools for soil carbonate content. + +
+
+ comment: 39 pages, 5 figures +
+
+
+
+
+ + ☆ TabADM: Unsupervised Tabular Anomaly Detection with Diffusion Models + + +
+ Tables are an abundant form of data with use cases across all scientific +fields. Real-world datasets often contain anomalous samples that can negatively +affect downstream analysis. In this work, we only assume access to contaminated +data and present a diffusion-based probabilistic model effective for +unsupervised anomaly detection. Our model is trained to learn the density of +normal samples by utilizing a unique rejection scheme to attenuate the +influence of anomalies on the density estimation. At inference, we identify +anomalies as samples in low-density regions. We use real data to demonstrate +that our method improves detection capabilities over baselines. Furthermore, +our method is relatively stable to the dimension of the data and does not +require extensive hyperparameter tuning. + +
+
+
+
+
+ + ☆ An axiomatized PDE model of deep neural networks + + +
+ Inspired by the relation between deep neural networks (DNNs) and partial +differential equations (PDEs), we study the general form of the PDE models of +deep neural networks. To achieve this goal, we formulate the DNN as an evolution +operator from a simple base model. Based on several reasonable assumptions, we +prove that the evolution operator is actually determined by a +convection-diffusion equation. This convection-diffusion equation model gives a +mathematical explanation for several effective networks. Moreover, we show that +the convection-diffusion model improves the robustness and reduces the +Rademacher complexity. Based on the convection-diffusion equation, we design a +new training method for ResNets. Experiments validate the performance of the +proposed method. + +
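+
+ For reference, a generic convection-diffusion equation of this kind (my notation; the paper's exact form, coefficients, and assumptions may differ) reads
+
+ \[ \frac{\partial u}{\partial t} + \mathbf{v}(x,t)\cdot\nabla u = \sigma\,\Delta u, \]
+
+ where the convection term transports features along a velocity field and the diffusion term (coefficient $\sigma$) is the part argued to improve robustness.
+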
+
+
+
+
+ + ☆ Tackling the Curse of Dimensionality with Physics-Informed Neural + Networks + + +
+ The curse-of-dimensionality (CoD) taxes computational resources heavily with +exponentially increasing computational cost as the dimension increases. This +poses great challenges in solving high-dimensional PDEs as Richard Bellman +first pointed out over 60 years ago. While there has been some recent success +in solving numerically partial differential equations (PDEs) in high +dimensions, such computations are prohibitively expensive, and true scaling of +general nonlinear PDEs to high dimensions has never been achieved. In this +paper, we develop a new method of scaling up physics-informed neural networks +(PINNs) to solve arbitrary high-dimensional PDEs. The new method, called +Stochastic Dimension Gradient Descent (SDGD), decomposes a gradient of PDEs +into pieces corresponding to different dimensions and samples randomly a subset +of these dimensional pieces in each iteration of training PINNs. We +theoretically prove the convergence guarantee and other desired properties of +the proposed method. We experimentally demonstrate that the proposed method +allows us to solve many notoriously hard high-dimensional PDEs, including the +Hamilton-Jacobi-Bellman and the Schr\"{o}dinger equations in thousands of +dimensions very fast on a single GPU using the PINNs mesh-free approach. For +example, we solve nontrivial nonlinear PDEs (the HJB-Lin equation and the BSB +equation) in 100,000 dimensions in 6 hours on a single GPU using SDGD with +PINNs. Since SDGD is a general training methodology of PINNs, SDGD can be +applied to any current and future variants of PINNs to scale them up for +arbitrary high-dimensional PDEs. + +
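+
+ The dimension-sampling idea can be illustrated with a finite-difference stand-in for the Laplacian (illustrative only; the paper applies the decomposition to exact automatic-differentiation derivatives inside PINN training, with convergence guarantees):
+
+ import numpy as np
+
+ def sampled_laplacian(u, x, dims, h=1e-3):
+     # u: callable mapping (N, d) points to (N,) values; dims: sampled coordinate indices
+     d = x.shape[1]
+     acc = np.zeros(x.shape[0])
+     for k in dims:
+         e = np.zeros(d)
+         e[k] = h
+         acc += (u(x + e) - 2.0 * u(x) + u(x - e)) / h ** 2
+     return acc * (d / len(dims))  # rescaled so the estimate is unbiased for the full Laplacian
+
+ # per training iteration: dims = np.random.choice(d, size=m, replace=False)
+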
+
+ comment: 32 pages, 5 figures +
+
+
+
+
+ + ☆ Physics-Informed Machine Learning of Argon Gas-Driven Melt Pool Dynamics + + +
+ Melt pool dynamics in metal additive manufacturing (AM) is critical to +process stability, microstructure formation, and final properties of the +printed materials. Physics-based simulation including computational fluid +dynamics (CFD) is the dominant approach to predict melt pool dynamics. However, +the physics-based simulation approaches suffer from the inherent issue of very +high computational cost. This paper provides a physics-informed machine +learning (PIML) method by integrating neural networks with the governing +physical laws to predict melt pool dynamics, such as temperature, velocity, +and pressure, without using any training data on velocity. This approach avoids +solving the highly non-linear Navier-Stokes equation numerically, which +significantly reduces the computational cost. The difficult-to-determine model +constants of the governing equations of the melt pool can also be inferred +through data-driven discovery. In addition, the physics-informed neural network +(PINN) architecture has been optimized for efficient model training. The +data efficiency of the PINN model is attributed to the soft penalties that incorporate the +governing partial differential equations (PDEs), initial conditions, and +boundary conditions into the PINN loss. + +
+
+
+
+
+ + ☆ RANSAC-NN: Unsupervised Image Outlier Detection using RANSAC + + +
+ Image outlier detection (OD) is crucial for ensuring the quality and accuracy +of image datasets used in computer vision tasks. The majority of OD algorithms, +however, have not been targeted toward image data. Consequently, the results of +applying such algorithms to images are often suboptimal. In this work, we +propose RANSAC-NN, a novel unsupervised OD algorithm specifically designed for +images. By comparing images in a RANSAC-based approach, our algorithm +automatically predicts the outlier score of each image without additional +training or label information. We evaluate RANSAC-NN against state-of-the-art +OD algorithms on 15 diverse datasets. Without any hyperparameter tuning, +RANSAC-NN consistently performs favorably compared with other algorithms in +almost every dataset category. Furthermore, we provide a detailed analysis to +understand each RANSAC-NN component, and we demonstrate its potential +applications in mislabeled image detection. Code for RANSAC-NN is provided at +https://github.com/mxtsai/ransac-nn + +
+
+ comment: 19 pages, 18 figures +
+
+
+
+
+ + ☆ Comparative analysis using classification methods versus early stage + diabetes + + +
+ In this research work, a comparative analysis was carried out using +classification methods, namely Discriminant Analysis and Logistic Regression, +to predict whether a person may have early stage diabetes. For this purpose, a +2020 database from the UC IRVINE platform was used, in which specific variables +that influence diabetes were selected for better results. In terms of +methodology, the corresponding analysis was performed for each of the +classification methods, and the results were then gathered in a comparative +table and analyzed. Finally, we note that most prior studies applying +classification methods to diseases show a clear preference for the logistic +regression classifier; in our results, on the other hand, we observed +significant differences between the classification methods that were applied, +which provided valuable information for drawing the final conclusions. + +
+
+
+
+
+ + ☆ ResWCAE: Biometric Pattern Image Denoising Using Residual + Wavelet-Conditioned Autoencoder + + +
+ The utilization of biometric authentication with pattern images is increasingly popular in compact Internet of Things (IoT) devices. However, the reliability of such systems can be compromised by image quality issues, particularly in the presence of high levels of noise. While state-of-the-art deep learning algorithms designed for generic image denoising have shown promise, their large number of parameters and lack of optimization for unique biometric pattern retrieval make them unsuitable for these devices and scenarios. In response to these challenges, this paper proposes a lightweight and robust deep learning architecture, the Residual Wavelet-Conditioned Convolutional Autoencoder (Res-WCAE) with a Kullback-Leibler divergence (KLD) regularization, designed specifically for fingerprint image denoising. Res-WCAE comprises two encoders - an image encoder and a wavelet encoder - and one decoder. Residual connections between the image encoder and decoder are leveraged to preserve fine-grained spatial features, and the bottleneck layer is conditioned on the compressed representation of features obtained from the wavelet encoder using approximation and detail subimages in the wavelet-transform domain. The effectiveness of Res-WCAE is evaluated against several state-of-the-art denoising methods, and the experimental results demonstrate that Res-WCAE outperforms these methods, particularly for heavily degraded fingerprint images in the presence of high levels of noise. Overall, Res-WCAE shows promise as a solution to the challenges faced by biometric authentication systems in compact IoT devices. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Drug Repurposing Targeting COVID-19 3CL Protease using Molecular Docking + and Machine Learning Regression Approach + + +
+ The COVID-19 pandemic has created a global health crisis, with an urgent need for effective treatments. Drug repurposing has emerged as a promising solution, as it can save time, cost, and labor. However, the number of identified repurposed drugs for COVID-19 treatment remains limited, and there is a need for more efficient and comprehensive drug repurposing approaches. In this study, we aimed to identify potential therapeutic candidates for COVID-19 treatment through drug repurposing using a combination of molecular docking and machine learning regression approaches. We utilized the Zinc database to screen 5903 world-approved drugs for their potential to target the main protease 3CL of SARS-CoV-2, which is a key enzyme in the replication of the virus. We performed molecular docking to evaluate the binding affinity of the drugs to the main protease 3CL, and used several machine learning regression approaches for QSAR modeling to identify drugs with high binding affinity. Our results showed that the Decision Tree Regression (DTR) model had the best statistical measures of R2 and RMSE, and we shortlisted six promising drugs within the range of -15 kcal/mol to -13 kcal/mol. These drugs have novel repurposing potential, except for one antiviral ZINC203757351 compound that has already been identified in other studies. We further analyzed the physicochemical and pharmacokinetic properties of these top-ranked selected drugs and their best binding interactions for the specific target protease 3CLpro. Our study provides an efficient framework for drug repurposing against COVID-19, and demonstrates the potential of combining molecular docking with machine learning regression approaches to accelerate the identification of potential therapeutic candidates. Our findings contribute to the larger goal of finding effective treatments for COVID-19, which is a critical global health challenge. + +
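+
+ The QSAR regression step can be sketched roughly as below: molecular descriptors are used to predict the docking-derived binding affinity with a Decision Tree Regressor, and candidates in the reported affinity window are shortlisted. The data file, descriptor columns, and hyperparameters are placeholders for illustration, not the authors' pipeline.
+
+ ```python
+ # Rough sketch: QSAR-style regression of docking scores (kcal/mol) from descriptors.
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.metrics import r2_score, mean_squared_error
+
+ df = pd.read_csv("docking_results.csv")     # assumed: descriptor columns + docking_score
+ X = df.drop(columns=["zinc_id", "docking_score"])
+ y = df["docking_score"]
+ X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ dtr = DecisionTreeRegressor(max_depth=8, random_state=42).fit(X_tr, y_tr)
+ pred = dtr.predict(X_te)
+ print("R2:", r2_score(y_te, pred), "RMSE:", mean_squared_error(y_te, pred) ** 0.5)
+
+ # Shortlist candidates in the reported affinity window of -15 to -13 kcal/mol.
+ hits = df[(df["docking_score"] >= -15.0) & (df["docking_score"] <= -13.0)]
+ ```
+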
+
+ comment: 27 Pages +
+
+
+
+
+ + ♻ ☆ Improving Generalizability of Graph Anomaly Detection Models via Data + Augmentation + + +
+ Graph anomaly detection (GAD) is a vital task since even a few anomalies can pose huge threats to benign users. Recent semi-supervised GAD methods, which can effectively leverage the available labels as prior knowledge, have achieved superior performance to unsupervised methods. In practice, people usually need to identify anomalies on new (sub)graphs to secure their business, but they may lack labels to train an effective detection model. One natural idea is to directly adopt a trained GAD model to the new (sub)graph for testing. However, we find that existing semi-supervised GAD methods suffer from a poor generalization issue, i.e., well-trained models cannot perform well on an unseen area (i.e., not accessible in training) of the same graph. This can cause serious problems. In this paper, motivated by this phenomenon, we propose a general and novel research problem of generalized graph anomaly detection that aims to effectively identify anomalies on both the training-domain graph and unseen testing graphs to eliminate potential dangers. Nevertheless, it is a challenging task since only limited labels are available, and the normal background may differ between training and testing data. Accordingly, we propose a data augmentation method named \textit{AugAN} (\uline{Aug}mentation for \uline{A}nomaly and \uline{N}ormal distributions) to enrich training data and boost the generalizability of GAD models. Experiments verify the effectiveness of our method in improving model generalizability. + +
+
+ comment: The updated version is accepted by TKDE 2023. Please refer to + arXiv:2306.10534v1 +
+
+
+
+
+ + ♻ ☆ TriMLP: Revenge of a MLP-like Architecture in Sequential Recommendation + + +
+ Sequential recommendation models sequences of historical user-item interaction behaviors (also referred to as tokens) to better infer dynamic preferences. Fueled by improved neural network architectures such as RNN, CNN and Transformer, this field has enjoyed a rapid performance boost in the past years. Recent progress on all-MLP models points to an efficient, less computationally intensive method, the token-mixing MLP, for learning the transformation patterns among historical behaviors. However, due to the inherent fully-connected design that allows unrestricted cross-token communication and ignores the chronological order, we find that directly applying token-mixing MLPs to sequential recommendation leads to subpar performance. In this paper, we present a purely MLP-based sequential recommendation architecture, TriMLP, with a novel \underline{Tri}angular Mixer in which the modified \underline{MLP} endows tokens with ordered interactions. As the cross-token interaction in an MLP is actually a matrix multiplication, the Triangular Mixer drops the lower-triangle neurons in the weight matrix and thus blocks the connections from future tokens, which prevents information leakage and improves prediction capability under the standard auto-regressive training fashion. To further model long- and short-term preferences at a fine-grained level, the mixer adopts a dual-branch structure based on the delicate MLP described above, namely global and local mixing, to separately capture sequential long-range dependencies and local patterns. An empirical study on 9 datasets of various scales (containing 50K\textasciitilde20M behaviors) from benchmarks including MovieLens, Amazon and Tenrec demonstrates that TriMLP attains a promising and stable accuracy/efficiency trade-off, on average surpassing several state-of-the-art baselines by 5.32\% while saving 8.44\% of inference time. + +
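+
+ The causal (triangular) token-mixing idea can be sketched as a masked linear layer over the sequence dimension, as below, where weights reaching back from future positions are zeroed so each position only mixes earlier tokens. The mask orientation and initialization here are a schematic reading of the abstract, not the released TriMLP code.
+
+ ```python
+ # Schematic causal token-mixing MLP: weights from future tokens are masked out.
+ import torch
+ import torch.nn as nn
+
+ class TriangularTokenMixer(nn.Module):
+     def __init__(self, seq_len: int):
+         super().__init__()
+         self.weight = nn.Parameter(torch.randn(seq_len, seq_len) * 0.02)
+         # Output position i may only mix tokens j <= i (causal mixing).
+         self.register_buffer("mask", torch.tril(torch.ones(seq_len, seq_len)))
+
+     def forward(self, x):               # x: (batch, seq_len, dim)
+         w = self.weight * self.mask     # connections from future tokens are blocked
+         return torch.einsum("ij,bjd->bid", w, x)
+
+ mixer = TriangularTokenMixer(seq_len=50)
+ out = mixer(torch.randn(8, 50, 64))     # -> (8, 50, 64)
+ ```
+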
+
+ comment: 15 pages, 9 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ TF-GNN: Graph Neural Networks in TensorFlow + + +
+ TensorFlow-GNN (TF-GNN) is a scalable library for Graph Neural Networks in +TensorFlow. It is designed from the bottom up to support the kinds of rich +heterogeneous graph data that occurs in today's information ecosystems. In +addition to enabling machine learning researchers and advanced developers, +TF-GNN offers low-code solutions to empower the broader developer community in +graph learning. Many production models at Google use TF-GNN, and it has been +recently released as an open source project. In this paper we describe the +TF-GNN data model, its Keras message passing API, and relevant capabilities +such as graph sampling and distributed training. + +
+
+
+
+
+ + ♻ ☆ From Graph Generation to Graph Classification + + +
+ This note describes a new approach to classifying graphs that leverages graph +generative models (GGM). Assuming a GGM that defines a joint probability +distribution over graphs and their class labels, I derive classification +formulas for the probability of a class label given a graph. A new conditional +ELBO can be used to train a generative graph auto-encoder model for +discrimination. While leveraging generative models for classification has been +well explored for non-relational i.i.d. data, to our knowledge it is a novel +approach to graph classification. + +
+
+ comment: I welcome suggestions, comments, and proposals for collaboration to + develop further the ideas in this paper. Please email oschulte@cs.sfu.ca. I + am grateful to Renjie Liao for helpful comments +
+
+
+
+
+ + ♻ ☆ When Personalization Harms: Reconsidering the Use of Group Attributes in + Prediction ICML 2023 + + +
+ Machine learning models are often personalized with categorical attributes +that are protected, sensitive, self-reported, or costly to acquire. In this +work, we show models that are personalized with group attributes can reduce +performance at a group level. We propose formal conditions to ensure the "fair +use" of group attributes in prediction tasks by training one additional model +-- i.e., collective preference guarantees to ensure that each group who +provides personal data will receive a tailored gain in performance in return. +We present sufficient conditions to ensure fair use in empirical risk +minimization and characterize failure modes that lead to fair use violations +due to standard practices in model development and deployment. We present a +comprehensive empirical study of fair use in clinical prediction tasks. Our +results demonstrate the prevalence of fair use violations in practice and +illustrate simple interventions to mitigate their harm. + +
+
+ comment: ICML 2023 Oral +
+
+
+
+
+ + ♻ ☆ Neural Active Learning on Heteroskedastic Distributions + + +
+ Models that can actively seek out the best quality training data hold the +promise of more accurate, adaptable, and efficient machine learning. Active +learning techniques often tend to prefer examples that are the most difficult +to classify. While this works well on homogeneous datasets, we find that it can +lead to catastrophic failures when performed on multiple distributions with +different degrees of label noise or heteroskedasticity. These active learning +algorithms strongly prefer to draw from the distribution with more noise, even +if their examples have no informative structure (such as solid color images +with random labels). To this end, we demonstrate the catastrophic failure of +these active learning algorithms on heteroskedastic distributions and propose a +fine-tuning-based approach to mitigate these failures. Further, we propose a +new algorithm that incorporates a model difference scoring function for each +data point to filter out the noisy examples and sample clean examples that +maximize accuracy, outperforming the existing active learning techniques on the +heteroskedastic datasets. We hope these observations and techniques are +immediately helpful to practitioners and can help to challenge common +assumptions in the design of active learning algorithms. + +
+
+
+
+
+ + ♻ ☆ Toward Efficient Gradient-Based Value Estimation + + +
+ Gradient-based methods for value estimation in reinforcement learning have +favorable stability properties, but they are typically much slower than +Temporal Difference (TD) learning methods. We study the root causes of this +slowness and show that Mean Square Bellman Error (MSBE) is an ill-conditioned +loss function in the sense that its Hessian has large condition-number. To +resolve the adverse effect of poor conditioning of MSBE on gradient based +methods, we propose a low complexity batch-free proximal method that +approximately follows the Gauss-Newton direction and is asymptotically robust +to parameterization. Our main algorithm, called RANS, is efficient in the sense +that it is significantly faster than the residual gradient methods while having +almost the same computational complexity, and is competitive with TD on the +classic problems that we tested. + +
+
+
+
+
+ + ♻ ☆ Log-linear Guardedness and its Implications ACL 2023 + + +
+ Methods for erasing human-interpretable concepts from neural representations +that assume linearity have been found to be tractable and useful. However, the +impact of this removal on the behavior of downstream classifiers trained on the +modified representations is not fully understood. In this work, we formally +define the notion of log-linear guardedness as the inability of an adversary to +predict the concept directly from the representation, and study its +implications. We show that, in the binary case, under certain assumptions, a +downstream log-linear model cannot recover the erased concept. However, we +demonstrate that a multiclass log-linear model \emph{can} be constructed that +indirectly recovers the concept in some cases, pointing to the inherent +limitations of log-linear guardedness as a downstream bias mitigation +technique. These findings shed light on the theoretical limitations of linear +erasure methods and highlight the need for further research on the connections +between intrinsic and extrinsic bias in neural models. + +
+
+ comment: Accepted as a long paper in ACL 2023 +
+
+
+
+
+ + ♻ ☆ ManimML: Communicating Machine Learning Architectures with Animation + + +
+ There has been an explosion in interest in machine learning (ML) in recent +years due to its applications to science and engineering. However, as ML +techniques have advanced, tools for explaining and visualizing novel ML +algorithms have lagged behind. Animation has been shown to be a powerful tool +for making engaging visualizations of systems that dynamically change over +time, which makes it well suited to the task of communicating ML algorithms. +However, the current approach to animating ML algorithms is to handcraft +applications that highlight specific algorithms or use complex generalized +animation software. We developed ManimML, an open-source Python library for +easily generating animations of ML algorithms directly from code. We sought to +leverage ML practitioners' preexisting knowledge of programming rather than +requiring them to learn complex animation software. ManimML has a familiar +syntax for specifying neural networks that mimics popular deep learning +frameworks like Pytorch. A user can take a preexisting neural network +architecture and easily write a specification for an animation in ManimML, +which will then automatically compose animations for different components of +the system into a final animation of the entire neural network. ManimML is open +source and available at https://github.com/helblazer811/ManimML. + +
+
+
+
+
+ + ♻ ☆ A Survey of Some Density Based Clustering Techniques + + +
+ Density-based clustering is a family of clustering methods used in data mining for extracting previously unknown patterns from data sets. There are a number of density-based clustering methods, such as DBSCAN, OPTICS, DENCLUE, VDBSCAN, DVBSCAN, DBCLASD and ST-DBSCAN. In this paper, these methods are studied along with their characteristics, advantages and disadvantages and, most importantly, their applicability to different types of data sets for mining useful and appropriate patterns. + +
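+
+ As a concrete illustration of the density-based family, a minimal DBSCAN run with scikit-learn is shown below; the eps and min_samples values are illustrative and must be tuned for each data set.
+
+ ```python
+ # Minimal DBSCAN example; points in sparse regions are labeled -1 (noise).
+ from sklearn.cluster import DBSCAN
+ from sklearn.datasets import make_moons
+
+ X, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
+ labels = DBSCAN(eps=0.2, min_samples=5).fit_predict(X)
+ print("clusters:", len(set(labels) - {-1}), "noise points:", (labels == -1).sum())
+ ```
+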
+
+ comment: 4 pages, 1 figure, conference paper +
+
+
+
+
+ + ♻ ☆ Deep Learning Models for Water Stage Predictions in South Florida + + +
+ Simulating and predicting water levels in river systems is essential for +flood warnings, hydraulic operations, and flood mitigations. In the engineering +field, tools such as HEC-RAS, MIKE, and SWMM are used to build detailed +physics-based hydrological and hydraulic computational models to simulate the +entire watershed, thereby predicting the water stage at any point in the +system. However, these physics-based models are computationally intensive, +especially for large watersheds and for longer simulations. To overcome this +problem, we train several deep learning (DL) models for use as surrogate models +to rapidly predict the water stage. The downstream stage of the Miami River in +South Florida is chosen as a case study for this paper. The dataset is from +January 1, 2010, to December 31, 2020, downloaded from the DBHYDRO database of +the South Florida Water Management District (SFWMD). Extensive experiments show +that the performance of the DL models is comparable to that of the +physics-based models, even during extreme precipitation conditions (i.e., +tropical storms). Furthermore, we study the decline in prediction accuracy of +the DL models with an increase in prediction lengths. In order to predict the +water stage in the future, our DL models use measured variables of the river +system from the recent past as well as covariates that can be reliably +predicted in the near future. In summary, the deep learning models achieve +comparable or better error rates with at least 1000x speedup in comparison to +the physics-based models. + +
+
+
+
+
+ + ♻ ☆ LAnoBERT: System Log Anomaly Detection based on BERT Masked Language + Model + + +
+ The system logs generated in a computer system are large-scale data collected concurrently and used as the basic data for detecting errors, intrusions and abnormal behaviors. The aim of system log anomaly detection is to promptly identify anomalies while minimizing human intervention, which is a critical problem in industry. Previous studies performed anomaly detection with algorithms applied after converting various forms of log data into a standardized template using a parser. In particular, a template corresponding to each specific event must be defined in advance for all the log data, in which case information within the log key may be lost. In this study, we propose LAnoBERT, a parser-free system log anomaly detection method that uses the BERT model, which exhibits excellent natural language processing performance. The proposed method, LAnoBERT, learns the model through masked language modeling, which is a BERT-based pre-training method, and performs unsupervised anomaly detection using the masked language modeling loss per log key during the test process. In addition, we propose an efficient inference process to establish a practically applicable pipeline for the actual system. Experiments on three well-known log datasets, i.e., HDFS, BGL, and Thunderbird, show that LAnoBERT not only yielded higher anomaly detection performance than unsupervised learning-based benchmark models, but also achieved comparable performance to supervised learning-based benchmark models. + +
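+
+ The test-time idea of scoring a log line by its masked-language-modeling loss can be sketched as below with Hugging Face Transformers. The backbone name, the per-token masking loop, and the absence of any log-key preprocessing are assumptions for illustration, not the paper's exact pipeline.
+
+ ```python
+ # Sketch: use a BERT masked-LM loss as an anomaly score for a log line.
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ tok = AutoTokenizer.from_pretrained("bert-base-uncased")    # assumed backbone
+ mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()
+
+ def anomaly_score(log_line: str) -> float:
+     enc = tok(log_line, return_tensors="pt")
+     ids = enc["input_ids"]
+     losses = []
+     # Mask each non-special token in turn and record the masked-LM loss at that position.
+     for i in range(1, ids.shape[1] - 1):
+         masked = ids.clone()
+         masked[0, i] = tok.mask_token_id
+         labels = torch.full_like(ids, -100)   # ignore every position except i
+         labels[0, i] = ids[0, i]
+         with torch.no_grad():
+             out = mlm(input_ids=masked, attention_mask=enc["attention_mask"], labels=labels)
+         losses.append(out.loss.item())
+     return sum(losses) / max(len(losses), 1)  # higher average loss -> more anomalous
+
+ print(anomaly_score("Received block blk_123 of size 67108864 from 10.0.0.1"))
+ ```
+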
+
+
+
+
+ + ♻ ☆ CLIPTER: Looking at the Bigger Picture in Scene Text Recognition ICCV 2023 + + +
+ Reading text in real-world scenarios often requires understanding the context +surrounding it, especially when dealing with poor-quality text. However, +current scene text recognizers are unaware of the bigger picture as they +operate on cropped text images. In this study, we harness the representative +capabilities of modern vision-language models, such as CLIP, to provide +scene-level information to the crop-based recognizer. We achieve this by fusing +a rich representation of the entire image, obtained from the vision-language +model, with the recognizer word-level features via a gated cross-attention +mechanism. This component gradually shifts to the context-enhanced +representation, allowing for stable fine-tuning of a pretrained recognizer. We +demonstrate the effectiveness of our model-agnostic framework, CLIPTER (CLIP +TExt Recognition), on leading text recognition architectures and achieve +state-of-the-art results across multiple benchmarks. Furthermore, our analysis +highlights improved robustness to out-of-vocabulary words and enhanced +generalization in low-data regimes. + +
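+
+ The gated cross-attention fusion described above can be sketched as follows: word-level recognizer features attend to a scene-level embedding from a vision-language model, and a learnable gate (initialized near zero) controls how much scene context is mixed in. The dimensions, the single scene token, and the tanh gate are assumptions for illustration, not the CLIPTER implementation.
+
+ ```python
+ # Schematic gated cross-attention between recognizer features and a scene embedding.
+ import torch
+ import torch.nn as nn
+
+ class GatedCrossAttention(nn.Module):
+     def __init__(self, dim: int, n_heads: int = 8):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
+         self.gate = nn.Parameter(torch.zeros(1))   # starts with ~no context mixed in
+
+     def forward(self, word_feats, scene_feats):
+         # word_feats: (B, T, D) crop-level features; scene_feats: (B, S, D) from the VLM.
+         ctx, _ = self.attn(word_feats, scene_feats, scene_feats)
+         return word_feats + torch.tanh(self.gate) * ctx   # gradual shift to context-enhanced features
+
+ fuse = GatedCrossAttention(dim=512)
+ fused = fuse(torch.randn(4, 25, 512), torch.randn(4, 1, 512))
+ ```
+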
+
+ comment: Accepted for publication by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SentimentGPT: Exploiting GPT for Advanced Sentiment Analysis and its + Departure from Current Machine Learning + + +
+ This study presents a thorough examination of various Generative Pretrained +Transformer (GPT) methodologies in sentiment analysis, specifically in the +context of Task 4 on the SemEval 2017 dataset. Three primary strategies are +employed: 1) prompt engineering using the advanced GPT-3.5 Turbo, 2) +fine-tuning GPT models, and 3) an inventive approach to embedding +classification. The research yields detailed comparative insights among these +strategies and individual GPT models, revealing their unique strengths and +potential limitations. Additionally, the study compares these GPT-based +methodologies with other current, high-performing models previously used with +the same dataset. The results illustrate the significant superiority of the GPT +approaches in terms of predictive performance, more than 22\% in F1-score +compared to the state-of-the-art. Further, the paper sheds light on common +challenges in sentiment analysis tasks, such as understanding context and +detecting sarcasm. It underscores the enhanced capabilities of the GPT models +to effectively handle these complexities. Taken together, these findings +highlight the promising potential of GPT models in sentiment analysis, setting +the stage for future research in this field. The code can be found at +https://github.com/DSAatUSU/SentimentGPT + +
+
+
+
+
+ + ♻ ☆ Frouros: A Python library for drift detection in machine learning + systems + + +
+ Frouros is an open-source Python library capable of detecting drift in +machine learning systems. It provides a combination of classical and more +recent algorithms for drift detection: both concept and data drift. We have +designed it with the objective of making it compatible with any machine +learning framework and easily adaptable to real-world use cases. The library is +developed following a set of best development and continuous integration +practices to ensure ease of maintenance and extensibility. The source code is +available at https://github.com/IFCA/frouros. + +
+
+ comment: 11 pages, 1 table +
+
+
+
+
+ + ♻ ☆ Estimate-Then-Optimize versus Integrated-Estimation-Optimization versus + Sample Average Approximation: A Stochastic Dominance Perspective + + +
+ In data-driven stochastic optimization, model parameters of the underlying +distribution need to be estimated from data in addition to the optimization +task. Recent literature considers integrating the estimation and optimization +processes by selecting model parameters that lead to the best empirical +objective performance. This integrated approach, which we call +integrated-estimation-optimization (IEO), can be readily shown to outperform +simple estimate-then-optimize (ETO) when the model is misspecified. In this +paper, we show that a reverse behavior appears when the model class is +well-specified and there is sufficient data. Specifically, for a general class +of nonlinear stochastic optimization problems, we show that simple ETO +outperforms IEO asymptotically when the model class covers the ground truth, in +the strong sense of stochastic dominance of the regret. Namely, the entire +distribution of the regret, not only its mean or other moments, is always +better for ETO compared to IEO. Our results also apply to constrained, +contextual optimization problems where the decision depends on observed +features. Whenever applicable, we also demonstrate how standard sample average +approximation (SAA) performs the worst when the model class is well-specified +in terms of regret, and best when it is misspecified. Finally, we provide +experimental results to support our theoretical comparisons and illustrate when +our insights hold in finite-sample regimes and under various degrees of +misspecification. + +
+
+
+
+
+ + ♻ ☆ MGR: Multi-generator Based Rationalization ACL 2023 + + +
+ Rationalization is to employ a generator and a predictor to construct a +self-explaining NLP model in which the generator selects a subset of +human-intelligible pieces of the input text to the following predictor. +However, rationalization suffers from two key challenges, i.e., spurious +correlation and degeneration, where the predictor overfits the spurious or +meaningless pieces solely selected by the not-yet well-trained generator and in +turn deteriorates the generator. Although many studies have been proposed to +address the two challenges, they are usually designed separately and do not +take both of them into account. In this paper, we propose a simple yet +effective method named MGR to simultaneously solve the two problems. The key +idea of MGR is to employ multiple generators such that the occurrence stability +of real pieces is improved and more meaningful pieces are delivered to the +predictor. Empirically, we show that MGR improves the F1 score by up to 20.9% +as compared to state-of-the-art methods. Codes are available at +https://github.com/jugechengzi/Rationalization-MGR . + +
+
+ comment: ACL 2023, oral presentation. Fixed some typos and clarified some + implementation details. arXiv admin note: text overlap with arXiv:2209.08285 +
+
+
+
+
+ + ♻ ☆ NetGPT: A Native-AI Network Architecture Beyond Provisioning + Personalized Generative Services + + +
+ Large language models (LLMs) have achieved tremendous success in empowering daily life with generated information, and the personalization of LLMs could further contribute to their applications due to better alignment with human intents. Towards personalized generative services, a collaborative cloud-edge methodology is promising, as it facilitates the effective orchestration of heterogeneous distributed communication and computing resources. In this article, after discussing the pros and cons of several candidate cloud-edge collaboration techniques, we put forward NetGPT to capably deploy appropriate LLMs at the edge and the cloud in accordance with their computing capacity. In addition, edge LLMs could efficiently leverage location-based information for personalized prompt completion, thus benefiting the interaction with cloud LLMs. After deploying representative open-source LLMs (e.g., GPT-2-base and the LLaMA model) at the edge and the cloud, we present the feasibility of NetGPT on the basis of low-rank adaptation-based light-weight fine-tuning. Subsequently, we highlight the substantial essential changes required for a native artificial intelligence (AI) network architecture towards NetGPT, with special emphasis on deeper integration of communication and computing resources and careful calibration of the logical AI workflow. Furthermore, we demonstrate several by-product benefits of NetGPT, given the edge LLM's astonishing capability to predict trends and infer intents, which possibly leads to a unified solution for intelligent network management \& orchestration. In a nutshell, we argue that NetGPT is a promising native-AI network architecture beyond provisioning personalized generative services. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ LaunchpadGPT: Language Model as Music Visualization Designer on + Launchpad + + +
+ Launchpad is a musical instrument that allows users to create and perform +music by pressing illuminated buttons. To assist and inspire the design of the +Launchpad light effect, and provide a more accessible approach for beginners to +create music visualization with this instrument, we proposed the LaunchpadGPT +model to generate music visualization designs on Launchpad automatically. Based +on the language model with excellent generation ability, our proposed +LaunchpadGPT takes an audio piece of music as input and outputs the lighting +effects of Launchpad-playing in the form of a video (Launchpad-playing video). +We collect Launchpad-playing videos and process them to obtain music and +corresponding video frame of Launchpad-playing as prompt-completion pairs, to +train the language model. The experiment result shows the proposed method can +create better music visualization than random generation methods and hold the +potential for a broader range of music visualization applications. Our code is +available at https://github.com/yunlong10/LaunchpadGPT/. + +
+
+ comment: Accepted by International Computer Music Conference (ICMC) 2023 +
+
+
+
+
+ + ♻ ☆ CLIPSonic: Text-to-Audio Synthesis with Unlabeled Videos and Pretrained + Language-Vision Models SP + + +
+ Recent work has studied text-to-audio synthesis using large amounts of paired +text-audio data. However, audio recordings with high-quality text annotations +can be difficult to acquire. In this work, we approach text-to-audio synthesis +using unlabeled videos and pretrained language-vision models. We propose to +learn the desired text-audio correspondence by leveraging the visual modality +as a bridge. We train a conditional diffusion model to generate the audio track +of a video, given a video frame encoded by a pretrained contrastive +language-image pretraining (CLIP) model. At test time, we first explore +performing a zero-shot modality transfer and condition the diffusion model with +a CLIP-encoded text query. However, we observe a noticeable performance drop +with respect to image queries. To close this gap, we further adopt a pretrained +diffusion prior model to generate a CLIP image embedding given a CLIP text +embedding. Our results show the effectiveness of the proposed method, and that +the pretrained diffusion prior can reduce the modality transfer gap. While we +focus on text-to-audio synthesis, the proposed model can also generate audio +from image queries, and it shows competitive performance against a +state-of-the-art image-to-audio synthesis model in a subjective listening test. +This study offers a new direction of approaching text-to-audio synthesis that +leverages the naturally-occurring audio-visual correspondence in videos and the +power of pretrained language-vision models. + +
+
+ comment: Accepted by WASPAA 2023. Demo: + https://salu133445.github.io/clipsonic/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 18 + +
+
+
+ + ☆ The Imitation Game: Detecting Human and AI-Generated Texts in the Era of + Large Language Models + + +
+ The potential of artificial intelligence (AI)-based large language models +(LLMs) holds considerable promise in revolutionizing education, research, and +practice. However, distinguishing between human-written and AI-generated text +has become a significant task. This paper presents a comparative study, +introducing a novel dataset of human-written and LLM-generated texts in +different genres: essays, stories, poetry, and Python code. We employ several +machine learning models to classify the texts. Results demonstrate the efficacy +of these models in discerning between human and AI-generated text, despite the +dataset's limited sample size. However, the task becomes more challenging when +classifying GPT-generated text, particularly in story writing. The results +indicate that the models exhibit superior performance in binary classification +tasks, such as distinguishing human-generated text from a specific LLM, +compared to the more complex multiclass tasks that involve discerning among +human-generated and multiple LLMs. Our findings provide insightful implications +for AI text detection while our dataset paves the way for future research in +this evolving area. + +
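+
+ A minimal baseline for this kind of binary human-vs-LLM text classification is a TF-IDF pipeline, sketched below; the data file, column names, and choice of classifier are illustrative assumptions, not the models evaluated in the paper.
+
+ ```python
+ # Illustrative human-vs-AI text classifier baseline (not the paper's exact setup).
+ import pandas as pd
+ from sklearn.pipeline import make_pipeline
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import cross_val_score
+
+ df = pd.read_csv("texts.csv")                    # assumed columns: text, source
+ y = (df["source"] == "human").astype(int)        # 1 = human-written, 0 = LLM-generated
+ clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), min_df=2),
+                     LogisticRegression(max_iter=1000))
+ print(cross_val_score(clf, df["text"], y, cv=5, scoring="f1").mean())
+ ```
+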
+
+
+
+
+ + ☆ Identifying Misinformation on YouTube through Transcript Contextual + Analysis with Transformer Models + + +
+ Misinformation on YouTube is a significant concern, necessitating robust +detection strategies. In this paper, we introduce a novel methodology for video +classification, focusing on the veracity of the content. We convert the +conventional video classification task into a text classification task by +leveraging the textual content derived from the video transcripts. We employ +advanced machine learning techniques like transfer learning to solve the +classification challenge. Our approach incorporates two forms of transfer +learning: (a) fine-tuning base transformer models such as BERT, RoBERTa, and +ELECTRA, and (b) few-shot learning using sentence-transformers MPNet and +RoBERTa-large. We apply the trained models to three datasets: (a) YouTube +Vaccine-misinformation related videos, (b) YouTube Pseudoscience videos, and +(c) Fake-News dataset (a collection of articles). Including the Fake-News +dataset extended the evaluation of our approach beyond YouTube videos. Using +these datasets, we evaluated the models distinguishing valid information from +misinformation. The fine-tuned models yielded Matthews Correlation +Coefficient>0.81, accuracy>0.90, and F1 score>0.90 in two of three datasets. +Interestingly, the few-shot models outperformed the fine-tuned ones by 20% in +both Accuracy and F1 score for the YouTube Pseudoscience dataset, highlighting +the potential utility of this approach -- especially in the context of limited +training data. + +
+
+
+
+
+ + ☆ Modality Confidence Aware Training for Robust End-to-End Spoken Language + Understanding INTERSPEECH 2023 + + +
+ End-to-end (E2E) spoken language understanding (SLU) systems that generate a +semantic parse from speech have become more promising recently. This approach +uses a single model that utilizes audio and text representations from +pre-trained speech recognition models (ASR), and outperforms traditional +pipeline SLU systems in on-device streaming scenarios. However, E2E SLU systems +still show weakness when text representation quality is low due to ASR +transcription errors. To overcome this issue, we propose a novel E2E SLU system +that enhances robustness to ASR errors by fusing audio and text representations +based on the estimated modality confidence of ASR hypotheses. We introduce two +novel techniques: 1) an effective method to encode the quality of ASR +hypotheses and 2) an effective approach to integrate them into E2E SLU models. +We show accuracy improvements on STOP dataset and share the analysis to +demonstrate the effectiveness of our approach. + +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Explainable Topic-Enhanced Argument Mining from Heterogeneous Sources + + +
+ Given a controversial target such as ``nuclear energy'', argument mining aims +to identify the argumentative text from heterogeneous sources. Current +approaches focus on exploring better ways of integrating the target-associated +semantic information with the argumentative text. Despite their empirical +successes, two issues remain unsolved: (i) a target is represented by a word or +a phrase, which is insufficient to cover a diverse set of target-related +subtopics; (ii) the sentence-level topic information within an argument, which +we believe is crucial for argument mining, is ignored. To tackle the above +issues, we propose a novel explainable topic-enhanced argument mining approach. +Specifically, with the use of the neural topic model and the language model, +the target information is augmented by explainable topic representations. +Moreover, the sentence-level topic information within the argument is captured +by minimizing the distance between its latent topic distribution and its +semantic representation through mutual learning. Experiments have been +conducted on the benchmark dataset in both the in-target setting and the +cross-target setting. Results demonstrate the superiority of the proposed model +against the state-of-the-art baselines. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ A Zero-shot and Few-shot Study of Instruction-Finetuned Large Language + Models Applied to Clinical and Biomedical Tasks + + +
+ We evaluate four state-of-the-art instruction-tuned large language models +(LLMs) -- ChatGPT, Flan-T5 UL2, Tk-Instruct, and Alpaca -- on a set of 13 +real-world clinical and biomedical natural language processing (NLP) tasks in +English, such as named-entity recognition (NER), question-answering (QA), +relation extraction (RE), etc. Our overall results demonstrate that the +evaluated LLMs begin to approach performance of state-of-the-art models in +zero- and few-shot scenarios for most tasks, and particularly well for the QA +task, even though they have never seen examples from these tasks before. +However, we observed that the classification and RE tasks perform below what +can be achieved with a specifically trained model for the medical field, such +as PubMedBERT. Finally, we noted that no LLM outperforms all the others on all +the studied tasks, with some models being better suited for certain tasks than +others. + +
+
+ comment: Under review process +
+
+
+
+
+ + ☆ Revisiting Distillation for Continual Learning on Visual Question + Localized-Answering in Robotic Surgery MICCAI 2023 + + +
+ The visual-question localized-answering (VQLA) system can serve as a +knowledgeable assistant in surgical education. Except for providing text-based +answers, the VQLA system can highlight the interested region for better +surgical scene understanding. However, deep neural networks (DNNs) suffer from +catastrophic forgetting when learning new knowledge. Specifically, when DNNs +learn on incremental classes or tasks, their performance on old tasks drops +dramatically. Furthermore, due to medical data privacy and licensing issues, it +is often difficult to access old data when updating continual learning (CL) +models. Therefore, we develop a non-exemplar continual surgical VQLA framework, +to explore and balance the rigidity-plasticity trade-off of DNNs in a +sequential learning paradigm. We revisit the distillation loss in CL tasks, and +propose rigidity-plasticity-aware distillation (RP-Dist) and self-calibrated +heterogeneous distillation (SH-Dist) to preserve the old knowledge. The weight +aligning (WA) technique is also integrated to adjust the weight bias between +old and new tasks. We further establish a CL framework on three public surgical +datasets in the context of surgical settings that consist of overlapping +classes between old and new surgical VQLA tasks. With extensive experiments, we +demonstrate that our proposed method excellently reconciles learning and +forgetting on the continual surgical VQLA over conventional CL methods. Our +code is publicly accessible. + +
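+
+ A generic response-distillation penalty of the kind revisited here can be sketched as a KL term between the frozen old model's and the new model's output distributions, as below. The temperature and weighting are common defaults, and the paper's RP-Dist and SH-Dist variants add structure that is not shown.
+
+ ```python
+ # Generic response-distillation loss between an old (frozen) and a new model.
+ import torch
+ import torch.nn.functional as F
+
+ def distillation_loss(new_logits, old_logits, T: float = 2.0):
+     # KL(old || new) on temperature-softened distributions, scaled by T^2.
+     p_old = F.softmax(old_logits / T, dim=-1)
+     log_p_new = F.log_softmax(new_logits / T, dim=-1)
+     return F.kl_div(log_p_new, p_old, reduction="batchmean") * (T * T)
+
+ # Typical usage inside a training step (old model frozen, new model being trained):
+ # loss = ce_loss(new_logits, labels) + lambda_kd * distillation_loss(new_logits, old_logits)
+ ```
+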
+
+ comment: To appear in MICCAI 2023. Code availability: + https://github.com/longbai1006/CS-VQLA +
+
+
+
+
+ + ☆ Psy-LLM: Scaling up Global Mental Health Psychological Services with + AI-based Large Language Models + + +
+ The demand for psychological counseling has grown significantly in recent +years, particularly with the global outbreak of COVID-19, which has heightened +the need for timely and professional mental health support. Online +psychological counseling has emerged as the predominant mode of providing +services in response to this demand. In this study, we propose the Psy-LLM +framework, an AI-based system leveraging Large Language Models (LLMs) for +question-answering in online psychological consultation. Our framework combines +pre-trained LLMs with real-world professional Q&A from psychologists and +extensively crawled psychological articles. The Psy-LLM framework serves as a +front-end tool for healthcare professionals, allowing them to provide immediate +responses and mindfulness activities to alleviate patient stress. Additionally, +it functions as a screening tool to identify urgent cases requiring further +assistance. We evaluated the framework using intrinsic metrics, such as +perplexity, and extrinsic evaluation metrics, with human participant +assessments of response helpfulness, fluency, relevance, and logic. The results +demonstrate the effectiveness of the Psy-LLM framework in generating coherent +and relevant answers to psychological questions. This article concludes by +discussing the potential of large language models to enhance mental health +support through AI technologies in online psychological consultation. + +
+
+
+
+
+ + ☆ Learning Vision-and-Language Navigation from YouTube Videos ICCV 2023 + + +
+ Vision-and-language navigation (VLN) requires an embodied agent to navigate +in realistic 3D environments using natural language instructions. Existing VLN +methods suffer from training on small-scale environments or unreasonable +path-instruction datasets, limiting the generalization to unseen environments. +There are massive house tour videos on YouTube, providing abundant real +navigation experiences and layout information. However, these videos have not +been explored for VLN before. In this paper, we propose to learn an agent from +these videos by creating a large-scale dataset which comprises reasonable +path-instruction pairs from house tour videos and pre-training the agent on it. +To achieve this, we have to tackle the challenges of automatically constructing +path-instruction pairs and exploiting real layout knowledge from raw and +unlabeled videos. To address these, we first leverage an entropy-based method +to construct the nodes of a path trajectory. Then, we propose an action-aware +generator for generating instructions from unlabeled trajectories. Last, we +devise a trajectory judgment pretext task to encourage the agent to mine the +layout knowledge. Experimental results show that our method achieves +state-of-the-art performance on two popular benchmarks (R2R and REVERIE). Code +is available at https://github.com/JeremyLinky/YouTube-VLN + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Practical and Ethical Challenges of Large Language Models in Education: + A Systematic Scoping Review + + +
+ Educational technology innovations leveraging large language models (LLMs) +have shown the potential to automate the laborious process of generating and +analysing textual content. While various innovations have been developed to +automate a range of educational tasks (e.g., question generation, feedback +provision, and essay grading), there are concerns regarding the practicality +and ethicality of these innovations. Such concerns may hinder future research +and the adoption of LLMs-based innovations in authentic educational contexts. +To address this, we conducted a systematic scoping review of 118 peer-reviewed +papers published since 2017 to pinpoint the current state of research on using +LLMs to automate and support educational tasks. The findings revealed 53 use +cases for LLMs in automating education tasks, categorised into nine main +categories: profiling/labelling, detection, grading, teaching support, +prediction, knowledge representation, feedback, content generation, and +recommendation. Additionally, we also identified several practical and ethical +challenges, including low technological readiness, lack of replicability and +transparency, and insufficient privacy and beneficence considerations. The +findings were summarised into three recommendations for future studies, +including updating existing innovations with state-of-the-art models (e.g., +GPT-3/4), embracing the initiative of open-sourcing models/systems, and +adopting a human-centred approach throughout the developmental process. As the +intersection of AI and education is continuously evolving, the findings of this +study can serve as an essential reference point for researchers, allowing them +to leverage the strengths, learn from the limitations, and uncover potential +research opportunities enabled by ChatGPT and other generative AI models. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Fertility Proposals Using Multi-Grained Topic + Analysis Methods + + +
+ Fertility issues are closely related to population security. With China's population showing negative growth for the first time in 60 years, changes to fertility policy are of great concern to the public. A proposal at the 2023 "Two Sessions" suggested that the country, in the form of legislation, remove the marriage restriction from birth registration. The topic quickly became a trending subject online, and "unbundling" the relationship between birth registration and marriage has become the focus of social debate. In this paper, we adopt co-occurrence semantic analysis, topic analysis and sentiment analysis to conduct a multi-granularity semantic analysis of microblog comments. We find that the discussion of the proposal to remove marriage restrictions from birth registration involves three dimensions, the individual, society and the state, and extends to social issues such as personal behaviour, social ethics and law, and national policy, with people's sentiment leaning negative on most of the topics. Based on this, eight proposals are made to provide a reference for governmental decision making and to form a reference method for researching public opinion on political issues. + +
+
+ comment: 7 pages, 4 figures, 1 table +
+
+
+
+
+ + ♻ ☆ A Survey of Knowledge Graph Reasoning on Graph Types: Static, Dynamic, + and Multimodal + + +
+ Knowledge graph reasoning (KGR), aiming to deduce new facts from existing +facts based on mined logic rules underlying knowledge graphs (KGs), has become +a fast-growing research direction. It has been proven to significantly benefit +the usage of KGs in many AI applications, such as question answering, +recommendation systems, and etc. According to the graph types, existing KGR +models can be roughly divided into three categories, i.e., static models, +temporal models, and multi-modal models. Early works in this domain mainly +focus on static KGR, and recent works try to leverage the temporal and +multi-modal information, which are more practical and closer to real-world. +However, no survey papers and open-source repositories comprehensively +summarize and discuss models in this important direction. To fill the gap, we +conduct a first survey for knowledge graph reasoning tracing from static to +temporal and then to multi-modal KGs. Concretely, the models are reviewed based +on bi-level taxonomy, i.e., top-level (graph types) and base-level (techniques +and scenarios). Besides, the performances, as well as datasets, are summarized +and presented. Moreover, we point out the challenges and potential +opportunities to enlighten the readers. The corresponding open-source +repository is shared on GitHub +https://github.com/LIANGKE23/Awesome-Knowledge-Graph-Reasoning. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Improved NL2SQL based on Multi-layer Expert Network + + +
+ The Natural Language to SQL (NL2SQL) technique is used to convert natural language queries into executable SQL statements. Typically, slot-filling is employed as a classification method for multi-task cases to achieve this goal. However, slot-filling can result in inaccurate SQL statement generation due to negative migration issues arising from different classification tasks. To overcome this limitation, this study introduces a new approach called Multi-Layer Expert Generate SQL (MLEG-SQL), which utilizes a dedicated multi-task hierarchical network. The lower layer of the network extracts semantic features of natural language statements, while the upper layer builds a specialized expert system for handling specific classification tasks. This hierarchical approach mitigates the performance degradation resulting from conflicts between different tasks. The proposed method was evaluated on the WikiSQL dataset and was found to be effective in generating accurate SQL statements. + +
+
+ comment: the paper's figure has something wrong +
+
+
+
+
+ + ♻ ☆ Integrating a Heterogeneous Graph with Entity-aware Self-attention using + Relative Position Labels for Reading Comprehension Model + + +
+ Despite the significant progress made by transformer models in machine +reading comprehension tasks, they still fall short in handling complex +reasoning tasks due to the absence of explicit knowledge in the input sequence. +To address this limitation, many recent works have proposed injecting external +knowledge into the model. However, selecting relevant external knowledge, +ensuring its availability, and requiring additional processing steps remain +challenging. In this paper, we introduce a novel attention pattern that +integrates reasoning knowledge derived from a heterogeneous graph into the +transformer architecture without relying on external knowledge. The proposed +attention pattern comprises three key elements: global-local attention for word +tokens, graph attention for entity tokens that exhibit strong attention towards +tokens connected in the graph as opposed to those unconnected, and the +consideration of the type of relationship between each entity token and word +token. This results in optimized attention between the two if a relationship +exists. The pattern is coupled with special relative position labels, allowing +it to integrate with LUKE's entity-aware self-attention mechanism. The +experimental findings corroborate that our model outperforms both the +cutting-edge LUKE-Graph and the baseline LUKE model on the ReCoRD dataset that +focuses on commonsense reasoning. + +
+
+ comment: submitted for Knowledge-Based Systems Journal +
+
+
+
+
+ + ♻ ☆ AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual + Similarity using Contrastive Learning and Structured Knowledge + + +
+ Generic sentence embeddings provide a coarse-grained approximation of +semantic textual similarity but ignore specific aspects that make texts +similar. Conversely, aspect-based sentence embeddings provide similarities +between texts based on certain predefined aspects. Thus, similarity predictions +of texts are more targeted to specific requirements and more easily +explainable. In this paper, we present AspectCSE, an approach for aspect-based +contrastive learning of sentence embeddings. Results indicate that AspectCSE +achieves an average improvement of 3.97% on information retrieval tasks across +multiple aspects compared to the previous best results. We also propose using +Wikidata knowledge graph properties to train models of multi-aspect sentence +embeddings in which multiple specific aspects are simultaneously considered +during similarity predictions. We demonstrate that multi-aspect embeddings +outperform single-aspect embeddings on aspect-specific information retrieval +tasks. Finally, we examine the aspect-based sentence embedding space and +demonstrate that embeddings of semantically similar aspect labels are often +close, even without explicit similarity training between different aspect +labels. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Efficient Domain Adaptation of Sentence Embeddings using Adapters + + +
+ Sentence embeddings enable us to capture the semantic similarity of short +texts. Most sentence embedding models are trained for general semantic textual +similarity (STS) tasks. Therefore, to use sentence embeddings in a particular +domain, the model must be adapted to it in order to achieve good results. +Usually, this is done by fine-tuning the entire sentence embedding model for +the domain of interest. While this approach yields state-of-the-art results, +all of the model's weights are updated during fine-tuning, making this method +resource-intensive. Therefore, instead of fine-tuning entire sentence embedding +models for each target domain individually, we propose to train lightweight +adapters. These domain-specific adapters do not require fine-tuning all +underlying sentence embedding model parameters. Instead, we only train a small +number of additional parameters while keeping the weights of the underlying +sentence embedding model fixed. Training domain-specific adapters allows always +using the same base model and only exchanging the domain-specific adapters to +adapt sentence embeddings to a specific domain. We show that using adapters for +parameter-efficient domain adaptation of sentence embeddings yields competitive +performance within 1% of a domain-adapted, entirely fine-tuned sentence +embedding model while only training approximately 3.6% of the parameters. + +
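+
+ The adapter idea can be sketched as a small bottleneck module with a residual connection that is inserted into an otherwise frozen sentence embedding model, as below; the bottleneck width, activation, and near-identity initialization are typical choices rather than the paper's exact configuration.
+
+ ```python
+ # Schematic bottleneck adapter: only these few parameters are trained for a new domain.
+ import torch
+ import torch.nn as nn
+
+ class Adapter(nn.Module):
+     def __init__(self, dim: int, bottleneck: int = 64):
+         super().__init__()
+         self.down = nn.Linear(dim, bottleneck)
+         self.up = nn.Linear(bottleneck, dim)
+         nn.init.zeros_(self.up.weight); nn.init.zeros_(self.up.bias)  # near-identity start
+
+     def forward(self, x):
+         return x + self.up(torch.relu(self.down(x)))  # residual keeps base behaviour intact
+
+ adapter = Adapter(dim=768)          # ~0.1M trainable params vs. the frozen base model
+ out = adapter(torch.randn(2, 16, 768))
+ ```
+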
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Generating Mathematical Derivations with Large Language Models + + +
+ The derivation of mathematical results in specialised fields using Large +Language Models (LLMs) is an emerging research direction that can help identify +models' limitations, and potentially support mathematical discovery. In this +paper, we leverage a symbolic engine to generate derivations of equations at +scale, and investigate the capabilities of LLMs when deriving goal equations +from premises. Specifically, we employ in-context learning for GPT and +fine-tune a range of T5 models to compare the robustness and generalisation of +pre-training strategies to specialised models. Empirical results show that +fine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and +out-of-distribution test sets in terms of absolute performance. However, an +in-depth analysis reveals that the fine-tuned models are more sensitive to +perturbations involving unseen symbols and (to a lesser extent) changes to +equation structure. In addition, we analyse 1.7K equations and over 200 +derivations to highlight common reasoning errors such as the inclusion of +incorrect, irrelevant, and redundant equations, along with the tendency to skip +derivation steps. Finally, we explore the suitability of existing metrics for +evaluating mathematical derivations finding evidence that, while they capture +general properties such as sensitivity to perturbations, they fail to highlight +fine-grained reasoning errors and essential differences between models. +Overall, this work demonstrates that training models on synthetic data can +improve their mathematical capabilities beyond larger architectures. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Detecting Harmful Agendas in News Articles ACL + + +
+ Manipulated news online is a growing problem which necessitates the use of +automated systems to curtail its spread. We argue that while misinformation and +disinformation detection have been studied, there has been a lack of investment +in the important open challenge of detecting harmful agendas in news articles; +identifying harmful agendas is critical to flag news campaigns with the +greatest potential for real world harm. Moreover, due to real concerns around +censorship, harmful agenda detectors must be interpretable to be effective. In +this work, we propose this new task and release a dataset, NewsAgendas, of +annotated news articles for agenda identification. We show how interpretable +systems can be effective on this task and demonstrate that they can perform +comparably to black-box models. + +
+
+ comment: Camera-ready for ACL-WASSA 2023 +
+
+
+
+
+ + ♻ ☆ Disco-Bench: A Discourse-Aware Evaluation Benchmark for Language + Modelling + + +
+ Modeling discourse -- the linguistic phenomena that go beyond individual sentences, is a fundamental yet challenging aspect of natural language processing (NLP). However, existing evaluation benchmarks primarily focus on the evaluation of intra-sentence properties and overlook critical discourse phenomena that cross sentences. To bridge the gap, we propose Disco-Bench, a benchmark that can evaluate cross-sentence discourse properties across a diverse set of NLP tasks, covering understanding, translation, and generation. Disco-Bench consists of 9 document-level testsets in the literature domain, which contain rich discourse phenomena (e.g. cohesion and coherence) in Chinese and/or English. For linguistic analysis, we also design a diagnostic test suite that can examine whether the target models learn discourse knowledge. In total, we evaluate 20 general-purpose, in-domain and commercial models based on Transformer, advanced pretraining architectures and large language models (LLMs). Our results show (1) the challenge and necessity of our evaluation benchmark; (2) fine-grained pretraining based on literary document-level training data consistently improves the modeling of discourse information. We will release the datasets, pretrained models, and leaderboard, which we hope can significantly facilitate research in this field: https://github.com/longyuewangdcu/Disco-Bench. + +
+
+ comment: Zhaopeng Tu is the corresponding author +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Conformal Group Recommender System + + +
+ Group recommender systems (GRS) are critical in discovering relevant items
+from a near-infinite inventory based on group preferences rather than
+individual preferences, such as recommending a movie, restaurant, or tourist
+destination to a group of individuals. Traditional models of group
+recommendation are designed to act like a black box with a strict focus on
+improving recommendation accuracy, and most often, they place the onus on the
+users to interpret recommendations. In recent years, the focus of Recommender
+Systems (RS) research has shifted away from merely improving recommendation
+accuracy towards value additions such as confidence and explanation. In this
+work, we propose a conformal prediction framework that provides a measure of
+confidence with each prediction, in conjunction with a group recommender
+system, to augment the system-generated plain recommendations. In the context
+of group recommender systems, we propose various nonconformity measures that
+play a vital role in the efficiency of the conformal framework. We also show
+that the defined nonconformity measures satisfy the exchangeability property.
+Experimental results demonstrate the effectiveness of the proposed approach
+over several benchmark datasets. Furthermore, the proposed approach also
+satisfies the validity and efficiency properties.
+
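+ A split-conformal wrapper around a group recommender's scores can be sketched
+as below; the nonconformity measure (negative predicted relevance) and the toy
+calibration data are placeholder assumptions, not the measures proposed above:
+
+```python
+# Illustrative split-conformal filter over a group recommender's item scores.
+import numpy as np
+
+def conformal_recommend(cal_scores, cal_relevant, test_scores, alpha=0.1):
+    """Keep test items whose conformal p-value exceeds alpha."""
+    # Nonconformity of relevant calibration items: higher score -> lower value.
+    cal_nc = -np.asarray(cal_scores)[np.asarray(cal_relevant, dtype=bool)]
+    test_nc = -np.asarray(test_scores)
+    # p-value: fraction of calibration nonconformities at least as large.
+    pvals = [(np.sum(cal_nc >= nc) + 1) / (len(cal_nc) + 1) for nc in test_nc]
+    return [i for i, p in enumerate(pvals) if p > alpha]
+
+# Toy usage with scores from an already-fitted group recommender.
+kept = conformal_recommend(cal_scores=[0.9, 0.8, 0.4, 0.7],
+                           cal_relevant=[1, 1, 0, 1],
+                           test_scores=[0.85, 0.3, 0.65], alpha=0.1)
+print(kept)
+```
+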
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ XWalk: Random Walk Based Candidate Retrieval for Product Search + + +
+ In e-commerce, head queries account for the vast majority of gross
+merchandise sales, and improvements to head queries are highly impactful to the
+business. While most supervised approaches to search already perform better on
+head queries than on tail queries, we propose a method that improves head query
+performance dramatically further. We propose XWalk, a random-walk-based graph
+approach to candidate retrieval for product search that borrows from
+recommendation-system techniques. XWalk is highly efficient for training and
+inference in a large-scale, high-traffic e-commerce setting, and shows
+substantial improvements in head query performance over state-of-the-art neural
+retrievers. Ensembling XWalk with a neural and/or lexical retriever combines
+the best of both worlds, and the resulting retrieval system outperforms all
+other methods in both offline relevance-based evaluation and online A/B tests.
+
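+ The random-walk retrieval idea can be sketched on a toy bipartite
+query-product click graph; the graph contents, node naming, and walk
+hyper-parameters below are illustrative assumptions, not XWalk's actual setup:
+
+```python
+# Sketch of random-walk candidate retrieval over query-product click data.
+import random
+from collections import Counter, defaultdict
+
+clicks = [("red shoes", "p1"), ("red shoes", "p2"), ("running shoes", "p2"),
+          ("running shoes", "p3"), ("red shoes", "p1")]
+
+graph = defaultdict(list)              # undirected bipartite adjacency lists
+for q, p in clicks:
+    graph[q].append(p)
+    graph[p].append(q)
+
+def walk_candidates(query, n_walks=200, walk_len=4, k=2, seed=0):
+    """Run short random walks from the query node; rank visited products."""
+    rng = random.Random(seed)
+    visits = Counter()
+    for _ in range(n_walks):
+        node = query
+        for _ in range(walk_len):
+            node = rng.choice(graph[node])
+            if node.startswith("p"):   # product nodes in this toy graph
+                visits[node] += 1
+    return [p for p, _ in visits.most_common(k)]
+
+print(walk_candidates("red shoes"))
+```
+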
+
+
+
+
+ + ☆ HTP: Exploiting Holistic Temporal Patterns for Sequential Recommendation + + +
+ Sequential recommender systems have demonstrated huge success for next-item
+recommendation by explicitly exploiting the temporal order of users' historical
+interactions. In practice, user interactions contain more useful temporal
+information beyond order, as shown by some pioneering studies. In this paper,
+we systematically investigate various temporal information for sequential
+recommendation and identify three types of advantageous temporal patterns
+beyond order, including absolute time information, relative item time
+intervals, and relative recommendation time intervals. We are the first to
+explore item-oriented absolute time patterns. While existing models consider
+only one or two of these three patterns, we propose a novel holistic temporal
+pattern based neural network, named HTP, to fully leverage all three patterns.
+In particular, we introduce novel components to address the subtle correlations
+between relative item time intervals and relative recommendation time
+intervals, which pose a major technical challenge. Extensive experiments on
+three real-world benchmark datasets show that our HTP model consistently and
+substantially outperforms many state-of-the-art models. Our code is publicly
+available at https://github.com/623851394/HTP/tree/main/HTP-main.
+
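+ The three temporal signals can be derived from an interaction history as in
+the following sketch; field names, units (hours), and the toy data are
+illustrative assumptions rather than HTP's feature pipeline:
+
+```python
+# Sketch: compute absolute times, item time intervals, and recommendation
+# time intervals from one user's interaction sequence.
+from datetime import datetime
+
+history = [("i1", datetime(2023, 7, 1, 9, 0)),
+           ("i2", datetime(2023, 7, 3, 18, 30)),
+           ("i3", datetime(2023, 7, 10, 8, 15))]
+recommend_time = datetime(2023, 7, 17, 12, 0)
+
+absolute_time = [ts.timestamp() for _, ts in history]            # pattern 1
+item_intervals = [0.0] + [                                        # pattern 2
+    (history[i][1] - history[i - 1][1]).total_seconds() / 3600
+    for i in range(1, len(history))]
+rec_intervals = [                                                 # pattern 3
+    (recommend_time - ts).total_seconds() / 3600 for _, ts in history]
+
+print(item_intervals, rec_intervals)
+```
+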
+
+
+
+
+ + ☆ Collaborative Graph Neural Networks for Attributed Network Embedding + + +
+ Graph neural networks (GNNs) have shown prominent performance on attributed
+network embedding. However, existing efforts mainly focus on exploiting network
+structures, while the exploitation of node attributes is rather limited as they
+only serve as node features at the initial layer. This simple strategy impedes
+the potential of node attributes in augmenting node connections, leading to a
+limited receptive field for inactive nodes with few or even no neighbors.
+Furthermore, the training objectives (i.e., reconstructing network structures)
+of most GNNs also do not include node attributes, although studies have shown
+that reconstructing node attributes is beneficial. Thus, it is desirable to
+deeply involve node attributes in the key components of GNNs, including graph
+convolution operations and training objectives. However, this is a nontrivial
+task since an appropriate way of integration is required to maintain the merits
+of GNNs. To bridge the gap, in this paper, we propose COllaborative graph
+Neural Networks--CONN, a tailored GNN architecture for attributed network
+embedding. It improves model capacity by 1) selectively diffusing messages from
+neighboring nodes and involved attribute categories, and 2) jointly
+reconstructing node-to-node and node-to-attribute-category interactions via
+cross-correlation. Experiments on real-world networks demonstrate that CONN
+outperforms state-of-the-art embedding algorithms by a large margin.
+
+
+
+
+
+ + ♻ ☆ A Survey of Knowledge Graph Reasoning on Graph Types: Static, Dynamic, + and Multimodal + + +
+ Knowledge graph reasoning (KGR), aiming to deduce new facts from existing
+facts based on mined logic rules underlying knowledge graphs (KGs), has become
+a fast-growing research direction. It has been proven to significantly benefit
+the usage of KGs in many AI applications, such as question answering and
+recommendation systems. According to the graph types, existing KGR models can
+be roughly divided into three categories, i.e., static models, temporal models,
+and multi-modal models. Early works in this domain mainly focus on static KGR,
+and recent works try to leverage the temporal and multi-modal information,
+which are more practical and closer to real-world scenarios. However, no survey
+paper or open-source repository comprehensively summarizes and discusses models
+in this important direction. To fill the gap, we conduct the first survey of
+knowledge graph reasoning, tracing from static to temporal and then to
+multi-modal KGs. Concretely, the models are reviewed based on a bi-level
+taxonomy, i.e., top-level (graph types) and base-level (techniques and
+scenarios). In addition, the performances, as well as the datasets, are
+summarized and presented. Moreover, we point out the challenges and potential
+opportunities to enlighten the readers. The corresponding open-source
+repository is shared on GitHub:
+https://github.com/LIANGKE23/Awesome-Knowledge-Graph-Reasoning.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ EvalRS 2023. Well-Rounded Recommender Systems For Real-World Deployments KDD23 + + +
+ EvalRS aims to bring together practitioners from industry and academia to
+foster a debate on rounded evaluation of recommender systems, with a focus on
+real-world impact across a multitude of deployment scenarios. Recommender
+systems are often evaluated only through accuracy metrics, which fall short of
+fully characterizing their generalization capabilities and miss important
+aspects, such as fairness, bias, usefulness, and informativeness. This workshop
+builds on the success of last year's workshop at CIKM, but with a broader scope
+and an interactive format.
+
+
+ comment: EvalRS 2023 is a workshop at KDD23. Code and hackathon materials: + https://github.com/RecList/evalRS-KDD-2023 +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Real-Time Neural Video Recovery and Enhancement on Mobile Devices + + +
+ As mobile devices become increasingly popular for video streaming, it's +crucial to optimize the streaming experience for these devices. Although deep +learning-based video enhancement techniques are gaining attention, most of them +cannot support real-time enhancement on mobile devices. Additionally, many of +these techniques are focused solely on super-resolution and cannot handle +partial or complete loss or corruption of video frames, which is common on the +Internet and wireless networks. + To overcome these challenges, we present a novel approach in this paper. Our +approach consists of (i) a novel video frame recovery scheme, (ii) a new +super-resolution algorithm, and (iii) a receiver enhancement-aware video bit +rate adaptation algorithm. We have implemented our approach on an iPhone 12, +and it can support 30 frames per second (FPS). We have evaluated our approach +in various networks such as WiFi, 3G, 4G, and 5G networks. Our evaluation shows +that our approach enables real-time enhancement and results in a significant +increase in video QoE (Quality of Experience) of 24\% - 82\% in our video +streaming system. + +
+
+
+
+
+ + ♻ ☆ Neural Video Recovery for Cloud Gaming + + +
+ Cloud gaming is a multi-billion dollar industry. A client in cloud gaming +sends its movement to the game server on the Internet, which renders and +transmits the resulting video back. In order to provide a good gaming +experience, a latency below 80 ms is required. This means that video rendering, +encoding, transmission, decoding, and display have to finish within that time +frame, which is especially challenging to achieve due to server overload, +network congestion, and losses. In this paper, we propose a new method for +recovering lost or corrupted video frames in cloud gaming. Unlike traditional +video frame recovery, our approach uses game states to significantly enhance +recovery accuracy and utilizes partially decoded frames to recover lost +portions. We develop a holistic system that consists of (i) efficiently +extracting game states, (ii) modifying H.264 video decoder to generate a mask +to indicate which portions of video frames need recovery, and (iii) designing a +novel neural network to recover either complete or partial video frames. Our +approach is extensively evaluated using iPhone 12 and laptop implementations, +and we demonstrate the utility of game states in the game video recovery and +the effectiveness of our overall design. + +
+
+
+
+
+ + ♻ ☆ Learning to Pan-sharpening with Memories of Spatial Details + + +
+ Pan-sharpening, as one of the most commonly used techniques in remote sensing
+systems, aims to inject spatial details from panchromatic images into
+multispectral (MS) images to obtain high-resolution multispectral images. Deep
+learning has received widespread attention because of its powerful fitting
+ability and efficient feature extraction, and a variety of deep learning-based
+pan-sharpening methods have been proposed that achieve remarkable performance.
+However, current pan-sharpening methods usually require paired panchromatic
+(PAN) and MS images as input, which limits their usage in some scenarios. To
+address this issue, in this paper we observe that the spatial details from PAN
+images are mainly high-frequency cues, i.e., edges that reflect the contour of
+the input PAN images. This motivates us to develop a PAN-agnostic
+representation that stores a set of base edges, from which the contour of the
+corresponding PAN image can be composed. As a result, we can perform the
+pan-sharpening task with only the MS image at inference time. To this end, a
+memory-based network is adapted to extract and memorize the spatial details
+during the training phase and is used to replace the process of obtaining
+spatial information from PAN images at inference time; we call this module the
+Memory-based Spatial Details Network (MSDN). Finally, we integrate the proposed
+MSDN module into existing deep learning-based pan-sharpening methods to obtain
+an end-to-end pan-sharpening network. With extensive experiments on the Gaofen1
+and WorldView-4 satellites, we verify that our method constructs good spatial
+details without PAN images and achieves the best performance. The code is
+available at
+https://github.com/Zhao-Tian-yi/Learning-to-Pan-sharpening-with-Memories-of-Spatial-Details.git.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 38 + +
+
+
+ + ☆ OUTFOX: LLM-generated Essay Detection through In-context Learning with + Adversarially Generated Examples + + +
+ Large Language Models (LLMs) have achieved human-level fluency in text
+generation, making it difficult to distinguish between human-written and
+LLM-generated texts. This poses a growing risk of misuse of LLMs and demands
+the development of detectors to identify LLM-generated texts. However, existing
+detectors are not robust: simply paraphrasing LLM-generated texts degrades
+detection accuracy. Furthermore, the effectiveness of these detectors in
+real-life situations, such as when students use LLMs for writing homework
+assignments (e.g., essays) and quickly learn how to evade these detectors, has
+not been explored. In this paper, we propose OUTFOX, a novel framework that
+improves the robustness of LLM-generated-text detectors by allowing both the
+detector and the attacker to consider each other's output, and we apply it to
+the domain of student essays. In our framework, the attacker uses the
+detector's prediction labels as examples for in-context learning and
+adversarially generates essays that are harder to detect, while the detector
+uses the adversarially generated essays as in-context examples to learn to
+detect essays from a strong attacker. Our experiments show that the proposed
+detector, learning in-context from the attacker, improves detection performance
+on the attacked dataset by up to +41.3 points F1-score, while the proposed
+attacker can drastically degrade detector performance by up to -57.0 points
+F1-score compared to the paraphrasing method.
+
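+ The detector side of this loop can be sketched as in-context prompt
+construction over attacker-generated essays; the prompt template, labels, and
+example texts below are illustrative assumptions, not OUTFOX's exact wording:
+
+```python
+# Sketch: adversarially generated essays and their labels become in-context
+# examples for the next detection call to an LLM.
+attacker_examples = [
+    ("Essay about photosynthesis ...", "LLM-generated"),
+    ("Essay about the French Revolution ...", "human-written"),
+]
+
+def build_detector_prompt(target_essay, examples):
+    lines = ["Decide whether each essay is human-written or LLM-generated.\n"]
+    for essay, label in examples:
+        lines.append(f"Essay: {essay}\nLabel: {label}\n")
+    lines.append(f"Essay: {target_essay}\nLabel:")
+    return "\n".join(lines)
+
+prompt = build_detector_prompt("Essay about plate tectonics ...",
+                               attacker_examples)
+print(prompt)  # this string would be sent to the detector LLM
+```
+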
+
+
+
+
+ + ☆ Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts + + +
+ Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have +revolutionized visual representation learning by providing good performance on +downstream datasets. VLMs are 0-shot adapted to a downstream dataset by +designing prompts that are relevant to the dataset. Such prompt engineering +makes use of domain expertise and a validation dataset. Meanwhile, recent +developments in generative pretrained models like GPT-4 mean they can be used +as advanced internet search tools. They can also be manipulated to provide +visual information in any structure. In this work, we show that GPT-4 can be +used to generate text that is visually descriptive and how this can be used to +adapt CLIP to downstream tasks. We show considerable improvements in 0-shot +transfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD +(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt. +We also design a simple few-shot adapter that learns to choose the best +possible sentences to construct generalizable classifiers that outperform the +recently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized +fine-grained datasets. We will release the code, prompts, and auxiliary text +dataset upon acceptance. + +
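+ The adaptation described above can be sketched with OpenAI's CLIP package:
+each class is represented by visually descriptive sentences (hard-coded here;
+in the paper they would come from GPT-4), and the averaged text embeddings act
+as the classifier. The class names, sentences, and "tile.png" file are
+placeholders, and the snippet assumes `pip install` of the CLIP repository:
+
+```python
+# Sketch: zero-shot CLIP classification with descriptive class sentences.
+import torch
+import clip
+from PIL import Image
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+class_descriptions = {
+    "forest": ["a satellite photo of dense green tree cover"],
+    "river":  ["a satellite photo of a winding blue waterway"],
+}
+
+with torch.no_grad():
+    class_embs = []
+    for sents in class_descriptions.values():
+        toks = clip.tokenize(sents).to(device)
+        emb = model.encode_text(toks).mean(dim=0)   # average the descriptions
+        class_embs.append(emb / emb.norm())
+    class_embs = torch.stack(class_embs)
+
+    image = preprocess(Image.open("tile.png")).unsqueeze(0).to(device)
+    img_emb = model.encode_image(image)
+    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
+
+    probs = (100.0 * img_emb @ class_embs.T).softmax(dim=-1)
+print(dict(zip(class_descriptions, probs[0].tolist())))
+```
+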
+
+ comment: 10 pages, Pre-print +
+
+
+
+
+ + ☆ OxfordTVG-HIC: Can Machine Make Humorous Captions from Images? ICCV 2023 + + +
+ This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale +dataset for humour generation and understanding. Humour is an abstract, +subjective, and context-dependent cognitive construct involving several +cognitive factors, making it a challenging task to generate and interpret. +Hence, humour generation and understanding can serve as a new task for +evaluating the ability of deep-learning methods to process abstract and +subjective information. Due to the scarcity of data, humour-related generation +tasks such as captioning remain under-explored. To address this gap, +OxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to +train a generalizable humour captioning model. Contrary to existing captioning +datasets, OxfordTVG-HIC features a wide range of emotional and semantic +diversity resulting in out-of-context examples that are particularly conducive +to generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive +content. We also show how OxfordTVG-HIC can be leveraged for evaluating the +humour of a generated text. Through explainability analysis of the trained +models, we identify the visual and linguistic cues influential for evoking +humour prediction (and generation). We observe qualitatively that these cues +are aligned with the benign violation theory of humour in cognitive psychology. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CausE: Towards Causal Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE) focuses on representing the entities and
+relations of a knowledge graph (KG) in continuous vector spaces, which can then
+be employed to predict missing triples and achieve knowledge graph completion
+(KGC). However, KGE models often capture only shallow structural correlations
+of the triple data, and their embeddings can be misled by the trivial patterns
+and noisy links in real-world KGs. To address this issue, we build a new
+paradigm of KGE in the context of causality and embedding disentanglement. We
+further propose a Causality-enhanced knowledge graph Embedding (CausE)
+framework. CausE employs causal intervention to estimate the causal effect of
+the confounder embeddings and designs new training objectives to make stable
+predictions. Experimental results demonstrate that CausE outperforms the
+baseline models and achieves state-of-the-art KGC performance. We release our
+code at https://github.com/zjukg/CausE.
+
+
+
+
+
+ + ☆ A Change of Heart: Improving Speech Emotion Recognition through + Speech-to-Text Modality Conversion + + +
+ Speech Emotion Recognition (SER) is a challenging task. In this paper, we
+introduce a modality conversion concept aimed at enhancing emotion recognition
+performance on the MELD dataset. We assess our approach through two
+experiments: first, a method named Modality-Conversion that employs automatic
+speech recognition (ASR) systems followed by a text classifier; second, we
+assume perfect ASR output to investigate the impact of modality conversion on
+SER, and we call this method Modality-Conversion++. Our findings indicate that
+the first method yields strong results, while the second method outperforms
+state-of-the-art (SOTA) speech-based approaches in terms of SER weighted-F1
+(WF1) score on the MELD dataset. This research highlights the potential of
+modality conversion for tasks that can be conducted in alternative modalities.
+
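+ The Modality-Conversion idea (transcribe, then classify the transcript) can be
+sketched with Hugging Face pipelines; the checkpoints below are illustrative
+public models, not the ones used in the paper, and "utterance.wav" is a
+placeholder file:
+
+```python
+# Sketch: ASR followed by a text emotion classifier.
+from transformers import pipeline
+
+asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+emotion = pipeline("text-classification",
+                   model="j-hartmann/emotion-english-distilroberta-base")
+
+def recognise_emotion(wav_path):
+    transcript = asr(wav_path)["text"]
+    return transcript, emotion(transcript)[0]
+
+print(recognise_emotion("utterance.wav"))
+```
+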
+
+
+
+
+ + ☆ Advancing Visual Grounding with Scene Knowledge: Benchmark and Method CVPR-2023 + + +
+ Visual grounding (VG) aims to establish fine-grained alignment between vision +and language. Ideally, it can be a testbed for vision-and-language models to +evaluate their understanding of the images and texts and their reasoning +abilities over their joint space. However, most existing VG datasets are +constructed using simple description texts, which do not require sufficient +reasoning over the images and texts. This has been demonstrated in a recent +study~\cite{luo2022goes}, where a simple LSTM-based text encoder without +pretraining can achieve state-of-the-art performance on mainstream VG datasets. +Therefore, in this paper, we propose a novel benchmark of \underline{S}cene +\underline{K}nowledge-guided \underline{V}isual \underline{G}rounding (SK-VG), +where the image content and referring expressions are not sufficient to ground +the target objects, forcing the models to have a reasoning ability on the +long-form scene knowledge. To perform this task, we propose two approaches to +accept the triple-type input, where the former embeds knowledge into the image +features before the image-query interaction; the latter leverages linguistic +structure to assist in computing the image-text matching. We conduct extensive +experiments to analyze the above methods and show that the proposed approaches +achieve promising results but still leave room for improvement, including +performance and interpretability. The dataset and code are available at +\url{https://github.com/zhjohnchan/SK-VG}. + +
+
+ comment: Computer Vision and Natural Language Processing. 21 pages, 14 + figures. CVPR-2023 +
+
+
+
+
+ + ☆ Bridging Vision and Language Encoders: Parameter-Efficient Tuning for + Referring Image Segmentation ICCV-2023 + + +
+ Parameter-Efficient Tuning (PET) has gained attention for reducing the number
+of trainable parameters while maintaining performance and offering better
+hardware resource savings, but few studies investigate dense prediction tasks
+and the interaction between modalities. In this paper, we investigate efficient
+tuning for referring image segmentation. We propose a novel adapter called
+Bridger to facilitate cross-modal information exchange and inject task-specific
+information into the pre-trained model. We also design a lightweight decoder
+for image segmentation. Our approach achieves comparable or superior
+performance with only 1.61\% to 3.38\% of backbone parameters updated,
+evaluated on challenging benchmarks. The code is available at
+\url{https://github.com/kkakkkka/ETRIS}.
+
+
+ comment: Computer Vision and Natural Language Processing. 14 pages, 8 figures. + ICCV-2023 +
+
+
+
+
+ + ☆ IndigoVX: Where Human Intelligence Meets AI for Optimal Decision Making + + +
+ This paper defines a new approach for augmenting human intelligence with AI +for optimal goal solving. Our proposed AI, Indigo, is an acronym for Informed +Numerical Decision-making through Iterative Goal-Oriented optimization. When +combined with a human collaborator, we term the joint system IndigoVX, for +Virtual eXpert. The system is conceptually simple. We envisage this method +being applied to games or business strategies, with the human providing +strategic context and the AI offering optimal, data-driven moves. Indigo +operates through an iterative feedback loop, harnessing the human expert's +contextual knowledge and the AI's data-driven insights to craft and refine +strategies towards a well-defined goal. Using a quantified three-score schema, +this hybridization allows the combined team to evaluate strategies and refine +their plan, while adapting to challenges and changes in real-time. + +
+
+
+
+
+ + ☆ Incorporating Human Translator Style into English-Turkish Literary + Machine Translation + + +
+ Although machine translation systems are mostly designed to serve in the +general domain, there is a growing tendency to adapt these systems to other +domains like literary translation. In this paper, we focus on English-Turkish +literary translation and develop machine translation models that take into +account the stylistic features of translators. We fine-tune a pre-trained +machine translation model by the manually-aligned works of a particular +translator. We make a detailed analysis of the effects of manual and automatic +alignments, data augmentation methods, and corpus size on the translations. We +propose an approach based on stylistic features to evaluate the style of a +translator in the output translations. We show that the human translator style +can be highly recreated in the target machine translations by adapting the +models to the style of the translator. + +
+
+
+
+
+ + ☆ Topic Identification For Spontaneous Speech: Enriching Audio Features + With Embedded Linguistic Information + + +
+ Traditional topic identification solutions from audio rely on an automatic +speech recognition system (ASR) to produce transcripts used as input to a +text-based model. These approaches work well in high-resource scenarios, where +there are sufficient data to train both components of the pipeline. However, in +low-resource situations, the ASR system, even if available, produces +low-quality transcripts, leading to a bad text-based classifier. Moreover, +spontaneous speech containing hesitations can further degrade the performance +of the ASR model. In this paper, we investigate alternatives to the standard +text-only solutions by comparing audio-only and hybrid techniques of jointly +utilising text and audio features. The models evaluated on spontaneous Finnish +speech demonstrate that purely audio-based solutions are a viable option when +ASR components are not available, while the hybrid multi-modal solutions +achieve the best results. + +
+
+ comment: Accepted to EUSIPCO 2023 +
+
+
+
+
+ + ☆ MeetEval: A Toolkit for Computation of Word Error Rates for Meeting + Transcription Systems + + +
+ MeetEval is an open-source toolkit to evaluate all kinds of meeting
+transcription systems. It provides a unified interface for the computation of
+commonly used Word Error Rates (WERs), specifically cpWER, ORC WER, and MIMO
+WER, along with other WER definitions. We extend the cpWER computation by a
+temporal constraint to ensure that words are only identified as correct when
+the temporal alignment is plausible. This improves the quality of the matching
+between the hypothesis and reference strings so that it more closely reflects
+the actual transcription quality, and a system is penalized if it provides poor
+time annotations. Since word-level timing information is often not available,
+we present a way to approximate exact word-level timings from segment-level
+timings (e.g., a sentence) and show that the approximation leads to a similar
+WER as a matching with exact word-level annotations. At the same time, the time
+constraint speeds up the matching algorithm, which outweighs the additional
+overhead caused by processing the time stamps.
+
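+ Approximating word-level timings from a segment-level annotation can be
+sketched by splitting the segment span proportionally to word length; MeetEval's
+actual approximation rule may differ, so this only shows the idea:
+
+```python
+# Sketch: distribute a segment's time span across its words by word length.
+def approximate_word_timings(segment_text, seg_start, seg_end):
+    words = segment_text.split()
+    total_chars = sum(len(w) for w in words)
+    timings, cursor = [], seg_start
+    for w in words:
+        dur = (seg_end - seg_start) * len(w) / total_chars
+        timings.append((w, cursor, cursor + dur))
+        cursor += dur
+    return timings
+
+for w, s, e in approximate_word_timings("thanks for joining the call",
+                                         12.0, 14.5):
+    print(f"{w:>8s}  {s:5.2f} - {e:5.2f}")
+```
+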
+
+ comment: Accepted for presentation at the Chime7 workshop 2023 +
+
+
+
+
+ + ☆ Is ChatGPT Involved in Texts? Measure the Polish Ratio to Detect + ChatGPT-Generated Text + + +
+ The remarkable capabilities of large-scale language models, such as ChatGPT,
+in text generation have incited awe and spurred researchers to devise detectors
+to mitigate potential risks, including misinformation, phishing, and academic
+dishonesty. Despite this, most previous studies, including HC3, have been
+predominantly geared towards creating detectors that differentiate between
+purely ChatGPT-generated texts and human-authored texts. This approach,
+however, fails to discern texts generated through human-machine collaboration,
+such as ChatGPT-polished texts. Addressing this gap, we introduce a novel
+dataset termed HPPT (ChatGPT-polished academic abstracts), facilitating the
+construction of more robust detectors. It diverges from extant corpora by
+comprising pairs of human-written and ChatGPT-polished abstracts instead of
+purely ChatGPT-generated texts. Additionally, we propose the "Polish Ratio"
+method, an innovative measure of ChatGPT's involvement in text generation based
+on edit distance. It provides a mechanism to measure the degree of human
+originality in the resulting text. Our experimental results show that our
+proposed model has better robustness on the HPPT dataset and two existing
+datasets (HC3 and CDB). Furthermore, the proposed "Polish Ratio" offers a more
+comprehensive explanation by quantifying the degree of ChatGPT involvement: a
+value greater than 0.2 signifies ChatGPT involvement, and a value exceeding 0.6
+implies that ChatGPT generated most of the text.
+
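+ A Polish-Ratio-style score can be sketched as a normalised edit distance
+between the human draft and the polished text; the word-level distance and the
+normalisation below are simple assumptions, not necessarily the paper's exact
+definition:
+
+```python
+# Sketch: normalised word-level edit distance between draft and polished text.
+def levenshtein(a, b):
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        cur = [i]
+        for j, cb in enumerate(b, 1):
+            cur.append(min(prev[j] + 1, cur[j - 1] + 1,
+                           prev[j - 1] + (ca != cb)))
+        prev = cur
+    return prev[-1]
+
+def polish_ratio(human_text, polished_text):
+    h, p = human_text.split(), polished_text.split()
+    return levenshtein(h, p) / max(len(h), len(p))
+
+print(polish_ratio("we propose a simple method for detection",
+                   "we introduce a simple yet effective detection method"))
+```
+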
+
+
+
+
+ + ☆ CohortGPT: An Enhanced GPT for Participant Recruitment in Clinical Study + + +
+ Participant recruitment based on unstructured medical texts such as clinical
+notes and radiology reports has been a challenging yet important task for
+cohort establishment in clinical research. Recently, Large Language Models
+(LLMs) such as ChatGPT have achieved tremendous success in various downstream
+tasks thanks to their promising performance in language understanding,
+inference, and generation. It is then natural to test their feasibility in
+solving the cohort recruitment task, which involves the classification of a
+given paragraph of medical text into disease label(s). However, when applied to
+knowledge-intensive problem settings such as medical text classification, where
+the LLMs are expected to understand the decision made by human experts and
+accurately identify the implied disease labels, the LLMs show a mediocre
+performance. A possible explanation is that, by using only the medical text,
+the LLMs neglect the rich context of additional information that languages
+afford. To this end, we propose to use a knowledge graph as auxiliary
+information to guide the LLMs in making predictions. Moreover, to further help
+the LLMs adapt to the problem setting, we apply a chain-of-thought (CoT) sample
+selection strategy enhanced by reinforcement learning, which selects a set of
+CoT samples for each individual medical report. Experimental results and
+various ablation studies show that our few-shot learning method achieves
+satisfactory performance compared with fine-tuning strategies and offers clear
+advantages when the available data is limited. The code and sample dataset of
+the proposed CohortGPT model are available at:
+https://anonymous.4open.science/r/CohortGPT-4872/
+
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ DEFTri: A Few-Shot Label Fused Contextual Representation Learning For + Product Defect Triage in e-Commerce + + +
+ Defect triage is a time-sensitive and critical process in a large-scale agile
+software development lifecycle for e-commerce. Inefficiencies arising from
+human and process dependencies in this domain have motivated research in
+automated approaches using machine learning to accurately assign defects to
+qualified teams. This work proposes a novel framework for automated defect
+triage (DEFTri) that uses a fine-tuned state-of-the-art pre-trained BERT on
+label-fused text embeddings to improve the contextual representations of
+human-generated product defect reports. For our multi-label text classification
+defect triage task, we also introduce a proprietary Walmart dataset of product
+defects built using weak supervision and adversarial learning, in a few-shot
+setting.
+
+
+ comment: In Proceedings of the Fifth Workshop on e-Commerce and NLP ECNLP 5 + 2022 Pages 1-7 +
+
+
+
+
+ + ☆ Making Pre-trained Language Models both Task-solvers and + Self-calibrators ACL 2023 + + +
+ Pre-trained language models (PLMs) serve as backbones for various real-world +systems. For high-stake applications, it's equally essential to have reasonable +confidence estimations in predictions. While the vanilla confidence scores of +PLMs can already be effectively utilized, PLMs consistently become +overconfident in their wrong predictions, which is not desirable in practice. +Previous work shows that introducing an extra calibration task can mitigate +this issue. The basic idea involves acquiring additional data to train models +in predicting the confidence of their initial predictions. However, it only +demonstrates the feasibility of this kind of method, assuming that there are +abundant extra available samples for the introduced calibration task. In this +work, we consider the practical scenario that we need to effectively utilize +training samples to make PLMs both task-solvers and self-calibrators. Three +challenges are presented, including limited training samples, data imbalance, +and distribution shifts. We first conduct pilot experiments to quantify various +decisive factors in the calibration task. Based on the empirical analysis +results, we propose a training algorithm LM-TOAST to tackle the challenges. +Experimental results show that LM-TOAST can effectively utilize the training +data to make PLMs have reasonable confidence estimations while maintaining the +original task performance. Further, we consider three downstream applications, +namely selective classification, adversarial defense, and model cascading, to +show the practical usefulness of LM-TOAST. The code will be made public at +\url{https://github.com/Yangyi-Chen/LM-TOAST}. + +
+
+ comment: Accepted to Findings of ACL 2023 +
+
+
+
+
+ + ☆ Generating Image-Specific Text Improves Fine-grained Image + Classification + + +
+ Recent vision-language models outperform vision-only models on many image +classification tasks. However, because of the absence of paired text/image +descriptions, it remains difficult to fine-tune these models for fine-grained +image classification. In this work, we propose a method, GIST, for generating +image-specific fine-grained text descriptions from image-only datasets, and +show that these text descriptions can be used to improve classification. Key +parts of our method include 1. prompting a pretrained large language model with +domain-specific prompts to generate diverse fine-grained text descriptions for +each class and 2. using a pretrained vision-language model to match each image +to label-preserving text descriptions that capture relevant visual features in +the image. We demonstrate the utility of GIST by fine-tuning vision-language +models on the image-and-generated-text pairs to learn an aligned +vision-language representation space for improved classification. We evaluate +our learned representation space in full-shot and few-shot scenarios across +four diverse fine-grained classification datasets, each from a different +domain. Our method achieves an average improvement of $4.1\%$ in accuracy over +CLIP linear probes and an average of $1.1\%$ improvement in accuracy over the +previous state-of-the-art image-text classification method on the full-shot +datasets. Our method achieves similar improvements across few-shot regimes. +Code is available at https://github.com/emu1729/GIST. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ Generator-Retriever-Generator: A Novel Approach to Open-domain Question + Answering + + +
+ Open-domain question answering (QA) tasks usually require the retrieval of +relevant information from a large corpus to generate accurate answers. We +propose a novel approach called Generator-Retriever-Generator (GRG) that +combines document retrieval techniques with a large language model (LLM), by +first prompting the model to generate contextual documents based on a given +question. In parallel, a dual-encoder network retrieves documents that are +relevant to the question from an external corpus. The generated and retrieved +documents are then passed to the second LLM, which generates the final answer. +By combining document retrieval and LLM generation, our approach addresses the +challenges of open-domain QA, such as generating informative and contextually +relevant answers. GRG outperforms the state-of-the-art generate-then-read and +retrieve-then-read pipelines (GENREAD and RFiD) improving their performance at +least by +5.2, +4.2, and +1.6 on TriviaQA, NQ, and WebQ datasets, respectively. +We provide code, datasets, and checkpoints +\footnote{\url{https://github.com/abdoelsayed2016/GRG}} + +
+
+
+
+
+ + ☆ Selective Perception: Optimizing State Descriptions with Reinforcement + Learning for Language Model Actors + + +
+ Large language models (LLMs) are being applied as actors for sequential +decision making tasks in domains such as robotics and games, utilizing their +general world knowledge and planning abilities. However, previous work does +little to explore what environment state information is provided to LLM actors +via language. Exhaustively describing high-dimensional states can impair +performance and raise inference costs for LLM actors. Previous LLM actors avoid +the issue by relying on hand-engineered, task-specific protocols to determine +which features to communicate about a state and which to leave out. In this +work, we propose Brief Language INputs for DEcision-making Responses (BLINDER), +a method for automatically selecting concise state descriptions by learning a +value function for task-conditioned state descriptions. We evaluate BLINDER on +the challenging video game NetHack and a robotic manipulation task. Our method +improves task success rate, reduces input size and compute costs, and +generalizes between LLM actors. + +
+
+
+
+
+ + ☆ CARTIER: Cartographic lAnguage Reasoning Targeted at Instruction + Execution for Robots + + +
+ This work explores the capacity of large language models (LLMs) to address
+problems at the intersection of spatial planning and natural language
+interfaces for navigation. Our focus is on following relatively complex
+instructions that are more akin to natural conversation than traditional
+explicit procedural directives seen in robotics. Unlike most prior work, where
+navigation directives are provided as imperative commands (e.g., go to the
+fridge), we examine implicit directives within conversational interactions. We
+leverage the 3D simulator AI2Thor to create complex and repeatable scenarios at
+scale, and augment it by adding complex language queries for 40 object types.
+We demonstrate that a robot can better parse descriptive language queries than
+existing methods by using an LLM to interpret the user interaction in the
+context of a list of the objects in the scene.
+
+
+
+
+
+ + ☆ The Looming Threat of Fake and LLM-generated LinkedIn Profiles: + Challenges and Opportunities for Detection and Prevention + + +
+ In this paper, we present a novel method for detecting fake and Large +Language Model (LLM)-generated profiles in the LinkedIn Online Social Network +immediately upon registration and before establishing connections. Early fake +profile identification is crucial to maintaining the platform's integrity since +it prevents imposters from acquiring the private and sensitive information of +legitimate users and from gaining an opportunity to increase their credibility +for future phishing and scamming activities. This work uses textual information +provided in LinkedIn profiles and introduces the Section and Subsection Tag +Embedding (SSTE) method to enhance the discriminative characteristics of these +data for distinguishing between legitimate profiles and those created by +imposters manually or by using an LLM. Additionally, the dearth of a large +publicly available LinkedIn dataset motivated us to collect 3600 LinkedIn +profiles for our research. We will release our dataset publicly for research +purposes. This is, to the best of our knowledge, the first large publicly +available LinkedIn dataset for fake LinkedIn account detection. Within our +paradigm, we assess static and contextualized word embeddings, including GloVe, +Flair, BERT, and RoBERTa. We show that the suggested method can distinguish +between legitimate and fake profiles with an accuracy of about 95% across all +word embeddings. In addition, we show that SSTE has a promising accuracy for +identifying LLM-generated profiles, despite the fact that no LLM-generated +profiles were employed during the training phase, and can achieve an accuracy +of approximately 90% when only 20 LLM-generated profiles are added to the +training set. It is a significant finding since the proliferation of several +LLMs in the near future makes it extremely challenging to design a single +system that can identify profiles created with various LLMs. + +
+
+ comment: 33rd ACM Conference on Hypertext and Social Media (HT '23) +
+
+
+
+
+ + ☆ MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through + Multi-Answer Open-Domain Question Answering SIGIR 2023 + + +
+ Check-worthy claim detection aims to surface plausible misinformation for
+downstream fact-checking systems or human experts to check. This is a crucial
+step toward accelerating the fact-checking process. Many efforts have been put
+into how to identify check-worthy claims from small sets of pre-collected
+claims, but how to efficiently detect check-worthy claims directly from a
+large-scale information source, such as Twitter, remains underexplored. To fill
+this gap, we introduce MythQA, a new multi-answer open-domain question
+answering (QA) task that involves contradictory stance mining for query-based
+large-scale check-worthy claim detection. The idea behind this is that
+contradictory claims are a strong indicator of misinformation that merits
+scrutiny by the appropriate authorities. To study this task, we construct
+TweetMythQA, an evaluation dataset containing 522 factoid multi-answer
+questions based on controversial topics. Each question is annotated with
+multiple answers. Moreover, we collect relevant tweets for each distinct
+answer, then classify them into three categories: "Supporting", "Refuting", and
+"Neutral". In total, we annotated 5.3K tweets. Contradictory evidence is
+collected for all answers in the dataset. Finally, we present a baseline system
+for MythQA and evaluate existing NLP models for each system component using the
+TweetMythQA dataset. We provide initial benchmarks and identify key challenges
+for future models to improve upon. Code and data are available at:
+https://github.com/TonyBY/Myth-QA
+
+
+ comment: Accepted by SIGIR 2023 +
+
+
+
+
+ + ☆ Multimodal Document Analytics for Banking Process Automation + + +
+ In response to growing FinTech competition and the need for improved +operational efficiency, this research focuses on understanding the potential of +advanced document analytics, particularly using multimodal models, in banking +processes. We perform a comprehensive analysis of the diverse banking document +landscape, highlighting the opportunities for efficiency gains through +automation and advanced analytics techniques in the customer business. Building +on the rapidly evolving field of natural language processing (NLP), we +illustrate the potential of models such as LayoutXLM, a cross-lingual, +multimodal, pre-trained model, for analyzing diverse documents in the banking +sector. This model performs a text token classification on German company +register extracts with an overall F1 score performance of around 80\%. Our +empirical evidence confirms the critical role of layout information in +improving model performance and further underscores the benefits of integrating +image information. Interestingly, our study shows that over 75% F1 score can be +achieved with only 30% of the training data, demonstrating the efficiency of +LayoutXLM. Through addressing state-of-the-art document analysis frameworks, +our study aims to enhance process efficiency and demonstrate the real-world +applicability and benefits of multimodal models within banking. + +
+
+ comment: A Preprint +
+
+
+
+
+ + ☆ Prompting Large Language Models with Speech Recognition Abilities + + +
+ Large language models have proven themselves highly flexible, able to solve a
+wide range of generative tasks, such as abstractive summarization and
+open-ended question answering. In this paper, we extend the capabilities of
+LLMs by directly attaching a small audio encoder, allowing them to perform
+speech recognition. By directly prepending a sequence of audio embeddings to
+the text token embeddings, the LLM can be converted into an automatic speech
+recognition (ASR) system and used in exactly the same manner as its textual
+counterpart. Experiments on Multilingual LibriSpeech (MLS) show that
+incorporating a conformer encoder into the open-sourced LLaMA-7B allows it to
+outperform monolingual baselines by 18% and to perform multilingual speech
+recognition despite LLaMA being trained overwhelmingly on English text.
+Furthermore, we perform ablation studies investigating whether the LLM can be
+completely frozen during training to maintain its original capabilities, the
+effect of scaling up the audio encoder, and the effect of increasing the audio
+encoder stride to generate fewer embeddings. The results show that multilingual
+ASR is possible even when the LLM is frozen or when strides of almost 1 second
+are used in the audio encoder, opening up the possibility for LLMs to operate
+on long-form audio.
+
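+ The coupling can be sketched in PyTorch by projecting audio-encoder frames
+into the LLM's embedding space and prepending them to the text token
+embeddings; the dimensions and modules below are toy placeholders, not the
+conformer/LLaMA setup:
+
+```python
+# Sketch: prepend projected audio frames to text token embeddings.
+import torch
+import torch.nn as nn
+
+d_audio, d_model, vocab = 256, 512, 32000
+audio_proj = nn.Linear(d_audio, d_model)        # encoder frames -> LLM space
+token_emb = nn.Embedding(vocab, d_model)        # stands in for LLM embeddings
+
+audio_frames = torch.randn(1, 75, d_audio)      # e.g. strided encoder outputs
+text_tokens = torch.randint(0, vocab, (1, 12))  # prompt / transcription prefix
+
+prefix = audio_proj(audio_frames)               # (1, 75, d_model)
+inputs_embeds = torch.cat([prefix, token_emb(text_tokens)], dim=1)
+print(inputs_embeds.shape)                      # (1, 87, d_model) fed to LLM
+```
+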
+
+
+
+
+ + ♻ ☆ (Ab)using Images and Sounds for Indirect Instruction Injection in + Multi-Modal LLMs + + +
+ We demonstrate how images and sounds can be used for indirect prompt and +instruction injection in multi-modal LLMs. An attacker generates an adversarial +perturbation corresponding to the prompt and blends it into an image or audio +recording. When the user asks the (unmodified, benign) model about the +perturbed image or audio, the perturbation steers the model to output the +attacker-chosen text and/or make the subsequent dialog follow the attacker's +instruction. We illustrate this attack with several proof-of-concept examples +targeting LLaVa and PandaGPT. + +
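+ The attack family described above can be sketched as a PGD-style loop that
+optimises a bounded image perturbation so a frozen model assigns high
+likelihood to attacker-chosen tokens; the tiny linear "model" and the chosen
+constants are schematic stand-ins, not LLaVA or PandaGPT:
+
+```python
+# Schematic bounded-perturbation optimisation against a frozen toy model.
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 100))  # toy head
+model.requires_grad_(False)
+
+image = torch.rand(1, 3, 32, 32)
+target_tokens = torch.tensor([42])            # attacker-chosen output token
+delta = torch.zeros_like(image, requires_grad=True)
+eps, step = 8 / 255, 1 / 255
+
+for _ in range(50):
+    logits = model(image + delta)
+    loss = nn.functional.cross_entropy(logits, target_tokens)
+    loss.backward()
+    with torch.no_grad():
+        delta -= step * delta.grad.sign()     # descend on the target loss
+        delta.clamp_(-eps, eps)               # stay inside the L-inf ball
+        delta.grad.zero_()
+
+print(float(loss))
+```
+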
+
+
+
+
+ + ♻ ☆ Going Beyond Local: Global Graph-Enhanced Personalized News + Recommendations + + +
+ Precisely recommending candidate news articles to users has always been a +core challenge for personalized news recommendation systems. Most recent works +primarily focus on using advanced natural language processing techniques to +extract semantic information from rich textual data, employing content-based +methods derived from local historical news. However, this approach lacks a +global perspective, failing to account for users' hidden motivations and +behaviors beyond semantic information. To address this challenge, we propose a +novel model called GLORY (Global-LOcal news Recommendation sYstem), which +combines global representations learned from other users with local +representations to enhance personalized recommendation systems. We accomplish +this by constructing a Global-aware Historical News Encoder, which includes a +global news graph and employs gated graph neural networks to enrich news +representations, thereby fusing historical news representations by a historical +news aggregator. Similarly, we extend this approach to a Global Candidate News +Encoder, utilizing a global entity graph and a candidate news aggregator to +enhance candidate news representation. Evaluation results on two public news +datasets demonstrate that our method outperforms existing approaches. +Furthermore, our model offers more diverse recommendations. + +
+
+ comment: 10 pages, Recsys 2023 +
+
+
+
+
+ + ♻ ☆ NusaCrowd: Open Source Initiative for Indonesian NLP Resources + + +
+ We present NusaCrowd, a collaborative initiative to collect and unify +existing resources for Indonesian languages, including opening access to +previously non-public resources. Through this initiative, we have brought +together 137 datasets and 118 standardized data loaders. The quality of the +datasets has been assessed manually and automatically, and their value is +demonstrated through multiple experiments. NusaCrowd's data collection enables +the creation of the first zero-shot benchmarks for natural language +understanding and generation in Indonesian and the local languages of +Indonesia. Furthermore, NusaCrowd brings the creation of the first multilingual +automatic speech recognition benchmark in Indonesian and the local languages of +Indonesia. Our work strives to advance natural language processing (NLP) +research for languages that are under-represented despite being widely spoken. + +
+
+
+
+
+ + ♻ ☆ ClueReader: Heterogeneous Graph Attention Network for Multi-hop Machine + Reading Comprehension + + +
+ Multi-hop machine reading comprehension is a challenging task in natural +language processing as it requires more reasoning ability across multiple +documents. Spectral models based on graph convolutional networks have shown +good inferring abilities and lead to competitive results. However, the analysis +and reasoning of some are inconsistent with those of humans. Inspired by the +concept of grandmother cells in cognitive neuroscience, we propose a +heterogeneous graph attention network model named ClueReader to imitate the +grandmother cell concept. The model is designed to assemble the semantic +features in multi-level representations and automatically concentrate or +alleviate information for reasoning through the attention mechanism. The name +ClueReader is a metaphor for the pattern of the model: it regards the subjects +of queries as the starting points of clues, takes the reasoning entities as +bridge points, considers the latent candidate entities as grandmother cells, +and the clues end up in candidate entities. The proposed model enables the +visualization of the reasoning graph, making it possible to analyze the +importance of edges connecting entities and the selectivity in the mention and +candidate nodes, which is easier to comprehend empirically. Evaluations on the +open-domain multi-hop reading dataset WikiHop and drug-drug interaction dataset +MedHop proved the validity of ClueReader and showed the feasibility of its +application of the model in the molecular biology domain. + +
+
+
+
+
+ + ♻ ☆ Forecasting consumer confidence through semantic network analysis of + online news + + +
+ This research studies the impact of online news on social and economic +consumer perceptions through semantic network analysis. Using over 1.8 million +online articles on Italian media covering four years, we calculate the semantic +importance of specific economic-related keywords to see if words appearing in +the articles could anticipate consumers' judgments about the economic situation +and the Consumer Confidence Index. We use an innovative approach to analyze big +textual data, combining methods and tools of text mining and social network +analysis. Results show a strong predictive power for the judgments about the +current households and national situation. Our indicator offers a complementary +approach to estimating consumer confidence, lessening the limitations of +traditional survey-based methods. + +
+
+
+
+
+ + ♻ ☆ Enhancing Coherence of Extractive Summarization with Multitask Learning + + +
+ This study proposes a multitask learning architecture for extractive +summarization with coherence boosting. The architecture contains an extractive +summarizer and coherent discriminator module. The coherent discriminator is +trained online on the sentence vectors of the augmented textual input, thus +improving its general ability of judging whether the input sentences are +coherent. Meanwhile, we maximize the coherent scores from the coherent +discriminator by updating the parameters of the summarizer. To make the +extractive sentences trainable in a differentiable manner, we introduce two +strategies, including pre-trained converting model (model-based) and converting +matrix (MAT-based) that merge sentence representations. Experiments show that +our proposed method significantly improves the proportion of consecutive +sentences in the extracted summaries based on their positions in the original +article (i.e., automatic sentence-level coherence metric), while the goodness +in terms of other automatic metrics (i.e., Rouge scores and BertScores) are +preserved. Human evaluation also evidences the improvement of coherence and +consistency of the extracted summaries given by our method. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Chinese Fine-Grained Financial Sentiment Analysis with Large Language + Models IJCAI 2023 + + +
+ Entity-level fine-grained sentiment analysis in the financial domain is a +crucial subtask of sentiment analysis and currently faces numerous challenges. +The primary challenge stems from the lack of high-quality and large-scale +annotated corpora specifically designed for financial text sentiment analysis, +which in turn limits the availability of data necessary for developing +effective text processing techniques. Recent advancements in large language +models (LLMs) have yielded remarkable performance in natural language +processing tasks, primarily centered around language pattern matching. In this +paper, we propose a novel and extensive Chinese fine-grained financial +sentiment analysis dataset, FinChina SA, for enterprise early warning. We +thoroughly evaluate and experiment with well-known existing open-source LLMs +using our dataset. We firmly believe that our dataset will serve as a valuable +resource to advance the exploration of real-world financial sentiment analysis +tasks, which should be the focus of future research. Our dataset and all code +to replicate the experimental results will be released. + +
+
+ comment: FinLLM Symposium at IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Large Language Model Augmented Narrative Driven Recommendations RecSys 2023 + + +
+ Narrative-driven recommendation (NDR) presents an information access problem +where users solicit recommendations with verbose descriptions of their +preferences and context, for example, travelers soliciting recommendations for +points of interest while describing their likes/dislikes and travel +circumstances. These requests are increasingly important with the rise of +natural language-based conversational interfaces for search and recommendation +systems. However, NDR lacks abundant training data for models, and current +platforms commonly do not support these requests. Fortunately, classical +user-item interaction datasets contain rich textual data, e.g., reviews, which +often describe user preferences and context - this may be used to bootstrap +training for NDR models. In this work, we explore using large language models +(LLMs) for data augmentation to train NDR models. We use LLMs for authoring +synthetic narrative queries from user-item interactions with few-shot prompting +and train retrieval models for NDR on synthetic queries and user-item +interaction data. Our experiments demonstrate that this is an effective +strategy for training small-parameter retrieval models that outperform other +retrieval and LLM baselines for narrative-driven recommendation. + +
+
+ comment: RecSys 2023 Camera-ready +
+
+
+
+
+ + ♻ ☆ Editable User Profiles for Controllable Text Recommendation SIGIR-2023 + + +
+ Methods for making high-quality recommendations often rely on learning latent +representations from interaction data. These methods, while performant, do not +provide ready mechanisms for users to control the recommendation they receive. +Our work tackles this problem by proposing LACE, a novel concept value +bottleneck model for controllable text recommendations. LACE represents each +user with a succinct set of human-readable concepts through retrieval given +user-interacted documents and learns personalized representations of the +concepts based on user documents. This concept based user profile is then +leveraged to make recommendations. The design of our model affords control over +the recommendations through a number of intuitive interactions with a +transparent user profile. We first establish the quality of recommendations +obtained from LACE in an offline evaluation on three recommendation tasks +spanning six datasets in warm-start, cold-start, and zero-shot setups. Next, we +validate the controllability of LACE under simulated user interactions. +Finally, we implement LACE in an interactive controllable recommender system +and conduct a user study to demonstrate that users are able to improve the +quality of recommendations they receive through interactions with an editable +user profile. + +
+
+ comment: SIGIR-2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ GPT-FinRE: In-context Learning for Financial Relation Extraction using + Large Language Models + + +
+ Relation extraction (RE) is a crucial task in natural language processing
+(NLP) that aims to identify and classify relationships between entities
+mentioned in text. In the financial domain, relation extraction plays a vital
+role in extracting valuable information from financial documents, such as news
+articles, earnings reports, and company filings. This paper describes our
+solution to relation extraction on one such dataset, REFinD. The dataset was
+released along with a shared task as part of the Fourth Workshop on Knowledge
+Discovery from Unstructured Data in Financial Services, co-located with SIGIR
+2023. In this paper, we employed OpenAI models under the framework of
+in-context learning (ICL). We utilized two retrieval strategies to find the
+top-K relevant in-context learning demonstrations/examples from the training
+data for a given test example. The first retrieval mechanism we employed is a
+learning-free dense retriever, and the other is a learning-based retriever. We
+achieved 3rd rank overall. Our best F1-score is 0.718.
+
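+ The learning-free dense retrieval of demonstrations can be sketched with a
+sentence encoder: embed the training examples and the test sentence, then take
+the top-K nearest neighbours as in-context examples. The encoder checkpoint and
+the toy examples are illustrative assumptions, not the paper's configuration:
+
+```python
+# Sketch: dense retrieval of top-K in-context demonstrations.
+from sentence_transformers import SentenceTransformer, util
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")
+
+train_examples = [
+    "Apple acquired Beats Electronics in 2014. relation: acquired_by",
+    "Tim Cook is the CEO of Apple. relation: employee_of",
+    "Visa issued bonds worth $500 million. relation: issuer_of",
+]
+test_sentence = "Microsoft acquired Activision Blizzard."
+
+train_emb = encoder.encode(train_examples, convert_to_tensor=True)
+test_emb = encoder.encode(test_sentence, convert_to_tensor=True)
+
+top_k = util.semantic_search(test_emb, train_emb, top_k=2)[0]
+demos = [train_examples[hit["corpus_id"]] for hit in top_k]
+print(demos)  # placed into the ICL prompt before the test sentence
+```
+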
+
+ comment: arXiv admin note: text overlap with arXiv:2305.02105 by other authors +
+
+
+
+
+ + ♻ ☆ Mutual Reinforcement Effects in Japanese Sentence Classification and + Named Entity Recognition Tasks + + +
+ Information extraction (IE) is a crucial subfield within natural language
+processing. However, in the traditionally segmented approach to sentence
+classification and Named Entity Recognition, the intricate interactions between
+these individual subtasks remain largely uninvestigated. In this study, we
+propose an integrative analysis that converges sentence classification with
+Named Entity Recognition, with the objective of unveiling and comprehending the
+mutual reinforcement effect between these two information extraction subtasks.
+To achieve this, we introduce a Sentence Classification and Named Entity
+Recognition Multi-task (SCNM) approach that combines Sentence Classification
+(SC) and Named Entity Recognition (NER). We develop a Sentence-to-Label
+Generation (SLG) framework for SCNM and construct a Wikipedia dataset
+containing both SC and NER annotations. Using a format converter, we unify
+input formats and employ a generative model to generate SC labels, NER labels,
+and associated text segments. We propose a Constraint Mechanism (CM) to improve
+the accuracy of the generated format. Our results show that SC accuracy
+increased by 1.13 points and NER by 1.06 points in SCNM compared to the
+standalone tasks, with CM raising format accuracy from 63.61 to 100. The
+findings indicate mutual reinforcement effects between SC and NER, and that
+integration enhances the performance of both tasks. We additionally applied the
+SLG framework to the SC task alone, where it yielded superior accuracies
+compared to the baseline on two distinct Japanese SC datasets. Notably, in the
+few-shot learning experiments, the SLG framework performs much better than the
+fine-tuning method. These empirical findings provide additional evidence for
+the efficacy of the SLG framework.
+
+
+ comment: 25 pages, 12 figures, 19 tables. arXiv admin note: substantial text + overlap with arXiv:2306.15978 +
+
+
+
+
+ + ♻ ☆ PharmacyGPT: The AI Pharmacist + + +
+ In this study, we introduce PharmacyGPT, a novel framework to assess the +capabilities of large language models (LLMs) such as ChatGPT and GPT-4 in +emulating the role of clinical pharmacists. Our methodology encompasses the +utilization of LLMs to generate comprehensible patient clusters, formulate +medication plans, and forecast patient outcomes. We conduct our investigation +using real data acquired from the intensive care unit (ICU) at the University +of North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable +insights into the potential applications and limitations of LLMs in the field +of clinical pharmacy, with implications for both patient care and the +development of future AI-driven healthcare solutions. By evaluating the +performance of PharmacyGPT, we aim to contribute to the ongoing discourse +surrounding the integration of artificial intelligence in healthcare settings, +ultimately promoting the responsible and efficacious use of such technologies. + +
+
+
+
+
+ + ♻ ☆ Towards Robust Aspect-based Sentiment Analysis through + Non-counterfactual Augmentations + + +
+ While state-of-the-art NLP models have demonstrated excellent performance for
+aspect-based sentiment analysis (ABSA), substantial evidence has been presented
+of their lack of robustness. This is especially manifested as significant
+degradation in performance when faced with out-of-distribution data. Recent
+solutions that rely on counterfactually augmented datasets show promising
+results, but they are inherently limited because of the lack of access to
+explicit causal structure. In this paper, we present an alternative approach
+that relies on non-counterfactual data augmentation: noisy, cost-efficient
+augmentations that preserve the semantics associated with the target aspect.
+Our approach then models invariances between different versions of the data to
+improve robustness. A comprehensive suite of experiments shows that our
+proposal significantly improves upon strong pre-trained baselines on both
+standard and robustness-specific datasets. Our approach further establishes a
+new state-of-the-art on the ABSA robustness benchmark and transfers well across
+domains.
+
+
+ comment: 10pages,1 figure,10 tables +
+
+
+
+
+ + ♻ ☆ Toward expanding the scope of radiology report summarization to multiple + anatomies and modalities + + +
+ Radiology report summarization (RRS) is a growing area of research. Given the
+Findings section of a radiology report, the goal is to generate a summary
+(called an Impression section) that highlights the key observations and
+conclusions of the radiology study. However, RRS currently faces essential
+limitations. First, many prior studies conduct experiments on private datasets,
+preventing reproduction of results and fair comparisons across different
+systems and solutions. Second, most prior approaches are evaluated solely on
+chest X-rays. To address these limitations, we propose a dataset (MIMIC-RRS)
+involving three new modalities and seven new anatomies based on the MIMIC-III
+and MIMIC-CXR datasets. We then conduct extensive experiments to evaluate the
+performance of models both within and across modality-anatomy pairs in
+MIMIC-RRS. In addition, we evaluate their clinical efficacy via RadGraph, a
+factual correctness metric.
+
+
+
+
+
+ + ♻ ☆ Inference-Time Intervention: Eliciting Truthful Answers from a Language + Model + + +
+ We introduce Inference-Time Intervention (ITI), a technique designed to
+enhance the truthfulness of large language models (LLMs). ITI operates by
+shifting model activations during inference, following a set of directions
+across a limited number of attention heads. This intervention significantly
+improves the performance of LLaMA models on the TruthfulQA benchmark. On an
+instruction-finetuned LLaMA called Alpaca, ITI improves its truthfulness from
+32.5% to 65.1%. We identify a tradeoff between truthfulness and helpfulness and
+demonstrate how to balance it by tuning the intervention strength. ITI is
+minimally invasive and computationally inexpensive. Moreover, the technique is
+data-efficient: while approaches like RLHF require extensive annotations, ITI
+locates truthful directions using only a few hundred examples. Our findings
+suggest that LLMs may have an internal representation of the likelihood of
+something being true, even as they produce falsehoods on the surface.
+
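+
+ For intuition only, the sketch below (not the authors' implementation; see
+their repository for the real one) shows how an activation could be shifted
+along a fixed "truthful direction" at inference time with a PyTorch forward
+hook, scaled by an intervention-strength parameter alpha. The module chosen,
+the direction vector, and alpha are all assumptions made for illustration.
+
+    import torch
+
+    def add_direction_hook(module, direction, alpha=1.0):
+        """Register a forward hook that shifts `module`'s tensor output along a
+        fixed direction at inference time. `direction` must match the output's
+        last dimension. Illustrative sketch only."""
+        direction = direction / (direction.norm() + 1e-8)
+
+        def hook(mod, inputs, output):
+            return output + alpha * direction.to(output.dtype).to(output.device)
+
+        return module.register_forward_hook(hook)
+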
+
+ comment: code: https://github.com/likenneth/honest_llama +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 89 + +
+
+
+ + ☆ BandRe: Rethinking Band-Pass Filters for Scale-Wise Object Detection + Evaluation + + +
+ Scale-wise evaluation of object detectors is important for real-world +applications. However, existing metrics are either coarse or not sufficiently +reliable. In this paper, we propose novel scale-wise metrics that strike a +balance between fineness and reliability, using a filter bank consisting of +triangular and trapezoidal band-pass filters. We conduct experiments with two +methods on two datasets and show that the proposed metrics can highlight the +differences between the methods and between the datasets. Code is available at +https://github.com/shinya7y/UniverseNet . + +
+
+ comment: Honorable Mention Solution Award in Small Object Detection Challenge + for Spotting Birds, International Conference on Machine Vision Applications + (MVA) 2023 +
+
+
+
+
+ + ☆ 3D Skeletonization of Complex Grapevines for Robotic Pruning IROS 2023 + + +
+ Robotic pruning of dormant grapevines is an active area of research aimed at
+promoting vine balance and grape quality, but so far robotic efforts have
+largely focused on planar, simplified vines not representative of commercial
+vineyards. This paper aims to advance the robotic perception capabilities
+necessary for pruning in denser and more complex vine structures by extending
+plant skeletonization techniques. The proposed pipeline generates skeletal
+grapevine models that have lower reprojection error and higher connectivity
+than baseline algorithms. We also show how 3D and skeletal information enables
+pruning weight prediction for dense vines with accuracy surpassing prior work;
+pruning weight is an important vine metric that influences pruning site
+selection.
+
+
+ comment: 6 pages, IROS 2023 Computer Vision for Automation +
+
+
+
+
+ + ☆ SACReg: Scene-Agnostic Coordinate Regression for Visual Localization + + +
+ Scene coordinates regression (SCR), i.e., predicting 3D coordinates for every +pixel of a given image, has recently shown promising potential. However, +existing methods remain mostly scene-specific or limited to small scenes and +thus hardly scale to realistic datasets. In this paper, we propose a new +paradigm where a single generic SCR model is trained once to be then deployed +to new test scenes, regardless of their scale and without further finetuning. +For a given query image, it collects inputs from off-the-shelf image retrieval +techniques and Structure-from-Motion databases: a list of relevant database +images with sparse pointwise 2D-3D annotations. The model is based on the +transformer architecture and can take a variable number of images and sparse +2D-3D annotations as input. It is trained on a few diverse datasets and +significantly outperforms other scene regression approaches on several +benchmarks, including scene-specific models, for visual localization. In +particular, we set a new state of the art on the Cambridge localization +benchmark, even outperforming feature-matching-based approaches. + +
+
+
+
+
+ + ☆ Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts + + +
+ Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have +revolutionized visual representation learning by providing good performance on +downstream datasets. VLMs are 0-shot adapted to a downstream dataset by +designing prompts that are relevant to the dataset. Such prompt engineering +makes use of domain expertise and a validation dataset. Meanwhile, recent +developments in generative pretrained models like GPT-4 mean they can be used +as advanced internet search tools. They can also be manipulated to provide +visual information in any structure. In this work, we show that GPT-4 can be +used to generate text that is visually descriptive and how this can be used to +adapt CLIP to downstream tasks. We show considerable improvements in 0-shot +transfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD +(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt. +We also design a simple few-shot adapter that learns to choose the best +possible sentences to construct generalizable classifiers that outperform the +recently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized +fine-grained datasets. We will release the code, prompts, and auxiliary text +dataset upon acceptance. + +
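+
+ The zero-shot recipe described above can be pictured roughly as follows (an
+illustrative sketch, not the authors' code): each class gets several visually
+descriptive sentences, their text embeddings are averaged into a classifier
+weight, and image embeddings are scored against those weights. The
+`encode_text` callable and the description dictionary are assumed inputs from a
+CLIP-style model and a GPT-4-style generator, respectively.
+
+    import numpy as np
+
+    def build_zero_shot_classifier(descriptions_per_class, encode_text):
+        """descriptions_per_class: {class_name: [description strings]}
+        encode_text: callable returning an L2-normalised text embedding."""
+        weights = {}
+        for cls, descriptions in descriptions_per_class.items():
+            embs = np.stack([encode_text(d) for d in descriptions])
+            w = embs.mean(axis=0)
+            weights[cls] = w / np.linalg.norm(w)
+        return weights
+
+    def classify(image_emb, weights):
+        # image_emb: L2-normalised embedding from the paired image encoder.
+        scores = {cls: float(image_emb @ w) for cls, w in weights.items()}
+        return max(scores, key=scores.get)
+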
+
+ comment: 10 pages, Pre-print +
+
+
+
+
+ + ☆ FEDD -- Fair, Efficient, and Diverse Diffusion-based Lesion Segmentation + and Malignancy Classification + + +
+ Skin diseases affect millions of people worldwide, across all ethnicities. +Increasing diagnosis accessibility requires fair and accurate segmentation and +classification of dermatology images. However, the scarcity of annotated +medical images, especially for rare diseases and underrepresented skin tones, +poses a challenge to the development of fair and accurate models. In this +study, we introduce a Fair, Efficient, and Diverse Diffusion-based framework +for skin lesion segmentation and malignancy classification. FEDD leverages +semantically meaningful feature embeddings learned through a denoising +diffusion probabilistic backbone and processes them via linear probes to +achieve state-of-the-art performance on Diverse Dermatology Images (DDI). We +achieve an improvement in intersection over union of 0.18, 0.13, 0.06, and 0.07 +while using only 5%, 10%, 15%, and 20% labeled samples, respectively. +Additionally, FEDD trained on 10% of DDI demonstrates malignancy classification +accuracy of 81%, 14% higher compared to the state-of-the-art. We showcase high +efficiency in data-constrained scenarios while providing fair performance for +diverse skin tones and rare malignancy conditions. Our newly annotated DDI +segmentation masks and training code can be found on +https://github.com/hectorcarrion/fedd. + +
+
+
+
+
+ + ☆ Morphological Image Analysis and Feature Extraction for Reasoning with + AI-based Defect Detection and Classification Models SC + + +
+ As the use of artificial intelligence (AI) models becomes more prevalent in
+industries such as engineering and manufacturing, it is essential that these
+models provide transparent reasoning behind their predictions. This paper
+proposes the AI-Reasoner, which extracts the morphological characteristics of
+defects (DefChars) from images and utilises decision trees to reason with the
+DefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.,
+charts) and textual explanations to provide insights into outputs made by
+mask-based defect detection and classification models. It also provides
+effective mitigation strategies to enhance data pre-processing and overall
+model performance. The AI-Reasoner was tested on explaining the outputs of an
+IE Mask R-CNN model using a set of 366 images containing defects. The results
+demonstrated its effectiveness in explaining the IE Mask R-CNN model's
+predictions. Overall, the proposed AI-Reasoner provides a solution for
+improving the performance of AI models in industrial applications that require
+defect analysis.
+
+
+ comment: 8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series + on computational intelligence (SSCI) +
+
+
+
+
+ + ☆ Deep Reinforcement Learning Based System for Intraoperative + Hyperspectral Video Autofocusing MICCAI 2023 + + +
+ Hyperspectral imaging (HSI) captures a greater level of spectral detail than +traditional optical imaging, making it a potentially valuable intraoperative +tool when precise tissue differentiation is essential. Hardware limitations of +current optical systems used for handheld real-time video HSI result in a +limited focal depth, thereby posing usability issues for integration of the +technology into the operating room. This work integrates a focus-tunable liquid +lens into a video HSI exoscope, and proposes novel video autofocusing methods +based on deep reinforcement learning. A first-of-its-kind robotic focal-time +scan was performed to create a realistic and reproducible testing dataset. We +benchmarked our proposed autofocus algorithm against traditional policies, and +found our novel approach to perform significantly ($p<0.05$) better than +traditional techniques ($0.070\pm.098$ mean absolute focal error compared to +$0.146\pm.148$). In addition, we performed a blinded usability trial by having +two neurosurgeons compare the system with different autofocus policies, and +found our novel approach to be the most favourable, making our system a +desirable addition for intraoperative HSI. + +
+
+ comment: To be presented at MICCAI 2023 +
+
+
+
+
+ + ☆ OxfordTVG-HIC: Can Machine Make Humorous Captions from Images? ICCV 2023 + + +
+ This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale +dataset for humour generation and understanding. Humour is an abstract, +subjective, and context-dependent cognitive construct involving several +cognitive factors, making it a challenging task to generate and interpret. +Hence, humour generation and understanding can serve as a new task for +evaluating the ability of deep-learning methods to process abstract and +subjective information. Due to the scarcity of data, humour-related generation +tasks such as captioning remain under-explored. To address this gap, +OxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to +train a generalizable humour captioning model. Contrary to existing captioning +datasets, OxfordTVG-HIC features a wide range of emotional and semantic +diversity resulting in out-of-context examples that are particularly conducive +to generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive +content. We also show how OxfordTVG-HIC can be leveraged for evaluating the +humour of a generated text. Through explainability analysis of the trained +models, we identify the visual and linguistic cues influential for evoking +humour prediction (and generation). We observe qualitatively that these cues +are aligned with the benign violation theory of humour in cognitive psychology. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Divide and Adapt: Active Domain Adaptation via Customized Learning CVPR2023 + + +
+ Active domain adaptation (ADA) aims to improve the model adaptation
+performance by incorporating active learning (AL) techniques to label a
+maximally informative subset of target samples. Conventional AL methods do not
+consider the existence of domain shift and hence fail to identify the truly
+valuable samples in the context of domain adaptation. To accommodate active
+learning and domain adaptation, two naturally different tasks, in a
+collaborative framework, we advocate that a customized learning strategy for
+the target data is the key to the success of ADA solutions. We present
+Divide-and-Adapt (DiaNA), a new ADA framework that partitions the target
+instances into four categories with stratified transferable properties. With a
+novel data subdivision protocol based on uncertainty and domainness, DiaNA can
+accurately recognize the most gainful samples. While sending the informative
+instances for annotation, DiaNA employs tailored learning strategies for the
+remaining categories. Furthermore, we propose an informativeness score that
+unifies the data partitioning criteria. This enables the use of a Gaussian
+mixture model (GMM) to automatically sample unlabeled data into the proposed
+four categories. Thanks to the "divide-and-adapt" spirit, DiaNA can handle data
+with large variations of domain gap. In addition, we show that DiaNA can
+generalize to different domain adaptation settings, such as unsupervised domain
+adaptation (UDA), semi-supervised domain adaptation (SSDA), source-free domain
+adaptation (SFDA), etc.
+
+
+ comment: CVPR2023, Highlight paper +
+
+
+
+
+ + ☆ Consistency-guided Meta-Learning for Bootstrapping Semi-Supervised + Medical Image Segmentation MICCAI 2023 + + +
+ Medical imaging has witnessed remarkable progress but usually requires a +large amount of high-quality annotated data which is time-consuming and costly +to obtain. To alleviate this burden, semi-supervised learning has garnered +attention as a potential solution. In this paper, we present Meta-Learning for +Bootstrapping Medical Image Segmentation (MLB-Seg), a novel method for tackling +the challenge of semi-supervised medical image segmentation. Specifically, our +approach first involves training a segmentation model on a small set of clean +labeled images to generate initial labels for unlabeled data. To further +optimize this bootstrapping process, we introduce a per-pixel weight mapping +system that dynamically assigns weights to both the initialized labels and the +model's own predictions. These weights are determined using a meta-process that +prioritizes pixels with loss gradient directions closer to those of clean data, +which is based on a small set of precisely annotated images. To facilitate the +meta-learning process, we additionally introduce a consistency-based Pseudo +Label Enhancement (PLE) scheme that improves the quality of the model's own +predictions by ensembling predictions from various augmented versions of the +same input. In order to improve the quality of the weight maps obtained through +multiple augmentations of a single input, we introduce a mean teacher into the +PLE scheme. This method helps to reduce noise in the weight maps and stabilize +its generation process. Our extensive experimental results on public atrial and +prostate segmentation datasets demonstrate that our proposed method achieves +state-of-the-art results under semi-supervision. Our code is available at +https://github.com/aijinrjinr/MLB-Seg. + +
+
+ comment: Accepted to MICCAI 2023. Code is publicly available at + https://github.com/aijinrjinr/MLB-Seg +
+
+
+
+
+ + ☆ Cascaded multitask U-Net using topological loss for vessel segmentation + and centerline extraction + + +
+ Vessel segmentation and centerline extraction are two crucial preliminary
+tasks for many computer-aided diagnosis tools dealing with vascular diseases.
+Recently, deep-learning-based methods have been widely applied to these tasks.
+However, classic deep-learning approaches struggle to capture the complex
+geometry and specific topology of vascular networks, which is of the utmost
+importance in most applications. To overcome these limitations, the clDice
+loss, a topological loss that focuses on the vessel centerlines, has recently
+been proposed. This loss requires computing, with a proposed soft-skeleton
+algorithm, the skeletons of both the ground truth and the predicted
+segmentation. However, the soft-skeleton algorithm provides suboptimal results
+on 3D images, which makes the clDice loss hardly suitable for 3D images. In
+this paper, we propose to replace the soft-skeleton algorithm with a U-Net that
+computes the vascular skeleton directly from the segmentation. We show that our
+method provides more accurate skeletons than the soft-skeleton algorithm. We
+then build upon this network a cascaded U-Net trained with the clDice loss to
+embed topological constraints during the segmentation. The resulting model is
+able to predict both the vessel segmentation and centerlines with a more
+accurate topology.
+
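+
+ For readers unfamiliar with the clDice loss mentioned above, a minimal
+illustrative computation on binary masks and skeletons is sketched below
+(following the published clDice definition, not this paper's code): topology
+precision and topology sensitivity are combined into an F1-style score.
+
+    import numpy as np
+
+    def cl_dice(pred_mask, gt_mask, pred_skel, gt_skel, eps=1e-8):
+        """Centerline Dice on binary arrays of any shape. The skeletons are
+        assumed precomputed, e.g. by a soft-skeleton algorithm or, as proposed
+        above, by a dedicated skeletonization U-Net."""
+        pred_mask, gt_mask = pred_mask.astype(bool), gt_mask.astype(bool)
+        pred_skel, gt_skel = pred_skel.astype(bool), gt_skel.astype(bool)
+        # Topology precision: predicted skeleton lying inside the GT mask.
+        tprec = (pred_skel & gt_mask).sum() / (pred_skel.sum() + eps)
+        # Topology sensitivity: GT skeleton lying inside the predicted mask.
+        tsens = (gt_skel & pred_mask).sum() / (gt_skel.sum() + eps)
+        return 2 * tprec * tsens / (tprec + tsens + eps)
+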
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ CortexMorph: fast cortical thickness estimation via diffeomorphic + registration using VoxelMorph MICCAI 2023 + + +
+ The thickness of the cortical band is linked to various neurological and +psychiatric conditions, and is often estimated through surface-based methods +such as Freesurfer in MRI studies. The DiReCT method, which calculates cortical +thickness using a diffeomorphic deformation of the gray-white matter interface +towards the pial surface, offers an alternative to surface-based methods. +Recent studies using a synthetic cortical thickness phantom have demonstrated +that the combination of DiReCT and deep-learning-based segmentation is more +sensitive to subvoxel cortical thinning than Freesurfer. + While anatomical segmentation of a T1-weighted image now takes seconds, +existing implementations of DiReCT rely on iterative image registration methods +which can take up to an hour per volume. On the other hand, learning-based +deformable image registration methods like VoxelMorph have been shown to be +faster than classical methods while improving registration accuracy. This paper +proposes CortexMorph, a new method that employs unsupervised deep learning to +directly regress the deformation field needed for DiReCT. By combining +CortexMorph with a deep-learning-based segmentation model, it is possible to +estimate region-wise thickness in seconds from a T1-weighted image, while +maintaining the ability to detect cortical atrophy. We validate this claim on +the OASIS-3 dataset and the synthetic cortical thickness phantom of Rusak et +al. + +
+
+ comment: Accepted (early acceptance) at MICCAI 2023 +
+
+
+
+
+ + ☆ Advancing Visual Grounding with Scene Knowledge: Benchmark and Method CVPR-2023 + + +
+ Visual grounding (VG) aims to establish fine-grained alignment between vision +and language. Ideally, it can be a testbed for vision-and-language models to +evaluate their understanding of the images and texts and their reasoning +abilities over their joint space. However, most existing VG datasets are +constructed using simple description texts, which do not require sufficient +reasoning over the images and texts. This has been demonstrated in a recent +study~\cite{luo2022goes}, where a simple LSTM-based text encoder without +pretraining can achieve state-of-the-art performance on mainstream VG datasets. +Therefore, in this paper, we propose a novel benchmark of \underline{S}cene +\underline{K}nowledge-guided \underline{V}isual \underline{G}rounding (SK-VG), +where the image content and referring expressions are not sufficient to ground +the target objects, forcing the models to have a reasoning ability on the +long-form scene knowledge. To perform this task, we propose two approaches to +accept the triple-type input, where the former embeds knowledge into the image +features before the image-query interaction; the latter leverages linguistic +structure to assist in computing the image-text matching. We conduct extensive +experiments to analyze the above methods and show that the proposed approaches +achieve promising results but still leave room for improvement, including +performance and interpretability. The dataset and code are available at +\url{https://github.com/zhjohnchan/SK-VG}. + +
+
+ comment: Computer Vision and Natural Language Processing. 21 pages, 14 + figures. CVPR-2023 +
+
+
+
+
+ + ☆ YOLOPose V2: Understanding and Improving Transformer-based 6D Pose + Estimation + + +
+ 6D object pose estimation is a crucial prerequisite for autonomous robot
+manipulation applications. The state-of-the-art models for pose estimation are
+convolutional neural network (CNN)-based. Lately, the Transformer, an
+architecture originally proposed for natural language processing, has been
+achieving state-of-the-art results in many computer vision tasks as well.
+Equipped with the multi-head self-attention mechanism, Transformers enable
+simple single-stage end-to-end architectures for learning object detection and
+6D object pose estimation jointly. In this work, we propose YOLOPose (short for
+You Only Look Once Pose estimation), a Transformer-based multi-object 6D pose
+estimation method based on keypoint regression, and an improved variant of the
+YOLOPose model. In contrast to the standard heatmaps for predicting keypoints
+in an image, we directly regress the keypoints. Additionally, we employ a
+learnable orientation estimation module to predict the orientation from the
+keypoints. Along with a separate translation estimation module, our model is
+end-to-end differentiable. Our method is suitable for real-time applications
+and achieves results comparable to state-of-the-art methods. We analyze the
+role of object queries in our architecture and reveal that the object queries
+specialize in detecting objects in specific image regions. Furthermore, we
+quantify the accuracy trade-off of using datasets of smaller sizes to train our
+model.
+
+
+ comment: Robotics and Autonomous Systems Journal, Elsevier, to appear 2023. + arXiv admin note: substantial text overlap with arXiv:2205.02536 +
+
+
+
+
+ + ☆ Bridging Vision and Language Encoders: Parameter-Efficient Tuning for + Referring Image Segmentation ICCV-2023 + + +
+ Parameter-Efficient Tuning (PET) has gained attention for reducing the number
+of parameters while maintaining performance and providing better hardware
+resource savings, but few studies investigate dense prediction tasks and the
+interaction between modalities. In this paper, we investigate efficient tuning
+for referring image segmentation. We propose a novel adapter called Bridger to
+facilitate cross-modal information exchange and inject task-specific
+information into the pre-trained model. We also design a lightweight decoder
+for image segmentation. Our approach achieves comparable or superior
+performance with only 1.61\% to 3.38\% backbone parameter updates, evaluated on
+challenging benchmarks. The code is available at
+\url{https://github.com/kkakkkka/ETRIS}.
+
+
+ comment: Computer Vision and Natural Language Processing. 14 pages, 8 figures. + ICCV-2023 +
+
+
+
+
+ + ☆ KVN: Keypoints Voting Network with Differentiable RANSAC for Stereo Pose + Estimation + + +
+ Object pose estimation is a fundamental computer vision task exploited in
+several robotics and augmented reality applications. Many established
+approaches rely on predicting 2D-3D keypoint correspondences using RANSAC
+(Random Sample Consensus) and estimating the object pose using the PnP
+(Perspective-n-Point) algorithm. Since RANSAC is non-differentiable,
+correspondences cannot be learned directly in an end-to-end fashion. In this
+paper, we address the stereo image-based object pose estimation problem by (i)
+introducing a differentiable RANSAC layer into a well-known monocular pose
+estimation network; (ii) exploiting an uncertainty-driven multi-view PnP solver
+which can fuse information from multiple views. We evaluate our approach on a
+challenging public stereo object pose estimation dataset, yielding
+state-of-the-art results against other recent approaches. Furthermore, in our
+ablation study, we show that the differentiable RANSAC layer plays a
+significant role in the accuracy of the proposed method. With this paper, we
+release an open-source implementation of our method.
+
+
+ comment: Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle + Transformation Multi-scale GAN + + +
+ Fundus photography is an essential examination for clinical and differential +diagnosis of fundus diseases. Recently, Ultra-Wide-angle Fundus (UWF) +techniques, UWF Fluorescein Angiography (UWF-FA) and UWF Scanning Laser +Ophthalmoscopy (UWF-SLO) have been gradually put into use. However, Fluorescein +Angiography (FA) and UWF-FA require injecting sodium fluorescein which may have +detrimental influences. To avoid negative impacts, cross-modality medical image +generation algorithms have been proposed. Nevertheless, current methods in +fundus imaging could not produce high-resolution images and are unable to +capture tiny vascular lesion areas. This paper proposes a novel conditional +generative adversarial network (UWAT-GAN) to synthesize UWF-FA from UWF-SLO. +Using multi-scale generators and a fusion module patch to better extract global +and local information, our model can generate high-resolution images. Moreover, +an attention transmit module is proposed to help the decoder learn effectively. +Besides, a supervised approach is used to train the network using multiple new +weighted losses on different scales of data. Experiments on an in-house UWF +image dataset demonstrate the superiority of the UWAT-GAN over the +state-of-the-art methods. The source code is available at: +https://github.com/Tinysqua/UWAT-GAN. + +
+
+ comment: 26th International Conference on Medical Image Computing and Computer + Assisted Intervention +
+
+
+
+
+ + ☆ Improving Viewpoint Robustness for Visual Recognition via Adversarial + Training + + +
+ Viewpoint invariance remains challenging for visual recognition in the 3D
+world, as altering the viewing direction can significantly impact predictions
+for the same object. While substantial efforts have been dedicated to making
+neural networks invariant to 2D image translations and rotations, viewpoint
+invariance is rarely investigated. Motivated by the success of adversarial
+training in enhancing model robustness, we propose Viewpoint-Invariant
+Adversarial Training (VIAT) to improve the viewpoint robustness of image
+classifiers. Regarding viewpoint transformation as an attack, we formulate VIAT
+as a minimax optimization problem, where the inner maximization characterizes
+diverse adversarial viewpoints by learning a Gaussian mixture distribution
+based on the proposed attack method GMVFool. The outer minimization obtains a
+viewpoint-invariant classifier by minimizing the expected loss over the
+worst-case viewpoint distributions, which can be shared across different
+objects within the same category. Based on GMVFool, we contribute a large-scale
+dataset called ImageNet-V+ to benchmark viewpoint robustness. Experimental
+results show that VIAT significantly improves the viewpoint robustness of
+various image classifiers based on the diversity of adversarial viewpoints
+generated by GMVFool. Furthermore, we propose ViewRS, a certified viewpoint
+robustness method that provides a certified radius and accuracy to demonstrate
+the effectiveness of VIAT from the theoretical perspective.
+
+
+ comment: 14 pages, 12 figures. arXiv admin note: substantial text overlap with + arXiv:2307.10235 +
+
+
+
+
+ + ☆ CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRF) have the potential to be a major representation +of media. Since training a NeRF has never been an easy task, the protection of +its model copyright should be a priority. In this paper, by analyzing the pros +and cons of possible copyright protection solutions, we propose to protect the +copyright of NeRF models by replacing the original color representation in NeRF +with a watermarked color representation. Then, a distortion-resistant rendering +scheme is designed to guarantee robust message extraction in 2D renderings of +NeRF. Our proposed method can directly protect the copyright of NeRF models +while maintaining high rendering quality and bit accuracy when compared among +optional solutions. + +
+
+ comment: 11 pages, 6 figures, accepted by iccv 2023 non-camera-ready version +
+
+
+
+
+ + ☆ BatMobility: Towards Flying Without Seeing for Autonomous Drones + + +
+ Unmanned aerial vehicles (UAVs) rely on optical sensors such as cameras and +lidar for autonomous operation. However, such optical sensors are error-prone +in bad lighting, inclement weather conditions including fog and smoke, and +around textureless or transparent surfaces. In this paper, we ask: is it +possible to fly UAVs without relying on optical sensors, i.e., can UAVs fly +without seeing? We present BatMobility, a lightweight mmWave radar-only +perception system for UAVs that eliminates the need for optical sensors. +BatMobility enables two core functionalities for UAVs -- radio flow estimation +(a novel FMCW radar-based alternative for optical flow based on +surface-parallel doppler shift) and radar-based collision avoidance. We build +BatMobility using commodity sensors and deploy it as a real-time system on a +small off-the-shelf quadcopter running an unmodified flight controller. Our +evaluation shows that BatMobility achieves comparable or better performance +than commercial-grade optical sensors across a wide range of scenarios. + +
+
+
+
+
+ + ☆ CORE: Cooperative Reconstruction for Multi-Agent Perception + + +
+ This paper presents CORE, a conceptually simple, effective, and
+communication-efficient model for multi-agent cooperative perception. It
+addresses the task from a novel perspective of cooperative reconstruction,
+based on two key insights: 1) cooperating agents together provide a more
+holistic observation of the environment, and 2) the holistic observation can
+serve as valuable supervision to explicitly guide the model to learn how to
+reconstruct the ideal observation based on collaboration. CORE instantiates the
+idea with three major components: a compressor for each agent to create a more
+compact feature representation for efficient broadcasting, a lightweight
+attentive collaboration component for cross-agent message aggregation, and a
+reconstruction module to reconstruct the observation based on aggregated
+feature representations. This learning-to-reconstruct idea is task-agnostic,
+and offers clear and reasonable supervision to inspire more effective
+collaboration, eventually promoting perception tasks. We validate CORE on
+OPV2V, a large-scale multi-agent perception dataset, in two tasks, i.e., 3D
+object detection and semantic segmentation. Results demonstrate that the model
+achieves state-of-the-art performance on both tasks, and is more
+communication-efficient.
+
+
+
+
+
+ + ☆ Bone mineral density estimation from a plain X-ray image by learning + decomposition into projections of bone-segmented computed tomography + + +
+ Osteoporosis is a prevalent bone disease that causes fractures in fragile
+bones, leading to a decline in daily living activities. Dual-energy X-ray
+absorptiometry (DXA) and quantitative computed tomography (QCT) are highly
+accurate for diagnosing osteoporosis; however, these modalities require special
+equipment and scan protocols. To frequently monitor bone health, low-cost,
+low-dose, and ubiquitously available diagnostic methods are highly anticipated.
+In this study, we aim to perform bone mineral density (BMD) estimation from a
+plain X-ray image for opportunistic screening, which is potentially useful for
+early diagnosis. Existing methods have used multi-stage approaches consisting
+of extraction of the region of interest and simple regression to estimate BMD,
+which require a large amount of training data. Therefore, we propose an
+efficient method that learns decomposition into projections of bone-segmented
+QCT for BMD estimation under limited datasets. The proposed method achieved
+high accuracy in BMD estimation, where Pearson correlation coefficients of
+0.880 and 0.920 were observed for the DXA-measured BMD and QCT-measured BMD
+estimation tasks, respectively, and the root mean square of the coefficient of
+variation values was 3.27 to 3.79% for four measurements with different poses.
+Furthermore, we conducted extensive validation experiments, including
+multi-pose, uncalibrated-CT, and compression experiments toward actual
+application in routine clinical practice.
+
+
+ comment: 20 pages and 22 figures +
+
+
+
+
+ + ☆ Redemption from Range-view for Accurate 3D Object Detection + + +
+ Most recent approaches for 3D object detection predominantly rely on +point-view or bird's-eye view representations, with limited exploration of +range-view-based methods. The range-view representation suffers from scale +variation and surface texture deficiency, both of which pose significant +limitations for developing corresponding methods. Notably, the surface texture +loss problem has been largely ignored by all existing methods, despite its +significant impact on the accuracy of range-view-based 3D object detection. In +this study, we propose Redemption from Range-view R-CNN (R2 R-CNN), a novel and +accurate approach that comprehensively explores the range-view representation. +Our proposed method addresses scale variation through the HD Meta Kernel, which +captures range-view geometry information in multiple scales. Additionally, we +introduce Feature Points Redemption (FPR) to recover the lost 3D surface +texture information from the range view, and Synchronous-Grid RoI Pooling +(S-Grid RoI Pooling), a multi-scaled approach with multiple receptive fields +for accurate box refinement. Our R2 R-CNN outperforms existing range-view-based +methods, achieving state-of-the-art performance on both the KITTI benchmark and +the Waymo Open Dataset. Our study highlights the critical importance of +addressing the surface texture loss problem for accurate 3D object detection in +range-view-based methods. Codes will be made publicly available. + +
+
+
+
+
+ + ☆ SA-BEV: Generating Semantic-Aware Bird's-Eye-View Feature for Multi-view + 3D Object Detection + + +
+ Recently, pure camera-based Bird's-Eye-View (BEV) perception has provided a
+feasible solution for economical autonomous driving. However, existing
+BEV-based multi-view 3D detectors generally transform all image features into
+BEV features, without considering the problem that the large proportion of
+background information may submerge the object information. In this paper, we
+propose Semantic-Aware BEV Pooling (SA-BEVPool), which can filter out
+background information according to the semantic segmentation of image features
+and transform image features into semantic-aware BEV features. Accordingly, we
+propose BEV-Paste, an effective data augmentation strategy that closely matches
+the semantic-aware BEV features. In addition, we design a Multi-Scale
+Cross-Task (MSCT) head, which combines task-specific and cross-task information
+to predict depth distribution and semantic segmentation more accurately,
+further improving the quality of the semantic-aware BEV features. Finally, we
+integrate the above modules into a novel multi-view 3D object detection
+framework, namely SA-BEV. Experiments on nuScenes show that SA-BEV achieves
+state-of-the-art performance. Code is available at
+https://github.com/mengtan00/SA-BEV.git.
+
+
+
+
+
+ + ☆ Robust Visual Question Answering: Datasets, Methods, and Future + Challenges + + +
+ Visual question answering (VQA) requires a system to provide an accurate
+natural language answer given an image and a natural language question.
+However, it is widely recognized that previous generic VQA methods often
+exhibit a tendency to memorize biases present in the training data rather than
+learning proper behaviors, such as grounding images before predicting answers.
+Therefore, these methods usually achieve high in-distribution but poor
+out-of-distribution performance. In recent years, various datasets and
+debiasing methods have been proposed to evaluate and enhance VQA robustness,
+respectively. This paper provides the first comprehensive survey focused on
+this emerging research direction. Specifically, we first provide an overview of
+the development process of datasets from in-distribution and
+out-of-distribution perspectives. Then, we examine the evaluation metrics
+employed by these datasets. Thirdly, we propose a typology that presents the
+development process, similarities and differences, robustness comparison, and
+technical features of existing debiasing methods. Furthermore, we analyze and
+discuss the robustness of representative vision-and-language pre-training
+models on VQA. Finally, through a thorough review of the available literature
+and experimental analysis, we discuss the key areas for future research from
+various viewpoints.
+
+
+ comment: IEEE TPAMI (Under Review) +
+
+
+
+
+ + ☆ Physics-Aware Semi-Supervised Underwater Image Enhancement + + +
+ Underwater images normally suffer from degradation due to the transmission
+medium of water bodies. Both traditional prior-based approaches and deep
+learning-based methods have been used to address this problem. However, the
+inflexible assumptions of the former often impair their effectiveness in
+handling diverse underwater scenes, while the generalization of the latter to
+unseen images is usually weakened by insufficient data. In this study, we
+leverage both the physics-based underwater Image Formation Model (IFM) and deep
+learning techniques for Underwater Image Enhancement (UIE). To this end, we
+propose a novel Physics-Aware Dual-Stream Underwater Image Enhancement Network,
+i.e., PA-UIENet, which comprises a Transmission Estimation Stream (T-Stream)
+and an Ambient Light Estimation Stream (A-Stream). This network fulfills the
+UIE task by explicitly estimating the degradation parameters of the IFM. We
+also adopt an IFM-inspired semi-supervised learning framework, which exploits
+both the labeled and unlabeled images, to address the issue of insufficient
+data. Our method performs better than, or at least comparably to, eight
+baselines across five testing sets in the degradation estimation and UIE tasks.
+This is likely because it can not only model the degradation but also learn the
+characteristics of diverse underwater scenes.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Distribution Shift Matters for Knowledge Distillation with Webly + Collected Images + + +
+ Knowledge distillation aims to learn a lightweight student network from a
+pre-trained teacher network. In practice, existing knowledge distillation
+methods are usually infeasible when the original training data is unavailable
+due to privacy issues and data management considerations. Therefore, data-free
+knowledge distillation approaches have been proposed to collect training
+instances from the Internet. However, most of them ignore the common
+distribution shift between the instances from the original training data and
+the webly collected data, affecting the reliability of the trained student
+network. To solve this problem, we propose a novel method dubbed ``Knowledge
+Distillation between Different Distributions'' (KD$^{3}$), which consists of
+three components. Specifically, we first dynamically select useful training
+instances from the webly collected data according to the combined predictions
+of the teacher and student networks. Subsequently, we align both the weighted
+features and classifier parameters of the two networks for knowledge
+memorization. Meanwhile, we also build a new contrastive learning block called
+MixDistribution to generate perturbed data with a new distribution for instance
+alignment, so that the student network can further learn a
+distribution-invariant representation. Intensive experiments on various
+benchmark datasets demonstrate that our proposed KD$^{3}$ can outperform the
+state-of-the-art data-free knowledge distillation approaches.
+
+
+
+
+
+ + ☆ MatSpectNet: Material Segmentation Network with Domain-Aware and + Physically-Constrained Hyperspectral Reconstruction + + +
+ Achieving accurate material segmentation for 3-channel RGB images is +challenging due to the considerable variation in a material's appearance. +Hyperspectral images, which are sets of spectral measurements sampled at +multiple wavelengths, theoretically offer distinct information for material +identification, as variations in intensity of electromagnetic radiation +reflected by a surface depend on the material composition of a scene. However, +existing hyperspectral datasets are impoverished regarding the number of images +and material categories for the dense material segmentation task, and +collecting and annotating hyperspectral images with a spectral camera is +prohibitively expensive. To address this, we propose a new model, the +MatSpectNet to segment materials with recovered hyperspectral images from RGB +images. The network leverages the principles of colour perception in modern +cameras to constrain the reconstructed hyperspectral images and employs the +domain adaptation method to generalise the hyperspectral reconstruction +capability from a spectral recovery dataset to material segmentation datasets. +The reconstructed hyperspectral images are further filtered using learned +response curves and enhanced with human perception. The performance of +MatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces +dataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase +in average pixel accuracy and a 3.42% improvement in mean class accuracy +compared with the most recent publication. The project code is attached to the +supplementary material and will be published on GitHub. + +
+
+ comment: 7 pages main content +
+
+
+
+
+ + ☆ Strip-MLP: Efficient Token Interaction for Vision MLP + + +
+ The token interaction operation is one of the core modules in MLP-based
+models, used to exchange and aggregate information between different spatial
+locations. However, the power of token interaction on the spatial dimension is
+highly dependent on the spatial resolution of the feature maps, which limits
+the model's expressive ability, especially in deep layers where the features
+are down-sampled to a small spatial size. To address this issue, we present a
+novel method called \textbf{Strip-MLP} to enrich the token interaction power in
+three ways. Firstly, we introduce a new MLP paradigm called the Strip MLP layer
+that allows a token to interact with other tokens in a cross-strip manner,
+enabling the tokens in a row (or column) to contribute to the information
+aggregations in adjacent but different strips of rows (or columns). Secondly, a
+\textbf{C}ascade \textbf{G}roup \textbf{S}trip \textbf{M}ixing \textbf{M}odule
+(CGSMM) is proposed to overcome the performance degradation caused by small
+spatial feature sizes. The module allows tokens to interact more effectively in
+both within-patch and cross-patch manners, independently of the feature spatial
+size. Finally, based on the Strip MLP layer, we propose a novel
+\textbf{L}ocal \textbf{S}trip \textbf{M}ixing \textbf{M}odule (LSMM) to boost
+the token interaction power in the local region. Extensive experiments
+demonstrate that Strip-MLP significantly improves the performance of MLP-based
+models on small datasets and obtains comparable or even better results on
+ImageNet. In particular, Strip-MLP models achieve higher average Top-1 accuracy
+than existing MLP-based models by +2.44\% on Caltech-101 and +2.16\% on
+CIFAR-100. The source code will be available at
+\href{https://github.com/Med-Process/Strip_MLP}{https://github.com/Med-Process/Strip\_MLP}.
+
+
+
+
+
+ + ☆ Attention Consistency Refined Masked Frequency Forgery Representation + for Generalizing Face Forgery Detection + + +
+ Due to the successful development of deep image generation technology, visual
+data forgery detection is playing an increasingly important role in social and
+economic security. Existing forgery detection methods suffer from
+unsatisfactory generalization ability when determining authenticity in unseen
+domains. In this paper, we propose a novel Attention Consistency refined Masked
+Frequency forgery representation model (ACMF) for generalizable face forgery
+detection. Most forgery technologies bring in high-frequency-aware cues, which
+make it easy to distinguish source authenticity but difficult to generalize to
+unseen artifact types. The masked frequency forgery representation module is
+designed to explore robust forgery cues by randomly discarding high-frequency
+information. In addition, we find that the forgery attention map inconsistency
+through the detection network could affect the generalizability. Thus, the
+forgery attention consistency is introduced to force detectors to focus on
+similar attention regions for better generalization ability. Experimental
+results on several public face forgery datasets (FaceForensic++, DFD, Celeb-DF,
+and WDF) demonstrate the superior performance of the proposed method compared
+with the state-of-the-art methods.
+
+
+ comment: The source code and models are publicly available at + https://github.com/chenboluo/ACMF +
+
+
+
+
+ + ☆ Batching for Green AI -- An Exploratory Study on Inference + + +
+ The batch size is an essential parameter to tune during the development of +new neural networks. Amongst other quality indicators, it has a large degree of +influence on the model's accuracy, generalisability, training times and +parallelisability. This fact is generally known and commonly studied. However, +during the application phase of a deep learning model, when the model is +utilised by an end-user for inference, we find that there is a disregard for +the potential benefits of introducing a batch size. In this study, we examine +the effect of input batching on the energy consumption and response times of +five fully-trained neural networks for computer vision that were considered +state-of-the-art at the time of their publication. The results suggest that +batching has a significant effect on both of these metrics. Furthermore, we +present a timeline of the energy efficiency and accuracy of neural networks +over the past decade. We find that in general, energy consumption rises at a +much steeper pace than accuracy and question the necessity of this evolution. +Additionally, we highlight one particular network, ShuffleNetV2(2018), that +achieved a competitive performance for its time while maintaining a much lower +energy consumption. Nevertheless, we highlight that the results are model +dependent. + +
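+
+ As a hedged illustration of the kind of measurement the study describes (not
+the authors' benchmark harness), the sketch below times inference of a vision
+model at several batch sizes and reports per-image latency; energy measurement
+would additionally require a power meter or NVML polling and is omitted here.
+The model, batch sizes, and input size are placeholder assumptions.
+
+    import time
+    import torch
+    import torchvision
+
+    def per_image_latency(model, batch_sizes=(1, 8, 32, 64), image_size=224, reps=10):
+        """Rough per-image inference latency versus batch size."""
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = model.eval().to(device)
+        results = {}
+        with torch.no_grad():
+            for bs in batch_sizes:
+                x = torch.randn(bs, 3, image_size, image_size, device=device)
+                model(x)  # warm-up
+                if device == "cuda":
+                    torch.cuda.synchronize()
+                start = time.perf_counter()
+                for _ in range(reps):
+                    model(x)
+                if device == "cuda":
+                    torch.cuda.synchronize()
+                results[bs] = (time.perf_counter() - start) / (reps * bs)
+        return results
+
+    # Example: per_image_latency(torchvision.models.shufflenet_v2_x2_0())
+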
+
+ comment: 8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series + on Software Engineering and Advanced Applications (SEAA) 2023 +
+
+
+
+
+ + ☆ FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural + Radiance Fields ICCV 2023 + + +
+ As recent advances in Neural Radiance Fields (NeRF) have enabled
+high-fidelity 3D face reconstruction and novel view synthesis, manipulation of
+such reconstructions has also become an essential task in 3D vision. However,
+existing manipulation methods require extensive human labor, such as a
+user-provided semantic mask and manual attribute search, which is unsuitable
+for non-expert users. Instead, our approach is designed to require only a
+single text to manipulate a face reconstructed with NeRF. To do so, we first
+train a scene manipulator, a latent code-conditional deformable NeRF, over a
+dynamic scene to control a face deformation using the latent code. However,
+representing a scene deformation with a single latent code is unfavorable for
+compositing local deformations observed in different instances. Therefore, our
+proposed Position-conditional Anchor Compositor (PAC) learns to represent a
+manipulated scene with spatially varying latent codes. Their renderings with
+the scene manipulator are then optimized to yield high cosine similarity to a
+target text in CLIP embedding space for text-driven manipulation. To the best
+of our knowledge, our approach is the first to address the text-driven
+manipulation of a face reconstructed with NeRF. Extensive results, comparisons,
+and ablation studies demonstrate the effectiveness of our approach.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ A Video-based Detector for Suspicious Activity in Examination with + OpenPose + + +
+ Examinations are a crucial part of the learning process, and academic +institutions invest significant resources into maintaining their integrity by +preventing cheating from students or facilitators. However, cheating has become +rampant in examination setups, compromising their integrity. The traditional +method of relying on invigilators to monitor every student is impractical and +ineffective. To address this issue, there is a need to continuously record exam +sessions to monitor students for suspicious activities. However, these +recordings are often too lengthy for invigilators to analyze effectively, and +fatigue may cause them to miss significant details. To widen the coverage, +invigilators could use fixed overhead or wearable cameras. This paper +introduces a framework that uses automation to analyze videos and detect +suspicious activities during examinations efficiently and effectively. We +utilized the OpenPose framework and Convolutional Neural Network (CNN) to +identify students exchanging objects during exams. This detection system is +vital in preventing cheating and promoting academic integrity, fairness, and +quality education for institutions. + +
+
+
+
+
+ + ☆ Deep Directly-Trained Spiking Neural Networks for Object Detection ICCV2023 + + +
+ Spiking neural networks (SNNs) are brain-inspired energy-efficient models
+that encode information in spatiotemporal dynamics. Recently, directly trained
+deep SNNs have shown great success in achieving high performance on
+classification tasks with very few time steps. However, how to design a
+directly-trained SNN for the regression task of object detection still remains
+a challenging problem. To address this problem, we propose EMS-YOLO, a novel
+directly-trained SNN framework for object detection, which is the first attempt
+to train a deep SNN with surrogate gradients for object detection rather than
+relying on ANN-SNN conversion strategies. Specifically, we design a full-spike
+residual block, EMS-ResNet, which can effectively extend the depth of the
+directly-trained SNN with low power consumption. Furthermore, we theoretically
+analyze and prove that EMS-ResNet can avoid gradient vanishing or exploding.
+The results demonstrate that our approach outperforms the state-of-the-art
+ANN-SNN conversion methods (at least 500 time steps) with far fewer time steps
+(only 4). Our model achieves performance comparable to an ANN with the same
+architecture while consuming 5.83 times less energy on the frame-based COCO
+Dataset and the event-based Gen1 Dataset.
+
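+
+ To make "directly trained with surrogate gradients" concrete, here is a
+generic sketch of a surrogate-gradient spiking activation in PyTorch (a common
+textbook formulation, not the EMS-YOLO code): the forward pass emits a hard
+spike, while the backward pass substitutes a smooth rectangular derivative
+around the firing threshold. The threshold and window width are illustrative.
+
+    import torch
+
+    class SurrogateSpike(torch.autograd.Function):
+        """Heaviside spike forward; rectangular surrogate gradient backward."""
+
+        @staticmethod
+        def forward(ctx, membrane_potential, threshold=1.0, width=0.5):
+            ctx.save_for_backward(membrane_potential)
+            ctx.threshold, ctx.width = threshold, width
+            return (membrane_potential >= threshold).float()
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            (v,) = ctx.saved_tensors
+            # Pass gradients only in a window around the firing threshold.
+            window = ((v - ctx.threshold).abs() < ctx.width).float() / (2 * ctx.width)
+            return grad_output * window, None, None
+
+    spike = SurrogateSpike.apply
+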
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+
+ ☆ Subject-Diffusion: Open Domain Personalized Text-to-Image Generation
+ without Test-time Fine-tuning
+
+
+
+ Recent progress in personalized image generation using diffusion models has +been significant. However, development in the area of open-domain and +non-fine-tuning personalized image generation is proceeding rather slowly. In +this paper, we propose Subject-Diffusion, a novel open-domain personalized +image generation model that, in addition to not requiring test-time +fine-tuning, also only requires a single reference image to support +personalized generation of single- or multi-subject in any domain. Firstly, we +construct an automatic data labeling tool and use the LAION-Aesthetics dataset +to construct a large-scale dataset consisting of 76M images and their +corresponding subject detection bounding boxes, segmentation masks and text +descriptions. Secondly, we design a new unified framework that combines text +and image semantics by incorporating coarse location and fine-grained reference +image control to maximize subject fidelity and generalization. Furthermore, we +also adopt an attention control mechanism to support multi-subject generation. +Extensive qualitative and quantitative results demonstrate that our method +outperforms other SOTA frameworks in single, multiple, and human customized +image generation. Please refer to our +\href{https://oppo-mente-lab.github.io/subject_diffusion/}{project page} + +
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ☆ Latent-OFER: Detect, Mask, and Reconstruct with Latent Vectors for + Occluded Facial Expression Recognition + + +
+ Most research on facial expression recognition (FER) is conducted in highly +controlled environments, but its performance is often unacceptable when applied +to real-world situations. This is because when unexpected objects occlude the +face, the FER network faces difficulties extracting facial features and +accurately predicting facial expressions. Therefore, occluded FER (OFER) is a +challenging problem. Previous studies on occlusion-aware FER have typically +required fully annotated facial images for training. However, collecting facial +images with various occlusions and expression annotations is time-consuming and +expensive. Latent-OFER, the proposed method, can detect occlusions, restore +occluded parts of the face as if they were unoccluded, and recognize them, +improving FER accuracy. This approach involves three steps: First, the vision +transformer (ViT)-based occlusion patch detector masks the occluded position by +training only latent vectors from the unoccluded patches using the support +vector data description algorithm. Second, the hybrid reconstruction network +generates the masking position as a complete image using the ViT and +convolutional neural network (CNN). Last, the expression-relevant latent vector +extractor retrieves and uses expression-related information from all latent +vectors by applying a CNN-based class activation map. This mechanism has a +significant advantage in preventing performance degradation from occlusion by +unseen objects. The experimental results on several databases demonstrate the +superiority of the proposed method over state-of-the-art methods. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Probabilistic Modeling of Inter- and Intra-observer Variability in + Medical Image Segmentation + + +
+ Medical image segmentation is a challenging task, particularly due to inter- +and intra-observer variability, even between medical experts. In this paper, we +propose a novel model, called Probabilistic Inter-Observer and iNtra-Observer +variation NetwOrk (Pionono). It captures the labeling behavior of each rater +with a multidimensional probability distribution and integrates this +information with the feature maps of the image to produce probabilistic +segmentation predictions. The model is optimized by variational inference and +can be trained end-to-end. It outperforms state-of-the-art models such as +STAPLE, Probabilistic U-Net, and models based on confusion matrices. +Additionally, Pionono predicts multiple coherent segmentation maps that mimic +the rater's expert opinion, which provides additional valuable information for +the diagnostic process. Experiments on real-world cancer segmentation datasets +demonstrate the high accuracy and efficiency of Pionono, making it a powerful +tool for medical image analysis. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ CLR: Channel-wise Lightweight Reprogramming for Continual Learning ICCV 2023 + + +
+ Continual learning aims to emulate the human ability to continually accumulate knowledge over sequential tasks. The main challenge is to maintain performance on previously learned tasks after learning new tasks, i.e., to avoid catastrophic forgetting. We propose a Channel-wise Lightweight Reprogramming (CLR) approach that helps convolutional neural networks (CNNs) overcome catastrophic forgetting during continual learning. We show that a CNN model trained on an old task (or a self-supervised proxy task) can be "reprogrammed" to solve a new task using our proposed lightweight (very cheap) reprogramming parameters. With the help of CLR, we achieve a better stability-plasticity trade-off for continual learning: to maintain stability and retain previous task ability, we use a common task-agnostic immutable part as the shared "anchor" parameter set. We then add task-specific lightweight reprogramming parameters to reinterpret the outputs of the immutable parts, enabling plasticity and integrating new knowledge. To learn sequential tasks, we only train the lightweight reprogramming parameters for each new task. Reprogramming parameters are task-specific and exclusive to each task, which makes our method immune to catastrophic forgetting. To minimize the parameter cost of reprogramming for new tasks, we make reprogramming lightweight by adjusting only essential kernels and learning channel-wise linear mappings from anchor parameters to task-specific domain knowledge. We show that, for general CNNs, the CLR parameter increase is less than 0.6% for any new task. Our method outperforms 13 state-of-the-art continual learning baselines on a new challenging sequence of 53 image classification datasets. Code and data are available at https://github.com/gyhandy/Channel-wise-Lightweight-Reprogramming
+
+ comment: ICCV 2023 +
+
+
+
+
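To make the channel-wise reprogramming idea above concrete, here is a hypothetical sketch: a frozen "anchor" convolution whose outputs are reinterpreted by a tiny per-channel (grouped 1x1) linear map that is the only part trained for a new task. The exact parameterization in CLR may differ; this is an assumption-laden simplification.

```python
# Sketch of channel-wise lightweight reprogramming on top of a frozen conv block.
# The exact reprogramming form in CLR may differ; here it is a per-task
# channel-wise linear map implemented as a grouped 1x1 convolution.
import torch
import torch.nn as nn

class ReprogrammedConv(nn.Module):
    def __init__(self, frozen_conv: nn.Conv2d):
        super().__init__()
        self.frozen = frozen_conv
        for p in self.frozen.parameters():
            p.requires_grad = False              # shared "anchor" stays immutable
        c = frozen_conv.out_channels
        # One scale/shift pair per channel: the only parameters trained per task.
        self.reprogram = nn.Conv2d(c, c, kernel_size=1, groups=c, bias=True)
        nn.init.ones_(self.reprogram.weight)     # start as an identity mapping
        nn.init.zeros_(self.reprogram.bias)

    def forward(self, x):
        return self.reprogram(self.frozen(x))

anchor = nn.Conv2d(3, 16, kernel_size=3, padding=1)
layer = ReprogrammedConv(anchor)
out = layer(torch.randn(2, 3, 32, 32))           # (2, 16, 32, 32)
```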
+ + ☆ LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent + Space + + +
+ Data Augmentation (DA) is a technique to increase the quantity and diversity of training data and thereby alleviate overfitting and improve generalisation. However, standard DA produces synthetic data for augmentation with limited diversity. Generative Adversarial Networks (GANs) may unlock additional information in a dataset by generating synthetic samples that have the appearance of real images. However, these models struggle to simultaneously address three key requirements: fidelity and high-quality samples; diversity and mode coverage; and fast sampling. Indeed, GANs generate high-quality samples rapidly but have poor mode coverage, limiting their adoption in DA applications. We propose LatentAugment, a DA strategy that overcomes the low diversity of GANs, opening them up for use in DA applications. Without external supervision, LatentAugment modifies latent vectors and moves them into latent space regions that maximise the synthetic images' diversity and fidelity. It is also agnostic to the dataset and the downstream task. A wide set of experiments shows that LatentAugment improves the generalisation of a deep model translating from MRI to CT, beating both standard DA and GAN-based sampling. Moreover, compared with GAN-based sampling, LatentAugment synthetic samples show superior mode coverage and diversity. Code is available at: https://github.com/ltronchin/LatentAugment.
+
+
+
+
+ + ☆ Photo2Relief: Let Human in the Photograph Stand Out + + +
+ In this paper, we propose a technique for making humans in photographs protrude like reliefs. Unlike previous methods, which mostly focus on the face and head, our method aims to generate artworks that describe the whole-body activity of the character. One challenge is that there is no ground truth for supervised deep learning. We introduce a sigmoid variant function to manipulate gradients tactfully and train our neural networks with a loss function defined in the gradient domain. The second challenge is that real photographs are often taken under different lighting conditions. We use an image-based rendering technique to address this challenge and acquire rendered images and depth data under different lighting conditions. To make a clear division of labor among network modules, a two-scale architecture is proposed to create high-quality reliefs from a single photograph. Extensive experimental results on a variety of scenes show that our method is a highly effective solution for generating digital 2.5D artwork from photographs.
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ☆ ParGANDA: Making Synthetic Pedestrians A Reality For Object Detection + + +
+ Object detection is the key technique behind a number of Computer Vision applications, but it often requires large amounts of annotated data to achieve decent results. Moreover, for pedestrian detection specifically, the collected data might contain personally identifiable information (PII), which is highly restricted in many countries. This label-intensive and privacy-sensitive task has recently led to an increasing interest in training detection models on synthetically generated pedestrian datasets collected with a photo-realistic video game engine. The engine is able to generate unlimited amounts of data with precise and consistent annotations, which offers potential for significant gains in real-world applications. However, the use of synthetic data for training introduces a synthetic-to-real domain shift that degrades the final performance. To close the gap between real and synthetic data, we propose to use a Generative Adversarial Network (GAN), which performs parameterized unpaired image-to-image translation to generate more realistic images. The key benefit of using the GAN is its intrinsic preference for low-level changes over geometric ones, which means annotations of a given synthetic image remain accurate even after domain translation, thus eliminating the need for labeling real data. We extensively experimented with the proposed method, using the MOTSynth dataset for training and the MOT17 and MOT20 detection datasets for testing, with experimental results demonstrating the effectiveness of this method. Our approach not only produces visually plausible samples but also requires no labels from the real domain, making it applicable to a variety of downstream tasks.
+
+
+
+
+ + ☆ Tuning Pre-trained Model via Moment Probing ICCV 2023 + + +
+ Recently, efficient fine-tuning of large-scale pre-trained models has attracted increasing research interest, where linear probing (LP), as a fundamental module, is involved in exploiting the final representations for task-dependent classification. However, most existing methods focus on how to effectively introduce a few learnable parameters, and little work pays attention to the commonly used LP module. In this paper, we propose a novel Moment Probing (MP) method to further explore the potential of LP. Distinguished from LP, which builds a linear classification head based on the mean of final features (e.g., word tokens for ViT) or classification tokens, our MP performs a linear classifier on the feature distribution, which provides stronger representation ability by exploiting the richer statistical information inherent in features. Specifically, we represent the feature distribution by its characteristic function, which is efficiently approximated using first- and second-order moments of features. Furthermore, we propose a multi-head convolutional cross-covariance (MHC$^3$) to compute second-order moments in an efficient and effective manner. Considering that MP could affect feature learning, we introduce a partially shared module to learn two recalibrating parameters (PSRP) for backbones based on MP, namely MP$_{+}$. Extensive experiments on ten benchmarks using various models show that our MP significantly outperforms LP and is competitive with counterparts at lower training cost, while MP$_{+}$ achieves state-of-the-art performance.
+
+ comment: Accepted to ICCV 2023; Project Page: + https://github.com/mingzeG/Moment-Probing +
+
+
+
+
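A much-simplified view of the moment-probing idea above: classify on first- and second-order statistics of the tokens rather than on the mean token alone. The multi-head convolutional cross-covariance (MHC^3) from the abstract is not reproduced; the per-feature second moment below is a stand-in chosen for brevity.

```python
# Simplified moment-probing head: classify on first- and second-order token
# statistics instead of the mean token alone. MHC^3 from the paper is not
# reproduced; a plain per-feature second moment serves as a stand-in.
import torch
import torch.nn as nn

class SimpleMomentProbe(nn.Module):
    def __init__(self, dim, n_classes):
        super().__init__()
        self.head = nn.Linear(2 * dim, n_classes)

    def forward(self, tokens):                   # tokens: (batch, n_tokens, dim)
        mu = tokens.mean(dim=1)                  # first-order moment
        second = (tokens ** 2).mean(dim=1)       # per-feature second moment
        return self.head(torch.cat([mu, second], dim=-1))

feats = torch.randn(8, 197, 768)                 # e.g., ViT token features
logits = SimpleMomentProbe(768, n_classes=10)(feats)
```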
+ + ☆ Character Time-series Matching For Robust License Plate Recognition + + +
+ Automatic License Plate Recognition (ALPR) is becoming a popular study area and is applied in many fields such as transportation and smart cities. However, there are still several limitations when applying many current methods to practical problems, due to variations in real-world situations such as light changes, unclear License Plate (LP) characters, and image quality. Most recent ALPR algorithms process a single frame, which reduces accuracy when image quality is poor. This paper presents methods to improve license plate recognition accuracy by tracking the license plate across multiple frames. First, an Adaptive License Plate Rotation algorithm is applied to correctly align the detected license plate. Second, we propose a method called Character Time-series Matching to recognize license plate characters from many consecutive frames. The proposed method achieves high performance on the UFPR-ALPR dataset, with 96.7% accuracy in real time on an RTX A5000 GPU. We also deploy the algorithm for a Vietnamese ALPR system, where the license plate detection and character recognition accuracies are 0.881 and 0.979 mAP@.5 on the test set, respectively. The source code is available at https://github.com/chequanghuy/Character-Time-series-Matching.git
+
+
+
+
+ + ☆ Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural + Radiance Fields ICCV 2023 + + +
+ Despite the tremendous progress in neural radiance fields (NeRF), we still +face a dilemma of the trade-off between quality and efficiency, e.g., MipNeRF +presents fine-detailed and anti-aliased renderings but takes days for training, +while Instant-ngp can accomplish the reconstruction in a few minutes but +suffers from blurring or aliasing when rendering at various distances or +resolutions due to ignoring the sampling area. To this end, we propose a novel +Tri-Mip encoding that enables both instant reconstruction and anti-aliased +high-fidelity rendering for neural radiance fields. The key is to factorize the +pre-filtered 3D feature spaces in three orthogonal mipmaps. In this way, we can +efficiently perform 3D area sampling by taking advantage of 2D pre-filtered +feature maps, which significantly elevates the rendering quality without +sacrificing efficiency. To cope with the novel Tri-Mip representation, we +propose a cone-casting rendering technique to efficiently sample anti-aliased +3D features with the Tri-Mip encoding considering both pixel imaging and +observing distance. Extensive experiments on both synthetic and real-world +datasets demonstrate our method achieves state-of-the-art rendering quality and +reconstruction speed while maintaining a compact representation that reduces +25% model size compared against Instant-ngp. + +
+
+ comment: Accepted to ICCV 2023 Project page: + https://wbhu.github.io/projects/Tri-MipRF +
+
+
+
+
+ + ☆ Improving Transferability of Adversarial Examples via Bayesian Attacks + + +
+ This paper presents a substantial extension of our work published at ICLR. +Our ICLR work advocated for enhancing transferability in adversarial examples +by incorporating a Bayesian formulation into model parameters, which +effectively emulates the ensemble of infinitely many deep neural networks, +while, in this paper, we introduce a novel extension by incorporating the +Bayesian formulation into the model input as well, enabling the joint +diversification of both the model input and model parameters. Our empirical +findings demonstrate that: 1) the combination of Bayesian formulations for both +the model input and model parameters yields significant improvements in +transferability; 2) by introducing advanced approximations of the posterior +distribution over the model input, adversarial transferability achieves further +enhancement, surpassing all state-of-the-arts when attacking without model +fine-tuning. Moreover, we propose a principled approach to fine-tune model +parameters in such an extended Bayesian formulation. The derived optimization +objective inherently encourages flat minima in the parameter space and input +space. Extensive experiments demonstrate that our method achieves a new +state-of-the-art on transfer-based attacks, improving the average success rate +on ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with +our ICLR basic Bayesian method. We will make our code publicly available. + +
+
+
+
+
+ + ☆ HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework + + +
+ In the field of autonomous driving, 3D object detection is a very important perception module. Although current SOTA algorithms combine Camera and Lidar sensors, the high price of Lidar means that mainstream deployment schemes rely on pure Camera sensors or Camera+Radar sensors. In this study, we propose a new detection algorithm called HVDetFusion, a multi-modal detection algorithm that not only supports pure camera data as input for detection, but can also fuse radar data with camera data. The camera stream does not depend on the input of Radar data, thus addressing a downside of previous methods. In the pure camera stream, we modify the framework of Bevdet4D for better perception and more efficient inference, and this stream produces the full 3D detection output. Further, to incorporate the benefits of Radar signals, we use prior information about different object positions to filter false positives from the original radar data. The positioning and radial velocity information recorded by the radar sensors is then used to supplement and fuse with the BEV features generated from the original camera data, and the effect is further improved during fusion training. Finally, HVDetFusion achieves a new state-of-the-art 67.4% NDS on the challenging nuScenes test set among all camera-radar 3D object detectors. The code is available at https://github.com/HVXLab/HVDetFusion
+
+
+
+
+ + ☆ XLDA: Linear Discriminant Analysis for Scaling Continual Learning to + Extreme Classification at the Edge ICML 2023 + + +
+ Streaming Linear Discriminant Analysis (LDA), while proven in class-incremental learning deployments at the edge with a limited number of classes (up to 1000), has not been proven for deployment in extreme classification scenarios. In this paper, we present: (a) XLDA, a framework for Class-IL in edge deployments where the LDA classifier is shown to be equivalent to an FC layer, including in extreme classification scenarios, and (b) optimizations to enable XLDA-based training and inference for edge deployment where available compute resources are constrained. We show up to a 42x speedup using a batched training approach and up to a 5x inference speedup with nearest-neighbor search on extreme datasets like AliProducts (50k classes) and Google Landmarks V2 (81k classes).
+
+ comment: Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop +
+
+
+
+
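The FC-equivalence that XLDA builds on can be sketched as follows: with a shared covariance, LDA's decision rule is linear, so its class means and precision matrix fold into fully connected weights and biases. The snippet below shows this in its batch (non-streaming) form with uniform class priors; the paper's streaming updates, batched training, and nearest-neighbor inference optimizations are not shown.

```python
# Sketch: LDA with a shared covariance reduces to a linear (FC) classifier,
# which is the equivalence XLDA builds on. Streaming updates and the paper's
# batching/NN-search optimizations are omitted; uniform class priors assumed.
import numpy as np

def lda_as_fc(X, y, n_classes, shrink=1e-3):
    d = X.shape[1]
    means = np.stack([X[y == c].mean(axis=0) for c in range(n_classes)])
    cov = np.cov(X, rowvar=False) + shrink * np.eye(d)     # shared covariance
    prec = np.linalg.inv(cov)
    W = means @ prec                                        # (n_classes, d) weights
    b = -0.5 * np.einsum('cd,dk,ck->c', means, prec, means) # per-class biases
    return W, b                                             # scores = X @ W.T + b

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 16))
y = rng.integers(0, 5, size=200)
W, b = lda_as_fc(X, y, n_classes=5)
pred = (X @ W.T + b).argmax(axis=1)                         # FC-style inference
```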
+ + ☆ Generating Image-Specific Text Improves Fine-grained Image + Classification + + +
+ Recent vision-language models outperform vision-only models on many image +classification tasks. However, because of the absence of paired text/image +descriptions, it remains difficult to fine-tune these models for fine-grained +image classification. In this work, we propose a method, GIST, for generating +image-specific fine-grained text descriptions from image-only datasets, and +show that these text descriptions can be used to improve classification. Key +parts of our method include 1. prompting a pretrained large language model with +domain-specific prompts to generate diverse fine-grained text descriptions for +each class and 2. using a pretrained vision-language model to match each image +to label-preserving text descriptions that capture relevant visual features in +the image. We demonstrate the utility of GIST by fine-tuning vision-language +models on the image-and-generated-text pairs to learn an aligned +vision-language representation space for improved classification. We evaluate +our learned representation space in full-shot and few-shot scenarios across +four diverse fine-grained classification datasets, each from a different +domain. Our method achieves an average improvement of $4.1\%$ in accuracy over +CLIP linear probes and an average of $1.1\%$ improvement in accuracy over the +previous state-of-the-art image-text classification method on the full-shot +datasets. Our method achieves similar improvements across few-shot regimes. +Code is available at https://github.com/emu1729/GIST. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
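The second step described above (matching each image to label-preserving generated descriptions with a pretrained vision-language model) could look roughly like this, using Hugging Face's CLIP as a stand-in. The descriptions, model checkpoint, and image are placeholders, not the paper's prompts or data.

```python
# Hedged sketch of the image-to-description matching step: score each image
# against LLM-generated fine-grained descriptions with a pretrained CLIP model.
# The descriptions below are placeholders, not the paper's actual prompts.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

descriptions = [
    "a small bird with a bright red crest and a short beak",
    "a small bird with brown streaked wings and a long tail",
]
image = Image.new("RGB", (224, 224))             # stand-in for a dataset image

inputs = processor(text=descriptions, images=image,
                   return_tensors="pt", padding=True)
with torch.no_grad():
    out = model(**inputs)
# Pick the label-preserving description that best matches the image.
best = out.logits_per_image.softmax(dim=-1).argmax(dim=-1)
print(descriptions[best.item()])
```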
+ + ☆ DPM-OT: A New Diffusion Probabilistic Model Based on Optimal Transport + + +
+ Sampling from diffusion probabilistic models (DPMs) can be viewed as a piecewise distribution transformation, which generally requires hundreds or thousands of steps along the inverse diffusion trajectory to obtain a high-quality image. Recent progress in designing fast samplers for DPMs achieves a trade-off between sampling speed and sample quality through knowledge distillation or by adjusting the variance schedule or the denoising equation. However, these approaches cannot be optimal in both aspects and often suffer from mode mixture in short steps. To tackle this problem, we regard inverse diffusion as an optimal transport (OT) problem between latents at different stages and propose DPM-OT, a unified learning framework for fast DPMs with a direct expressway represented by an OT map, which can generate high-quality samples within around 10 function evaluations. By calculating the semi-discrete optimal transport map between the data latents and white noise, we obtain an expressway from the prior distribution to the data distribution, while significantly alleviating the problem of mode mixture. In addition, we give the error bound of the proposed method, which theoretically guarantees the stability of the algorithm. Extensive experiments validate the effectiveness and advantages of DPM-OT in terms of speed and quality (FID and mode mixture), making it an efficient solution for generative modeling. Source code is available at https://github.com/cognaclee/DPM-OT
+
+ comment: Accepted by ICCV 2023
+
+
+
+
+ + ☆ EndoSurf: Neural Surface Reconstruction of Deformable Tissues with + Stereo Endoscope Videos MICCAI 2023 + + +
+ Reconstructing soft tissues from stereo endoscope videos is an essential +prerequisite for many medical applications. Previous methods struggle to +produce high-quality geometry and appearance due to their inadequate +representations of 3D scenes. To address this issue, we propose a novel +neural-field-based method, called EndoSurf, which effectively learns to +represent a deforming surface from an RGBD sequence. In EndoSurf, we model +surface dynamics, shape, and texture with three neural fields. First, 3D points +are transformed from the observed space to the canonical space using the +deformation field. The signed distance function (SDF) field and radiance field +then predict their SDFs and colors, respectively, with which RGBD images can be +synthesized via differentiable volume rendering. We constrain the learned shape +by tailoring multiple regularization strategies and disentangling geometry and +appearance. Experiments on public endoscope datasets demonstrate that EndoSurf +significantly outperforms existing solutions, particularly in reconstructing +high-fidelity shapes. Code is available at +https://github.com/Ruyi-Zha/endosurf.git. + +
+
+ comment: MICCAI 2023 (Early Accept); Ruyi Zha and Xuelian Cheng made equal + contributions. Corresponding author: Ruyi Zha (ruyi.zha@gmail.com) +
+
+
+
+
+ + ☆ MAS: Towards Resource-Efficient Federated Multiple-Task Learning ICCV'23 + + +
+ Federated learning (FL) is an emerging distributed machine learning method +that empowers in-situ model training on decentralized edge devices. However, +multiple simultaneous FL tasks could overload resource-constrained devices. In +this work, we propose the first FL system to effectively coordinate and train +multiple simultaneous FL tasks. We first formalize the problem of training +simultaneous FL tasks. Then, we present our new approach, MAS (Merge and +Split), to optimize the performance of training multiple simultaneous FL tasks. +MAS starts by merging FL tasks into an all-in-one FL task with a multi-task +architecture. After training for a few rounds, MAS splits the all-in-one FL +task into two or more FL tasks by using the affinities among tasks measured +during the all-in-one training. It then continues training each split of FL +tasks based on model parameters from the all-in-one training. Extensive +experiments demonstrate that MAS outperforms other methods while reducing +training time by 2x and reducing energy consumption by 40%. We hope this work +will inspire the community to further study and optimize training simultaneous +FL tasks. + +
+
+ comment: ICCV'23. arXiv admin note: substantial text overlap with + arXiv:2207.04202 +
+
+
+
+
+ + ☆ Screening Mammography Breast Cancer Detection + + +
+ Breast cancer is a leading cause of cancer-related deaths, but current +programs are expensive and prone to false positives, leading to unnecessary +follow-up and patient anxiety. This paper proposes a solution to automated +breast cancer detection, to improve the efficiency and accuracy of screening +programs. Different methodologies were tested against the RSNA dataset of +radiographic breast images of roughly 20,000 female patients and yielded an +average validation case pF1 score of 0.56 across methods. + +
+
+ comment: Released @ Apr 2023. For associated project files, see + https://github.com/chakrabortyde/rsna-breast-cancer +
+
+
+
+
+ + ♻ ☆ Terabyte-scale supervised 3D training and benchmarking dataset of the + mouse kidney + + +
+ The performance of machine learning algorithms, when used for segmenting 3D +biomedical images, does not reach the level expected based on results achieved +with 2D photos. This may be explained by the comparative lack of high-volume, +high-quality training datasets, which require state-of-the-art imaging +facilities, domain experts for annotation and large computational and personal +resources. The HR-Kidney dataset presented in this work bridges this gap by +providing 1.7 TB of artefact-corrected synchrotron radiation-based X-ray +phase-contrast microtomography images of whole mouse kidneys and validated +segmentations of 33 729 glomeruli, which corresponds to a one to two orders of +magnitude increase over currently available biomedical datasets. The image sets +also contain the underlying raw data, threshold- and morphology-based +semi-automatic segmentations of renal vasculature and uriniferous tubules, as +well as true 3D manual annotations. We therewith provide a broad basis for the +scientific community to build upon and expand in the fields of image +processing, data augmentation and machine learning, in particular unsupervised +and semi-supervised learning investigations, as well as transfer learning and +generative adversarial networks. + +
+
+
+
+
+ + ♻ ☆ More From Less: Self-Supervised Knowledge Distillation for Routine + Histopathology Data + + +
+ Medical imaging technologies are generating increasingly large amounts of +high-quality, information-dense data. Despite the progress, practical use of +advanced imaging technologies for research and diagnosis remains limited by +cost and availability, so information-sparse data such as H&E stains are relied +on in practice. The study of diseased tissue requires methods which can +leverage these information-dense data to extract more value from routine, +information-sparse data. Using self-supervised deep learning, we demonstrate +that it is possible to distil knowledge during training from information-dense +data into models which only require information-sparse data for inference. This +improves downstream classification accuracy on information-sparse data, making +it comparable with the fully-supervised baseline. We find substantial effects +on the learned representations, and this training process identifies subtle +features which otherwise go undetected. This approach enables the design of +models which require only routine images, but contain insights from +state-of-the-art data, allowing better use of the available resources. + +
+
+
+
+
+ + ♻ ☆ Enhancing Few-shot Image Classification with Cosine Transformer + + +
+ This paper addresses the few-shot image classification problem, where the classification task is performed on unlabeled query samples given only a small number of labeled support samples. One major challenge of the few-shot learning problem is the large variety of object visual appearances, which prevents the support samples from representing that object comprehensively. This might result in a significant difference between support and query samples, thereby undermining the performance of few-shot algorithms. In this paper, we tackle the problem by proposing the Few-shot Cosine Transformer (FS-CT), where the relational map between supports and queries is effectively obtained for few-shot tasks. FS-CT consists of two parts: a learnable prototypical embedding network to obtain categorical representations from support samples with hard cases, and a transformer encoder to effectively compute the relational map between support and query samples. We introduce Cosine Attention, a more robust and stable attention module that enhances the transformer module significantly and therefore improves FS-CT performance by 5% to over 20% in accuracy compared to the default scaled dot-product mechanism. Our method achieves competitive results on mini-ImageNet, CUB-200, and CIFAR-FS for 1-shot and 5-shot learning tasks across backbones and few-shot configurations. We also developed a custom few-shot dataset for yoga pose recognition to demonstrate the potential of our algorithm for practical applications. Our FS-CT with Cosine Attention is a lightweight, simple few-shot algorithm that can be applied to a wide range of applications, such as healthcare, medical imaging, and security surveillance. The official implementation code of our Few-shot Cosine Transformer is available at https://github.com/vinuni-vishc/Few-Shot-Cosine-Transformer
+
+
+
+
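The core of Cosine Attention is replacing the scaled dot product between queries and keys with cosine similarity. A minimal sketch follows; the temperature value and the absence of other FS-CT components (prototypical embedding, encoder layers) are simplifications, not the paper's exact formulation.

```python
# Minimal cosine attention: replace the scaled dot-product QK^T / sqrt(d) with
# cosine similarity between queries and keys. The temperature is an assumed
# value, not necessarily the paper's exact choice.
import torch
import torch.nn.functional as F

def cosine_attention(q, k, v, temperature=10.0):
    q = F.normalize(q, dim=-1)                   # unit-norm queries
    k = F.normalize(k, dim=-1)                   # unit-norm keys
    attn = torch.softmax(temperature * q @ k.transpose(-2, -1), dim=-1)
    return attn @ v

q = torch.randn(2, 5, 64)                        # (batch, queries, dim)
k = torch.randn(2, 9, 64)                        # (batch, supports, dim)
v = torch.randn(2, 9, 64)
out = cosine_attention(q, k, v)                  # (2, 5, 64)
```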
+ + ♻ ☆ Automated wildlife image classification: An active learning tool for + ecological applications + + +
+ Wildlife camera trap images are being used extensively to investigate animal +abundance, habitat associations, and behavior, which is complicated by the fact +that experts must first classify the images manually. Artificial intelligence +systems can take over this task but usually need a large number of +already-labeled training images to achieve sufficient performance. This +requirement necessitates human expert labor and poses a particular challenge +for projects with few cameras or short durations. We propose a label-efficient +learning strategy that enables researchers with small or medium-sized image +databases to leverage the potential of modern machine learning, thus freeing +crucial resources for subsequent analyses. + Our methodological proposal is two-fold: (1) We improve current strategies of +combining object detection and image classification by tuning the +hyperparameters of both models. (2) We provide an active learning (AL) system +that allows training deep learning models very efficiently in terms of required +human-labeled training images. We supply a software package that enables +researchers to use these methods directly and thereby ensure the broad +applicability of the proposed framework in ecological practice. + We show that our tuning strategy improves predictive performance. We +demonstrate how the AL pipeline reduces the amount of pre-labeled data needed +to achieve a specific predictive performance and that it is especially valuable +for improving out-of-sample predictive performance. + We conclude that the combination of tuning and AL increases predictive +performance substantially. Furthermore, we argue that our work can broadly +impact the community through the ready-to-use software package provided. +Finally, the publication of our models tailored to European wildlife data +enriches existing model bases mostly trained on data from Africa and North +America. + +
+
+
+
+
+ + ♻ ☆ MOISST: Multimodal Optimization of Implicit Scene for SpatioTemporal + calibration IROS2023 + + +
+ With the recent advances in autonomous driving and the decreasing cost of LiDARs, the use of multimodal sensor systems is on the rise. However, in order to make use of the information provided by a variety of complementary sensors, it is necessary to calibrate them accurately. We take advantage of recent advances in computer graphics and implicit volumetric scene representation to tackle the problem of multi-sensor spatial and temporal calibration. Thanks to a new formulation of the Neural Radiance Field (NeRF) optimization, we are able to jointly optimize calibration parameters along with the scene representation based on radiometric and geometric measurements. Our method enables accurate and robust calibration from data captured in uncontrolled and unstructured urban environments, making our solution more scalable than existing calibration solutions. We demonstrate the accuracy and robustness of our method in urban scenes typically encountered in autonomous driving scenarios.
+
+ comment: Accepted at IROS2023 Project site: https://qherau.github.io/MOISST/ +
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Medical Image Synthesis + + +
+ The demand for artificial intelligence (AI) in healthcare is rapidly +increasing. However, significant challenges arise from data scarcity and +privacy concerns, particularly in medical imaging. While existing generative +models have achieved success in image synthesis and image-to-image translation +tasks, there remains a gap in the generation of 3D semantic medical images. To +address this gap, we introduce Med-DDPM, a diffusion model specifically +designed for semantic 3D medical image synthesis, effectively tackling data +scarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation +of semantic conditioning, enabling precise control during the image generation +process. Our model outperforms Generative Adversarial Networks (GANs) in terms +of stability and performance, generating diverse and anatomically coherent +images with high visual fidelity. Comparative analysis against state-of-the-art +augmentation techniques demonstrates that Med-DDPM produces comparable results, +highlighting its potential as a data augmentation tool for enhancing model +accuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis +by delivering high-quality and anatomically coherent images. Furthermore, the +integration of semantic conditioning with Med-DDPM holds promise for image +anonymization in the field of biomedical imaging, showcasing the capabilities +of the model in addressing challenges related to data scarcity and privacy +concerns. + +
+
+
+
+
+ + ♻ ☆ Learning Foresightful Dense Visual Affordance for Deformable Object + Manipulation ICCV 2023 + + +
+ Understanding and manipulating deformable objects (e.g., ropes and fabrics) +is an essential yet challenging task with broad applications. Difficulties come +from complex states and dynamics, diverse configurations and high-dimensional +action space of deformable objects. Besides, the manipulation tasks usually +require multiple steps to accomplish, and greedy policies may easily lead to +local optimal states. Existing studies usually tackle this problem using +reinforcement learning or imitating expert demonstrations, with limitations in +modeling complex states or requiring hand-crafted expert policies. In this +paper, we study deformable object manipulation using dense visual affordance, +with generalization towards diverse states, and propose a novel kind of +foresightful dense affordance, which avoids local optima by estimating states' +values for long-term manipulation. We propose a framework for learning this +representation, with novel designs such as multi-stage stable learning and +efficient self-supervised data collection without experts. Experiments +demonstrate the superiority of our proposed foresightful dense affordance. +Project page: https://hyperplane-lab.github.io/DeformableAffordance + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ BoxSnake: Polygonal Instance Segmentation with Box Supervision ICCV 2023 + + +
+ Box-supervised instance segmentation has gained much attention as it requires +only simple box annotations instead of costly mask or polygon annotations. +However, existing box-supervised instance segmentation models mainly focus on +mask-based frameworks. We propose a new end-to-end training technique, termed +BoxSnake, to achieve effective polygonal instance segmentation using only box +annotations for the first time. Our method consists of two loss functions: (1) +a point-based unary loss that constrains the bounding box of predicted polygons +to achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss +that encourages the predicted polygons to fit the object boundaries. Compared +with the mask-based weakly-supervised methods, BoxSnake further reduces the +performance gap between the predicted segmentation and the bounding box, and +shows significant superiority on the Cityscapes dataset. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Transformer-based end-to-end classification of variable-length + volumetric data + + +
+ The automatic classification of 3D medical data is memory-intensive. Moreover, variation in the number of slices between samples is common. Naive solutions such as subsampling can solve these problems, but at the cost of potentially eliminating relevant diagnostic information. Transformers have shown promising performance for sequential data analysis. However, their application to long sequences is data-, computation-, and memory-demanding. In this paper, we propose an end-to-end Transformer-based framework that allows classifying volumetric data of variable length in an efficient fashion. In particular, by randomizing the input volume-wise resolution (#slices) during training, we enhance the capacity of the learnable positional embedding assigned to each volume slice. Consequently, the accumulated positional information in each positional embedding can be generalized to neighbouring slices, even for high-resolution volumes at test time. By doing so, the model becomes more robust to variable volume lengths and amenable to different computational budgets. We evaluated the proposed approach on retinal OCT volume classification and achieved a 21.96% average improvement in balanced accuracy on a 9-class diagnostic task, compared to state-of-the-art video transformers. Our findings show that varying the volume-wise resolution of the input during training results in more informative volume representations than training with a fixed number of slices per volume.
+
+
+
+
+ + ♻ ☆ VERITE: A Robust Benchmark for Multimodal Misinformation Detection + Accounting for Unimodal Bias + + +
+ Multimedia content has become ubiquitous on social media platforms, leading to the rise of multimodal misinformation (MM) and the urgent need for effective strategies to detect and prevent its spread. In recent years, the challenge of multimodal misinformation detection (MMD) has garnered significant attention from researchers and has mainly involved the creation of annotated, weakly annotated, or synthetically generated training datasets, along with the development of various deep learning MMD models. However, the problem of unimodal bias in MMD benchmarks -- where biased or unimodal methods outperform their multimodal counterparts on an inherently multimodal task -- has been overlooked. In this study, we systematically investigate and identify the presence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS), raising concerns about their suitability for reliable evaluation. To address this issue, we introduce the "VERification of Image-TExt pairs" (VERITE) benchmark for MMD, which incorporates real-world data, excludes "asymmetric multimodal misinformation" and utilizes "modality balancing". We conduct an extensive comparative study with a Transformer-based architecture that shows the ability of VERITE to effectively address unimodal bias, rendering it a robust evaluation framework for MMD. Furthermore, we introduce a new method -- termed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating realistic synthetic training data that preserve crossmodal relations between legitimate images and false human-written captions. By leveraging CHASMA in the training process, we observe consistent and notable improvements in predictive performance on VERITE, with a 9.2% increase in accuracy. We release our code at: https://github.com/stevejpapad/image-text-verification
+
+
+
+
+ + ♻ ☆ Self-Supervised Hyperspectral Inpainting with the Optimisation inspired + Deep Neural Network Prior SC + + +
+ Hyperspectral images (HSIs) cover hundreds or thousands of narrow spectral bands, conveying a wealth of spatial and spectral information. However, due to instrumental errors and atmospheric changes, HSIs obtained in practice are often contaminated by noise and dead pixels (lines), resulting in missing information that may severely compromise subsequent applications. We introduce a novel HSI missing-pixel prediction algorithm, called Low Rank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP is able to predict missing pixels and bands even when all spectral bands of the image are missing. The proposed LRS-PnP algorithm is further extended to a self-supervised model by combining LRS-PnP with the Deep Image Prior (DIP), called LRS-PnP-DIP. In a series of experiments with real data, LRS-PnP-DIP is shown to match or outperform other state-of-the-art learning-based inpainting methods.
+
+ comment: Presented in ISCS23 +
+
+
+
+
+ + ♻ ☆ Semantic Self-adaptation: Enhancing Generalization with a Single Sample + + +
+ The lack of out-of-domain generalization is a critical weakness of deep networks for semantic segmentation. Previous studies relied on the assumption of a static model, i.e., once the training process is complete, model parameters remain fixed at test time. In this work, we challenge this premise with a self-adaptive approach for semantic segmentation that adjusts the inference process to each input sample. Self-adaptation operates on two levels. First, it fine-tunes the parameters of convolutional layers to the input image using consistency regularization. Second, in Batch Normalization layers, self-adaptation interpolates between the training distribution and the reference distribution derived from a single test sample. Despite both techniques being well known in the literature, their combination sets new state-of-the-art accuracy on synthetic-to-real generalization benchmarks. Our empirical study suggests that self-adaptation may complement the established practice of model regularization at training time for improving deep network generalization to out-of-domain data. Our code and pre-trained models are available at https://github.com/visinf/self-adaptive.
+
+ comment: Published in TMLR (July 2023); OpenReview: + https://openreview.net/forum?id=ILNqQhGbLx; Code: + https://github.com/visinf/self-adaptive; Video: https://youtu.be/s4DG65ic0EA +
+
+
+
+
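The second level of self-adaptation described above interpolates Batch Normalization statistics between the stored training values and those of a single test sample. A rough sketch of that idea follows; the blending weight alpha and the hook-based implementation are assumptions, not the authors' code.

```python
# Sketch: interpolate BatchNorm statistics between the stored training values
# and those computed from a single test sample (alpha is an assumed weight).
import torch
import torch.nn as nn

@torch.no_grad()
def adapt_bn_stats(model, sample, alpha=0.5):
    """sample: a single input of shape (1, C, H, W)."""
    stats, hooks = {}, []

    def make_hook(name):
        def hook(module, inp, out):
            x = inp[0]  # activations entering the BN layer
            stats[name] = (x.mean(dim=(0, 2, 3)),
                           x.var(dim=(0, 2, 3), unbiased=False))
        return hook

    for name, m in model.named_modules():
        if isinstance(m, nn.BatchNorm2d):
            hooks.append(m.register_forward_hook(make_hook(name)))
    model.eval()
    model(sample)                                 # collect test-sample statistics
    for h in hooks:
        h.remove()
    # Blend training statistics with the single-sample reference distribution.
    for name, m in model.named_modules():
        if isinstance(m, nn.BatchNorm2d) and name in stats:
            mu, var = stats[name]
            m.running_mean.mul_(1 - alpha).add_(alpha * mu)
            m.running_var.mul_(1 - alpha).add_(alpha * var)

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
adapt_bn_stats(model, torch.randn(1, 3, 64, 64))
```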
+ + ♻ ☆ MSKdeX: Musculoskeletal (MSK) decomposition from an X-ray image for + fine-grained estimation of lean muscle mass and muscle volume MICCAI 2023 + + +
+ Musculoskeletal diseases such as sarcopenia and osteoporosis are major +obstacles to health during aging. Although dual-energy X-ray absorptiometry +(DXA) and computed tomography (CT) can be used to evaluate musculoskeletal +conditions, frequent monitoring is difficult due to the cost and accessibility +(as well as high radiation exposure in the case of CT). We propose a method +(named MSKdeX) to estimate fine-grained muscle properties from a plain X-ray +image, a low-cost, low-radiation, and highly accessible imaging modality, +through musculoskeletal decomposition leveraging fine-grained segmentation in +CT. We train a multi-channel quantitative image translation model to decompose +an X-ray image into projections of CT of individual muscles to infer the lean +muscle mass and muscle volume. We propose the object-wise intensity-sum loss, a +simple yet surprisingly effective metric invariant to muscle deformation and +projection direction, utilizing information in CT and X-ray images collected +from the same patient. While our method is basically an unpaired image-to-image +translation, we also exploit the nature of the bone's rigidity, which provides +the paired data through 2D-3D rigid registration, adding strong pixel-wise +supervision in unpaired training. Through the evaluation using a 539-patient +dataset, we showed that the proposed method significantly outperformed +conventional methods. The average Pearson correlation coefficient between the +predicted and CT-derived ground truth metrics was increased from 0.460 to +0.863. We believe our method opened up a new musculoskeletal diagnosis method +and has the potential to be extended to broader applications in multi-channel +quantitative image translation tasks. Our source code will be released soon. + +
+
+ comment: MICCAI 2023 early acceptance (12 pages and 6 figures) +
+
+
+
+
+ + ♻ ☆ Continual Learning for Abdominal Multi-Organ and Tumor Segmentation MICCAI-2023 + + +
+ The ability to dynamically extend a model to new data and classes is critical +for multiple organ and tumor segmentation. However, due to privacy regulations, +accessing previous data and annotations can be problematic in the medical +domain. This poses a significant barrier to preserving the high segmentation +accuracy of the old classes when learning from new classes because of the +catastrophic forgetting problem. In this paper, we first empirically +demonstrate that simply using high-quality pseudo labels can fairly mitigate +this problem in the setting of organ segmentation. Furthermore, we put forward +an innovative architecture designed specifically for continuous organ and tumor +segmentation, which incurs minimal computational overhead. Our proposed design +involves replacing the conventional output layer with a suite of lightweight, +class-specific heads, thereby offering the flexibility to accommodate newly +emerging classes. These heads enable independent predictions for newly +introduced and previously learned classes, effectively minimizing the impact of +new classes on old ones during the course of continual learning. We further +propose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings +into the organ-specific heads. These embeddings encapsulate the semantic +information of each class, informed by extensive image-text co-training. The +proposed method is evaluated on both in-house and public abdominal CT datasets +under organ and tumor segmentation tasks. Empirical results suggest that the +proposed design improves the segmentation performance of a baseline neural +network on newly-introduced and previously-learned classes along the learning +trajectory. + +
+
+ comment: MICCAI-2023 +
+
+
+
+
+ + ♻ ☆ Score-Based Generative Models for Medical Image Segmentation using + Signed Distance Functions + + +
+ Medical image segmentation is a crucial task that relies on the ability to accurately identify and isolate regions of interest in medical images. Generative approaches thereby allow capturing the statistical properties of segmentation masks that depend on the respective structures. In this work we propose a conditional score-based generative modeling framework to represent the signed distance function (SDF), leading to an implicit distribution of segmentation masks. The advantage of leveraging the SDF is a more natural distortion when compared to that of binary masks. By learning the score function of the conditional distribution of SDFs, we can accurately sample from the distribution of segmentation masks, allowing for the evaluation of statistical quantities. This probabilistic representation thus allows for the generation of uncertainty maps represented by the variance, which can aid in further analysis and enhance predictive robustness. We qualitatively and quantitatively illustrate competitive performance of the proposed method on a public nuclei and gland segmentation dataset, highlighting its potential utility in medical image segmentation applications.
+
+
+
+
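A small utility that makes the SDF representation above concrete: converting a binary mask into a signed distance function with SciPy's Euclidean distance transform (negative inside the object, positive outside). This is a generic helper, not the authors' implementation.

```python
# Sketch: turn a binary mask into a signed distance function (SDF), the
# representation this line of work models instead of the raw binary mask.
import numpy as np
from scipy.ndimage import distance_transform_edt

def mask_to_sdf(mask: np.ndarray) -> np.ndarray:
    """Negative inside the object, positive outside."""
    mask = mask.astype(bool)
    outside = distance_transform_edt(~mask)      # distance to the object
    inside = distance_transform_edt(mask)        # distance to the background
    return outside - inside

mask = np.zeros((64, 64), dtype=np.uint8)
mask[20:40, 20:40] = 1
sdf = mask_to_sdf(mask)                          # smooth, signed distances
```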
+ + ♻ ☆ FedForgery: Generalized Face Forgery Detection with Residual Federated + Learning + + +
+ With the continuous development of deep learning in the field of image +generation models, a large number of vivid forged faces have been generated and +spread on the Internet. These high-authenticity artifacts could grow into a +threat to society security. Existing face forgery detection methods directly +utilize the obtained public shared or centralized data for training but ignore +the personal privacy and security issues when personal data couldn't be +centralizedly shared in real-world scenarios. Additionally, different +distributions caused by diverse artifact types would further bring adverse +influences on the forgery detection task. To solve the mentioned problems, the +paper proposes a novel generalized residual Federated learning for face Forgery +detection (FedForgery). The designed variational autoencoder aims to learn +robust discriminative residual feature maps to detect forgery faces (with +diverse or even unknown artifact types). Furthermore, the general federated +learning strategy is introduced to construct distributed detection model +trained collaboratively with multiple local decentralized devices, which could +further boost the representation generalization. Experiments conducted on +publicly available face forgery detection datasets prove the superior +performance of the proposed FedForgery. The designed novel generalized face +forgery detection protocols and source code would be publicly available. + +
+
+ comment: The code is available at https://github.com/GANG370/FedForgery. The + paper has been accepted in the IEEE Transactions on Information Forensics & + Security +
+
+
+
+
+ + ♻ ☆ Confidence intervals for performance estimates in 3D medical image + segmentation + + +
+ Medical segmentation models are evaluated empirically. As such an evaluation is based on a limited set of example images, it is unavoidably noisy. Beyond a mean performance measure, reporting confidence intervals is thus crucial; however, this is rarely done in medical image segmentation. The width of the confidence interval depends on the test set size and on the spread of the performance measure (its standard deviation across the test set). For classification, many test images are needed to avoid wide confidence intervals. Segmentation, however, has not been studied, and it differs in the amount of information brought by a given test image. In this paper, we study the typical confidence intervals in medical image segmentation. We carry out experiments on 3D image segmentation using the standard nnU-net framework, two datasets from the Medical Decathlon challenge, and two performance measures: the Dice accuracy and the Hausdorff distance. We show that parametric confidence intervals are reasonable approximations of the bootstrap estimates for varying test set sizes and spreads of the performance metric. Importantly, we show that the test set size needed to achieve a given precision is often much lower than for classification tasks. Typically, a 1%-wide confidence interval requires about 100-200 test samples when the spread is low (standard deviation around 3%). More difficult segmentation tasks may lead to higher spreads and require over 1000 samples.
+
+ comment: 10 pages +
+
+
+
+
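The comparison reported above (parametric normal-approximation intervals versus bootstrap estimates) can be reproduced in a few lines on toy per-case Dice scores; the numbers below are synthetic and only illustrate the two estimators.

```python
# Sketch: parametric (normal-approximation) vs. percentile-bootstrap confidence
# intervals for a mean Dice score, on synthetic per-case scores.
import numpy as np

rng = np.random.default_rng(0)
dice = np.clip(rng.normal(0.85, 0.03, size=150), 0, 1)   # toy per-case Dice scores

mean = dice.mean()
sem = dice.std(ddof=1) / np.sqrt(len(dice))
parametric_ci = (mean - 1.96 * sem, mean + 1.96 * sem)

boots = np.array([rng.choice(dice, size=len(dice), replace=True).mean()
                  for _ in range(10_000)])
bootstrap_ci = tuple(np.percentile(boots, [2.5, 97.5]))

print(parametric_ci, bootstrap_ci)               # the two should largely agree
```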
+ + ♻ ☆ Gait Data Augmentation using Physics-Based Biomechanical Simulation + + +
+ This paper focuses on addressing the problem of data scarcity for gait +analysis. Standard augmentation methods may produce gait sequences that are not +consistent with the biomechanical constraints of human walking. To address this +issue, we propose a novel framework for gait data augmentation by using +OpenSIM, a physics-based simulator, to synthesize biomechanically plausible +walking sequences. The proposed approach is validated by augmenting the WBDS +and CASIA-B datasets and then training gait-based classifiers for 3D gender +gait classification and 2D gait person identification respectively. +Experimental results indicate that our augmentation approach can improve the +performance of model-based gait classifiers and deliver state-of-the-art +results for gait-based person identification with an accuracy of up to 96.11% +on the CASIA-B dataset. + +
+
+ comment: 30 pages including references, 5 Figures submitted to ESWA +
+
+
+
+
+ + ♻ ☆ SegNetr: Rethinking the local-global interactions and skip connections + in U-shaped networks + + +
+ Recently, U-shaped networks have dominated the field of medical image +segmentation due to their simple and easily tuned structure. However, existing +U-shaped segmentation networks: 1) mostly focus on designing complex +self-attention modules to compensate for the lack of long-term dependence based +on convolution operation, which increases the overall number of parameters and +computational complexity of the network; 2) simply fuse the features of encoder +and decoder, ignoring the connection between their spatial locations. In this +paper, we rethink the above problem and build a lightweight medical image +segmentation network, called SegNetr. Specifically, we introduce a novel +SegNetr block that can perform local-global interactions dynamically at any +stage and with only linear complexity. At the same time, we design a general +information retention skip connection (IRSC) to preserve the spatial location +information of encoder features and achieve accurate fusion with the decoder +features. We validate the effectiveness of SegNetr on four mainstream medical +image segmentation datasets, with 59\% and 76\% fewer parameters and GFLOPs +than vanilla U-Net, while achieving segmentation performance comparable to +state-of-the-art methods. Notably, the components proposed in this paper can be +applied to other U-shaped networks to improve their segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Shortcut Detection with Variational Autoencoders ICML 2023 + + +
+ For real-world applications of machine learning (ML), it is essential that +models make predictions based on well-generalizing features rather than +spurious correlations in the data. The identification of such spurious +correlations, also known as shortcuts, is a challenging problem and has so far +been scarcely addressed. In this work, we present a novel approach to detect +shortcuts in image and audio datasets by leveraging variational autoencoders +(VAEs). The disentanglement of features in the latent space of VAEs allows us +to discover feature-target correlations in datasets and semi-automatically +evaluate them for ML shortcuts. We demonstrate the applicability of our method +on several real-world datasets and identify shortcuts that have not been +discovered before. + +
+
+ comment: Accepted at the ICML 2023 Workshop on Spurious Correlations, + Invariance and Stability +
+
+
+
+
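One way to read the idea above: once a VAE is trained, its (roughly disentangled) latent dimensions can be ranked by correlation with the prediction target, and highly correlated dimensions become shortcut candidates for human review. The sketch below assumes encoder means are already available and uses a plain Pearson correlation, which may differ from the paper's exact scoring.

```python
# Sketch: rank (assumed already-trained) VAE latent dimensions by their
# correlation with the prediction target to flag candidate shortcut features.
import numpy as np

def rank_latents_by_target_correlation(latents, targets):
    """latents: (n_samples, n_dims) encoder means; targets: (n_samples,)."""
    t = (targets - targets.mean()) / (targets.std() + 1e-8)
    z = (latents - latents.mean(axis=0)) / (latents.std(axis=0) + 1e-8)
    corr = (z * t[:, None]).mean(axis=0)         # Pearson correlation per dimension
    order = np.argsort(-np.abs(corr))
    return order, corr

rng = np.random.default_rng(1)
z = rng.normal(size=(500, 32))
y = (z[:, 7] > 0).astype(float)                  # dimension 7 acts as a "shortcut"
order, corr = rank_latents_by_target_correlation(z, y)
print(order[:3], corr[order[0]])                 # dimension 7 should rank first
```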
+ + ♻ ☆ Towards Generalizable Diabetic Retinopathy Grading in Unseen Domains MICCAI 2023 + + +
+ Diabetic Retinopathy (DR) is a common complication of diabetes and a leading +cause of blindness worldwide. Early and accurate grading of its severity is +crucial for disease management. Although deep learning has shown great +potential for automated DR grading, its real-world deployment is still +challenging due to distribution shifts among source and target domains, known +as the domain generalization problem. Existing works have mainly attributed the +performance degradation to limited domain shifts caused by simple visual +discrepancies, which cannot handle complex real-world scenarios. Instead, we +present preliminary evidence suggesting the existence of three-fold +generalization issues: visual and degradation style shifts, diagnostic pattern +diversity, and data imbalance. To tackle these issues, we propose a novel +unified framework named Generalizable Diabetic Retinopathy Grading Network +(GDRNet). GDRNet consists of three vital components: fundus visual-artifact +augmentation (FundusAug), dynamic hybrid-supervised loss (DahLoss), and +domain-class-aware re-balancing (DCR). FundusAug generates realistic augmented +images via visual transformation and image degradation, while DahLoss jointly +leverages pixel-level consistency and image-level semantics to capture the +diverse diagnostic patterns and build generalizable feature representations. +Moreover, DCR mitigates the data imbalance from a domain-class view and avoids +undesired over-emphasis on rare domain-class pairs. Finally, we design a +publicly available benchmark for fair evaluations. Extensive comparison +experiments against advanced methods and exhaustive ablation studies +demonstrate the effectiveness and generalization ability of GDRNet. + +
+
+ comment: Early Accepted by MICCAI 2023, the 26th International Conference on + Medical Image Computing and Computer Assisted Intervention +
+
+
+
+
+ + ♻ ☆ Motion-Scenario Decoupling for Rat-Aware Video Position Prediction: + Strategy and Benchmark + + +
+ Recently significant progress has been made in human action recognition and +behavior prediction using deep learning techniques, leading to improved +vision-based semantic understanding. However, there is still a lack of +high-quality motion datasets for small bio-robotics, which presents more +challenging scenarios for long-term movement prediction and behavior control +based on third-person observation. In this study, we introduce RatPose, a +bio-robot motion prediction dataset constructed by considering the influence +factors of individuals and environments based on predefined annotation rules. +To enhance the robustness of motion prediction against these factors, we +propose a Dual-stream Motion-Scenario Decoupling (\textit{DMSD}) framework that +effectively separates scenario-oriented and motion-oriented features and +designs a scenario contrast loss and motion clustering loss for overall +training. With such distinctive architecture, the dual-branch feature flow +information is interacted and compensated in a decomposition-then-fusion +manner. Moreover, we demonstrate significant performance improvements of the +proposed \textit{DMSD} framework on different difficulty-level tasks. We also +implement long-term discretized trajectory prediction tasks to verify the +generalization ability of the proposed dataset. + +
+
+ comment: Rat, Video Position Prediction +
+
+
+
+
+ + ♻ ☆ MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image + Segmentation MICCAI 2023 + + +
+ There has been exploding interest in embracing Transformer-based
+architectures for medical image segmentation. However, the lack of large-scale
+annotated medical datasets makes achieving performance equivalent to that on
+natural images challenging. Convolutional networks, in contrast, have higher
+inductive biases and, consequently, are easily trainable to high performance.
+Recently, the ConvNeXt architecture attempted to modernize the standard ConvNet
+by mirroring Transformer blocks. In this work, we improve upon this to design a
+modernized and scalable convolutional architecture customized to the challenges
+of data-scarce medical settings. We introduce MedNeXt, a Transformer-inspired
+large kernel segmentation network which introduces: 1) A fully ConvNeXt 3D
+Encoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up
+and downsampling blocks to preserve semantic richness across scales, 3) A novel
+technique to iteratively increase kernel sizes by upsampling small kernel
+networks, to prevent performance saturation on limited medical data, 4)
+Compound scaling at multiple levels (depth, width, kernel size) of MedNeXt.
+This leads to state-of-the-art performance on 4 tasks on CT and MRI modalities
+and varying dataset sizes, representing a modernized deep architecture for
+medical image segmentation. Our code is made publicly available at:
+https://github.com/MIC-DKFZ/MedNeXt.
+
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Ord2Seq: Regarding Ordinal Regression as Label Sequence Prediction ICCV2023 + + +
+ Ordinal regression refers to classifying object instances into ordinal
+categories. It has been widely studied in many scenarios, such as medical
+disease grading, movie rating, etc. Existing methods have focused only on
+learning inter-class ordinal relationships, but thus far still have difficulty
+distinguishing adjacent categories. In this paper, we propose a simple
+sequence prediction framework for ordinal regression called Ord2Seq, which, for
+the first time, transforms each ordinal category label into a special label
+sequence and thus regards an ordinal regression task as a sequence prediction
+process. In this way, we decompose an ordinal regression task into a series of
+recursive binary classification steps, so as to subtly distinguish adjacent
+categories. Comprehensive experiments show the effectiveness of distinguishing
+adjacent categories for performance improvement and our new approach exceeds
+state-of-the-art performance in four different scenarios. Codes are available
+at https://github.com/wjh892521292/Ord2Seq.
+
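Illustrative only: the classic way to turn an ordinal label into a sequence of binary decisions ("is y > k?" targets, as in Frank & Hall). Ord2Seq builds its label sequences differently (via recursive dichotomies), but the underlying principle of ordinal regression as sequential binary classification is the same.

```python
# Encode/decode an ordinal label as a sequence of binary "greater than k" targets.
import numpy as np

def ordinal_to_binary_sequence(y: int, num_classes: int) -> np.ndarray:
    """Encode y in {0, ..., K-1} as K-1 binary targets [y>0, y>1, ...]."""
    return (y > np.arange(num_classes - 1)).astype(np.int64)

def binary_sequence_to_ordinal(seq: np.ndarray) -> int:
    """Decode by counting consecutive positive decisions from the left."""
    return int(np.cumprod(seq).sum())

print(ordinal_to_binary_sequence(3, num_classes=5))          # [1 1 1 0]
print(binary_sequence_to_ordinal(np.array([1, 1, 1, 0])))    # 3
```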
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ SiamixFormer: a fully-transformer Siamese network with temporal Fusion + for accurate building detection and change detection in bi-temporal remote + sensing images + + +
+ Building detection and change detection using remote sensing images can help
+urban and rescue planning. Moreover, they can be used for building damage
+assessment after natural disasters. Currently, most of the existing models for
+building detection use only one image (the pre-disaster image) to detect
+buildings. This is based on the idea that post-disaster images reduce the
+model's performance because of the presence of destroyed buildings. In this
+paper, we propose a siamese model, called SiamixFormer, which uses pre- and
+post-disaster images as input. Our model has two encoders and has a
+hierarchical transformer architecture. The output of each stage in both
+encoders is given to a temporal transformer for feature fusion, in a way that
+queries are generated from pre-disaster images and (key, value) pairs are
+generated from post-disaster images. In this way, temporal features are also
+considered in feature fusion. Another advantage of using temporal transformers
+in feature fusion is that they can better maintain the large receptive fields
+generated by transformer encoders compared with CNNs. Finally, the output of
+the temporal transformer is given to a simple MLP decoder at each stage. The
+SiamixFormer model is evaluated on the xBD and WHU datasets for building
+detection and on the LEVIR-CD and CDD datasets for change detection, and
+outperforms the state-of-the-art.
+
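A minimal sketch (not the authors' code) of the fusion idea described above: queries come from pre-disaster tokens and keys/values from post-disaster tokens, so the fused features attend to temporal changes. Module names, token counts, and dimensions are assumptions.

```python
# Cross-attention fusion: Q from the pre-disaster encoder, K/V from the post-disaster encoder.
import torch
import torch.nn as nn

class TemporalFusion(nn.Module):
    def __init__(self, dim: int = 256, heads: int = 8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, pre_tokens: torch.Tensor, post_tokens: torch.Tensor) -> torch.Tensor:
        # pre_tokens, post_tokens: (batch, num_tokens, dim) from the two encoder stages.
        fused, _ = self.attn(query=pre_tokens, key=post_tokens, value=post_tokens)
        return self.norm(pre_tokens + fused)   # residual connection

pre = torch.randn(2, 256, 256)    # toy token grids from the pre-disaster image
post = torch.randn(2, 256, 256)
print(TemporalFusion()(pre, post).shape)   # torch.Size([2, 256, 256])
```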
+
+
+
+
+ + ♻ ☆ LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network + + +
+ Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent
+blur is a challenging task. Existing blur map-based deblurring methods have
+demonstrated promising results. In this paper, we propose, to the best of our
+knowledge, the first framework to introduce the contrastive language-image
+pre-training framework (CLIP) to achieve accurate blur map estimation from DP
+pairs in an unsupervised manner. To this end, we first carefully design text
+prompts to enable CLIP to understand blur-related geometric prior knowledge
+from the DP pair. Then, we propose a format for feeding the stereo DP pair to
+CLIP without any fine-tuning, even though CLIP is pre-trained on monocular
+images. Given the estimated blur map, we introduce a blur-prior attention
+block, a blur-weighting loss and a blur-aware loss to recover the all-in-focus
+image. Our method achieves state-of-the-art performance in extensive
+experiments.
+
+
+
+
+
+ + ♻ ☆ Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced + Perception based on Joint-Embedding & Contextual Label Affinity + + +
+ Traditional computer vision models often require extensive manual effort for +data acquisition, annotation and validation, particularly when detecting subtle +behavioral nuances or events. The difficulty in distinguishing routine +behaviors from potential risks in real-world applications, such as +differentiating routine shopping from potential shoplifting, further +complicates the process. Moreover, these models may demonstrate high false +positive rates and imprecise event detection when exposed to real-world +scenarios that differ significantly from the conditions of the training data. + To overcome these hurdles, we present Ethosight, a novel zero-shot computer +vision system. Ethosight initiates with a clean slate based on user +requirements and semantic knowledge of interest. Using localized label affinity +calculations and a reasoning-guided iterative learning loop, Ethosight infers +scene details and iteratively refines the label set. Reasoning mechanisms can +be derived from large language models like GPT4, symbolic reasoners like +OpenNARS\cite{wang2013}\cite{wang2006}, or hybrid systems. + Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases, +spanning domains such as health, safety, and security. Detailed results and +case studies within the main body of this paper and an appendix underscore a +promising trajectory towards enhancing the adaptability and resilience of +computer vision models in detecting and extracting subtle and nuanced +behaviors. + +
+
+
+
+
+ + ♻ ☆ StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces ICCV 2023 + + +
+ Recent advances in face manipulation using StyleGAN have produced impressive +results. However, StyleGAN is inherently limited to cropped aligned faces at a +fixed image resolution it is pre-trained on. In this paper, we propose a simple +and effective solution to this limitation by using dilated convolutions to +rescale the receptive fields of shallow layers in StyleGAN, without altering +any model parameters. This allows fixed-size small features at shallow layers +to be extended into larger ones that can accommodate variable resolutions, +making them more robust in characterizing unaligned faces. To enable real face +inversion and manipulation, we introduce a corresponding encoder that provides +the first-layer feature of the extended StyleGAN in addition to the latent +style code. We validate the effectiveness of our method using unaligned face +inputs of various resolutions in a diverse set of face manipulation tasks, +including facial attribute editing, super-resolution, sketch/mask-to-face +translation, and face toonification. + +
+
+ comment: ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX + Project page: https://www.mmlab-ntu.com/project/styleganex/ +
+
+
+
+
+ + ♻ ☆ FSD: Fully-Specialized Detector via Neural Architecture Search + + +
+ Most generic object detectors are mainly built for standard object detection
+tasks such as COCO and PASCAL VOC. They might not work well and/or efficiently
+on tasks of other domains consisting of images that are visually different from
+standard datasets. To this end, many advances have been focused on adapting a
+general-purpose object detector with limited domain-specific designs. However,
+designing a successful task-specific detector requires extensive manual
+experiments and parameter tuning through trial and error. In this paper, we
+first propose and examine a fully-automatic pipeline to design a
+fully-specialized detector (FSD) which mainly incorporates a
+neural-architectural-searched model by exploring ideal network structures over
+the backbone and task-specific head. On the DeepLesion dataset, extensive
+results show that FSD can achieve a 3.1 mAP gain while using approximately 40%
+fewer parameters on the binary lesion detection task, and improves mAP by
+around 10% on the multi-type lesion detection task via our region-aware graph
+modeling, compared with existing general-purpose medical lesion detection
+networks.
+
+
+
+
+
+ + ♻ ☆ Learning Large Margin Sparse Embeddings for Open Set Medical Diagnosis + + +
+ Fueled by deep learning, computer-aided diagnosis achieves huge advances. +However, out of controlled lab environments, algorithms could face multiple +challenges. Open set recognition (OSR), as an important one, states that +categories unseen in training could appear in testing. In medical fields, it +could derive from incompletely collected training datasets and the constantly +emerging new or rare diseases. OSR requires an algorithm to not only correctly +classify known classes, but also recognize unknown classes and forward them to +experts for further diagnosis. To tackle OSR, we assume that known classes +could densely occupy small parts of the embedding space and the remaining +sparse regions could be recognized as unknowns. Following it, we propose Open +Margin Cosine Loss (OMCL) unifying two mechanisms. The former, called Margin +Loss with Adaptive Scale (MLAS), introduces angular margin for reinforcing +intra-class compactness and inter-class separability, together with an adaptive +scaling factor to strengthen the generalization capacity. The latter, called +Open-Space Suppression (OSS), opens the classifier by recognizing sparse +embedding space as unknowns using proposed feature space descriptors. Besides, +since medical OSR is still a nascent field, two publicly available benchmark +datasets are proposed for comparison. Extensive ablation studies and feature +visualization demonstrate the effectiveness of each design. Compared with +state-of-the-art methods, MLAS achieves superior performances, measured by ACC, +AUROC, and OSCR. + +
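A rough sketch of the kind of angular-margin cosine loss that MLAS, as described in the abstract, builds on (a CosFace-style formulation). The adaptive scaling factor and the Open-Space Suppression descriptors are not reproduced here; margin and scale values are illustrative assumptions.

```python
# Large-margin cosine loss: subtract a margin from the target-class cosine logit.
import torch
import torch.nn.functional as F

def margin_cosine_loss(features: torch.Tensor, weights: torch.Tensor,
                       labels: torch.Tensor, margin: float = 0.35,
                       scale: float = 30.0) -> torch.Tensor:
    # Cosine similarity between L2-normalised features and class weight vectors.
    cosine = F.normalize(features) @ F.normalize(weights).t()        # (batch, classes)
    one_hot = F.one_hot(labels, num_classes=weights.shape[0]).to(cosine.dtype)
    logits = scale * (cosine - margin * one_hot)                     # margin on target class only
    return F.cross_entropy(logits, labels)

feats = torch.randn(16, 128)
class_weights = torch.randn(10, 128)
labels = torch.randint(0, 10, (16,))
print(margin_cosine_loss(feats, class_weights, labels))
```

The margin enforces intra-class compactness and inter-class separability; in the open-set setting, regions of the normalised feature space left sparse by this training are the ones OSS would treat as unknown.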
+
+
+
+
+ + ♻ ☆ Reverse Knowledge Distillation: Training a Large Model using a Small One + for Retinal Image Matching on Limited Data + + +
+ Retinal image matching plays a crucial role in monitoring disease progression
+and treatment response. However, datasets with matched keypoints between
+temporally separated pairs of images are not available in abundance to train
+transformer-based models. We propose a novel approach based on reverse
+knowledge distillation to train large models with limited data while preventing
+overfitting. Firstly, we propose architectural modifications to a CNN-based
+semi-supervised method called SuperRetina that help us improve its results on a
+publicly available dataset. Then, we train a computationally heavier model
+based on a vision transformer encoder using the lighter CNN-based model, which
+is counter-intuitive in the field of knowledge distillation, where training
+lighter models based on heavier ones is the norm. Surprisingly, such reverse
+knowledge distillation improves generalization even further. Our experiments
+suggest that high-dimensional fitting in representation space may prevent
+overfitting, unlike training directly to match the final output. We also
+provide a public dataset with annotations for retinal image keypoint detection
+and matching to help the research community develop algorithms for retinal
+image applications.
+
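A schematic of the "reverse" distillation objective as described above: a heavier transformer student is fit to the representations of a lighter, already-trained CNN teacher. The modules below are toy placeholders, not SuperRetina or the paper's ViT encoder.

```python
# One training step: the student matches the teacher in representation space (MSE).
import torch
import torch.nn as nn

def reverse_distillation_step(student: nn.Module, teacher: nn.Module,
                              images: torch.Tensor,
                              optimizer: torch.optim.Optimizer) -> float:
    teacher.eval()
    with torch.no_grad():
        target = teacher(images)                    # (batch, dim) teacher representation
    pred = student(images)                          # (batch, dim) student representation
    loss = nn.functional.mse_loss(pred, target)     # fit in representation space, not outputs
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Toy stand-ins for the lighter CNN teacher and the heavier transformer student.
teacher = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 64))
student = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 256), nn.GELU(), nn.Linear(256, 64))
opt = torch.optim.Adam(student.parameters(), lr=1e-4)
print(reverse_distillation_step(student, teacher, torch.randn(8, 3, 32, 32), opt))
```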
+
+
+
+
+ + ♻ ☆ Exact Diffusion Inversion via Bi-directional Integration Approximation + + +
+ Recently, different methods have been proposed to address the inconsistency
+issue of DDIM inversion to enable image editing, such as EDICT
+\cite{Wallace23EDICT} and Null-text inversion \cite{Mokady23NullTestInv}.
+However, the above methods introduce considerable computational overhead. In
+this paper, we propose a new technique, named \emph{bi-directional integration
+approximation} (BDIA), to perform exact diffusion inversion with negligible
+computational overhead. Suppose we would like to estimate the next diffusion
+state $\boldsymbol{z}_{i-1}$ at timestep $t_i$ with the historical information
+$(i,\boldsymbol{z}_i)$ and $(i+1,\boldsymbol{z}_{i+1})$. We first obtain the
+estimated Gaussian noise $\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i)$, and
+then apply the DDIM update procedure twice for approximating the ODE
+integration over the next time-slot $[t_i, t_{i-1}]$ in the forward manner and
+the previous time-slot $[t_i, t_{i+1}]$ in the backward manner. The DDIM step
+for the previous time-slot is used to refine the integration approximation made
+earlier when computing $\boldsymbol{z}_i$. One nice property of BDIA-DDIM is
+that the update expression for $\boldsymbol{z}_{i-1}$ is a linear combination
+of $(\boldsymbol{z}_{i+1}, \boldsymbol{z}_i,
+\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i))$. This allows for exact
+backward computation of $\boldsymbol{z}_{i+1}$ given $(\boldsymbol{z}_i,
+\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. Experiments
+on both image reconstruction and image editing were conducted, confirming our
+statement. BDIA can also be applied to improve the performance of other ODE
+solvers in addition to DDIM. In our work, it is found that applying BDIA to the
+EDM sampling procedure produces a slightly better FID score on CIFAR10.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2304.11328 +
+
+
+
+
+ + ♻ ☆ Name Your Colour For the Task: Artificially Discover Colour Naming via + Colour Quantisation Transformer + + +
+ The long-standing theory that a colour-naming system evolves under the dual
+pressure of efficient communication and perceptual mechanism is supported by
+more and more linguistic studies, including analysing four decades of
+diachronic data from the Nafaanra language. This inspires us to explore whether
+machine learning could evolve and discover a similar colour-naming system via
+optimising the communication efficiency represented by high-level recognition
+performance. Here, we propose a novel colour quantisation transformer,
+CQFormer, that quantises colour space while maintaining the accuracy of machine
+recognition on the quantised images. Given an RGB image, the Annotation Branch
+maps it into an index map before generating the quantised image with a colour
+palette; meanwhile, the Palette Branch utilises a key-point detection approach
+to find proper colours in the palette from the whole colour space. By
+interacting with the colour annotation, CQFormer is able to balance both
+machine vision accuracy and colour perceptual structure, such as a distinct and
+stable colour distribution, for the discovered colour system. Very
+interestingly, we even observe a consistent evolution pattern between our
+artificial colour system and basic colour terms across human languages.
+Besides, our colour quantisation also offers an efficient compression method
+that effectively reduces image storage while maintaining high performance in
+high-level recognition tasks such as classification and detection. Extensive
+experiments demonstrate the superior performance of our method with extremely
+low bit-rate colours, showing the potential to be integrated into quantisation
+networks, extending quantisation from images to network activations. The source
+code is available at https://github.com/ryeocthiv/CQFormer
+
+
+
+
+
+ + ♻ ☆ Deep Multiview Clustering by Contrasting Cluster Assignments + + +
+ Multiview clustering (MVC) aims to reveal the underlying structure of +multiview data by categorizing data samples into clusters. Deep learning-based +methods exhibit strong feature learning capabilities on large-scale datasets. +For most existing deep MVC methods, exploring the invariant representations of +multiple views is still an intractable problem. In this paper, we propose a +cross-view contrastive learning (CVCL) method that learns view-invariant +representations and produces clustering results by contrasting the cluster +assignments among multiple views. Specifically, we first employ deep +autoencoders to extract view-dependent features in the pretraining stage. Then, +a cluster-level CVCL strategy is presented to explore consistent semantic label +information among the multiple views in the fine-tuning stage. Thus, the +proposed CVCL method is able to produce more discriminative cluster assignments +by virtue of this learning strategy. Moreover, we provide a theoretical +analysis of soft cluster assignment alignment. Extensive experimental results +obtained on several datasets demonstrate that the proposed CVCL method +outperforms several state-of-the-art approaches. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Collaborative Perception in Autonomous Driving: Methods, Datasets and + Challenges + + +
+ Collaborative perception is essential to address occlusion and sensor failure
+issues in autonomous driving. In recent years, theoretical and experimental
+investigations of novel works for collaborative perception have increased
+tremendously. So far, however, few reviews have focused on systematic
+collaboration modules and large-scale collaborative perception datasets. This
+work reviews recent achievements in this field to bridge this gap and motivate
+future research. We start with a brief overview of collaboration schemes. After
+that, we systematically summarize the collaborative perception methods for
+ideal scenarios and real-world issues. The former focuses on collaboration
+modules and efficiency, and the latter is devoted to addressing the problems in
+actual application. Furthermore, we present large-scale public datasets and
+summarize quantitative results on these benchmarks. Finally, we highlight the
+gaps and overlooked challenges between current academic research and real-world
+applications. The project page is
+https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving
+
+
+ comment: 18 pages, 6 figures. Accepted by IEEE Intelligent Transportation + Systems Magazine. URL: + https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving +
+
+
+
+
+ + ♻ ☆ AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of + Diffusion Probabilistic Models + + +
+ Existing customization methods require access to multiple reference examples +to align pre-trained diffusion probabilistic models (DPMs) with user-provided +concepts. This paper aims to address the challenge of DPM customization when +the only available supervision is a differentiable metric defined on the +generated contents. Since the sampling procedure of DPMs involves recursive +calls to the denoising UNet, na\"ive gradient backpropagation requires storing +the intermediate states of all iterations, resulting in extremely high memory +consumption. To overcome this issue, we propose a novel method AdjointDPM, +which first generates new samples from diffusion models by solving the +corresponding probability-flow ODEs. It then uses the adjoint sensitivity +method to backpropagate the gradients of the loss to the models' parameters +(including conditioning signals, network weights, and initial noises) by +solving another augmented ODE. To reduce numerical errors in both the forward +generation and gradient backpropagation processes, we further reparameterize +the probability-flow ODE and augmented ODE as simple non-stiff ODEs using +exponential integration. Finally, we demonstrate the effectiveness of +AdjointDPM on three interesting tasks: converting visual effects into +identification text embeddings, finetuning DPMs for specific types of +stylization, and optimizing initial noise to generate adversarial samples for +security auditing. + +
+
+
+
+
+ + ♻ ☆ Invariant Slot Attention: Object Discovery with Slot-Centric Reference + Frames ICML 2023 + + +
+ Automatically discovering composable abstractions from raw perceptual data is +a long-standing challenge in machine learning. Recent slot-based neural +networks that learn about objects in a self-supervised manner have made +exciting progress in this direction. However, they typically fall short at +adequately capturing spatial symmetries present in the visual world, which +leads to sample inefficiency, such as when entangling object appearance and +pose. In this paper, we present a simple yet highly effective method for +incorporating spatial symmetries via slot-centric reference frames. We +incorporate equivariance to per-object pose transformations into the attention +and generation mechanism of Slot Attention by translating, scaling, and +rotating position encodings. These changes result in little computational +overhead, are easy to implement, and can result in large gains in terms of data +efficiency and overall improvements to object discovery. We evaluate our method +on a wide range of synthetic object discovery benchmarks namely CLEVR, +Tetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising +improvements on the challenging real-world Waymo Open dataset. + +
+
+ comment: Accepted at ICML 2023. Project page: https://invariantsa.github.io/ +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Alleviating the Long-Tail Problem in Conversational Recommender Systems + + +
+ Conversational recommender systems (CRS) aim to provide the recommendation
+service via natural language conversations. To develop an effective CRS,
+high-quality CRS datasets are very crucial. However, existing CRS datasets
+suffer from the long-tail issue, \ie a large proportion of items are rarely (or
+even never) mentioned in the conversations; these are called long-tail items.
+As a result, the CRSs trained on these datasets tend to recommend frequent
+items, and the diversity of the recommended items would be largely reduced,
+making it easier for users to get bored.
+ To address this issue, this paper presents \textbf{LOT-CRS}, a novel
+framework that focuses on simulating and utilizing a balanced CRS dataset (\ie
+covering all the items evenly) for improving \textbf{LO}ng-\textbf{T}ail
+recommendation performance of CRSs. In our approach, we design two pre-training
+tasks to enhance the understanding of simulated conversations for long-tail
+items, and adopt retrieval-augmented fine-tuning with a label smoothness
+strategy to further improve the recommendation of long-tail items. Extensive
+experiments on two public CRS datasets have demonstrated the effectiveness and
+extensibility of our approach, especially on long-tail recommendation.
+
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Identifying document similarity using a fast estimation of the + Levenshtein Distance based on compression and signatures + + +
+ Identifying document similarity has many applications, e.g., source code
+analysis or plagiarism detection. However, identifying similarities is not
+trivial and can have high time complexity. For instance, the Levenshtein
+Distance is a common metric to define the similarity between two documents but
+has quadratic runtime, which makes it impractical for large documents, where
+'large' starts at a few hundred kilobytes. In this paper, we present a novel
+concept that allows estimating the Levenshtein Distance: the algorithm first
+compresses documents to signatures (similar to hash values) using a
+user-defined compression ratio. Signatures can then be compared against each
+other (some constraints apply), where the outcome is the estimated Levenshtein
+Distance. Our evaluation shows promising results in terms of runtime efficiency
+and accuracy. In addition, we introduce a significance score allowing examiners
+to set a threshold and identify related documents.
+
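For reference, the exact Levenshtein Distance that the signature scheme approximates, in its standard O(len(a) * len(b)) dynamic-programming form; this quadratic cost is what makes the estimation attractive for large documents.

```python
# Classic two-row Levenshtein DP (exact, quadratic time, linear memory).
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

print(levenshtein("kitten", "sitting"))   # 3
```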
+
+ comment: In: Proceedings of the Digital Forensics Research Conference Europe + (DFRWS EU). 2022 +
+
+
+
+
+ + ☆ Analysis of Elephant Movement in Sub-Saharan Africa: Ecological, + Climatic, and Conservation Perspectives + + +
+ The interaction between elephants and their environment has profound +implications for both ecology and conservation strategies. This study presents +an analytical approach to decipher the intricate patterns of elephant movement +in Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal +variations and rainfall patterns. Despite the complexities surrounding these +influential factors, our analysis provides a holistic view of elephant +migratory behavior in the context of the dynamic African landscape. Our +comprehensive approach enables us to predict the potential impact of these +ecological determinants on elephant migration, a critical step in establishing +informed conservation strategies. This projection is particularly crucial given +the impacts of global climate change on seasonal and rainfall patterns, which +could substantially influence elephant movements in the future. The findings of +our work aim to not only advance the understanding of movement ecology but also +foster a sustainable coexistence of humans and elephants in Sub-Saharan Africa. +By predicting potential elephant routes, our work can inform strategies to +minimize human-elephant conflict, effectively manage land use, and enhance +anti-poaching efforts. This research underscores the importance of integrating +movement ecology and climatic variables for effective wildlife management and +conservation planning. + +
+
+ comment: 11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on + Computing and Sustainable Societies (COMPASS 2023) +
+
+
+
+
+ + ☆ MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through + Multi-Answer Open-Domain Question Answering SIGIR 2023 + + +
+ Check-worthy claim detection aims at providing plausible misinformation to +downstream fact-checking systems or human experts to check. This is a crucial +step toward accelerating the fact-checking process. Many efforts have been put +into how to identify check-worthy claims from a small scale of pre-collected +claims, but how to efficiently detect check-worthy claims directly from a +large-scale information source, such as Twitter, remains underexplored. To fill +this gap, we introduce MythQA, a new multi-answer open-domain question +answering(QA) task that involves contradictory stance mining for query-based +large-scale check-worthy claim detection. The idea behind this is that +contradictory claims are a strong indicator of misinformation that merits +scrutiny by the appropriate authorities. To study this task, we construct +TweetMythQA, an evaluation dataset containing 522 factoid multi-answer +questions based on controversial topics. Each question is annotated with +multiple answers. Moreover, we collect relevant tweets for each distinct +answer, then classify them into three categories: "Supporting", "Refuting", and +"Neutral". In total, we annotated 5.3K tweets. Contradictory evidence is +collected for all answers in the dataset. Finally, we present a baseline system +for MythQA and evaluate existing NLP models for each system component using the +TweetMythQA dataset. We provide initial benchmarks and identify key challenges +for future models to improve upon. Code and data are available at: +https://github.com/TonyBY/Myth-QA + +
+
+ comment: Accepted by SIGIR 2023 +
+
+
+
+
+ + ♻ ☆ Going Beyond Local: Global Graph-Enhanced Personalized News + Recommendations + + +
+ Precisely recommending candidate news articles to users has always been a +core challenge for personalized news recommendation systems. Most recent works +primarily focus on using advanced natural language processing techniques to +extract semantic information from rich textual data, employing content-based +methods derived from local historical news. However, this approach lacks a +global perspective, failing to account for users' hidden motivations and +behaviors beyond semantic information. To address this challenge, we propose a +novel model called GLORY (Global-LOcal news Recommendation sYstem), which +combines global representations learned from other users with local +representations to enhance personalized recommendation systems. We accomplish +this by constructing a Global-aware Historical News Encoder, which includes a +global news graph and employs gated graph neural networks to enrich news +representations, thereby fusing historical news representations by a historical +news aggregator. Similarly, we extend this approach to a Global Candidate News +Encoder, utilizing a global entity graph and a candidate news aggregator to +enhance candidate news representation. Evaluation results on two public news +datasets demonstrate that our method outperforms existing approaches. +Furthermore, our model offers more diverse recommendations. + +
+
+ comment: 10 pages, Recsys 2023 +
+
+
+
+
+ + ♻ ☆ Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques + + +
+ In the contemporary digital landscape, online reviews have become an +indispensable tool for promoting products and services across various +businesses. Marketers, advertisers, and online businesses have found incentives +to create deceptive positive reviews for their products and negative reviews +for their competitors' offerings. As a result, the writing of deceptive reviews +has become an unavoidable practice for businesses seeking to promote themselves +or undermine their rivals. Detecting such deceptive reviews has become an +intense and ongoing area of research. This research paper proposes a machine +learning model to identify deceptive reviews, with a particular focus on +restaurants. This study delves into the performance of numerous experiments +conducted on a dataset of restaurant reviews known as the Deceptive Opinion +Spam Corpus. To accomplish this, an n-gram model and max features are developed +to effectively identify deceptive content, particularly focusing on fake +reviews. A benchmark study is undertaken to explore the performance of two +different feature extraction techniques, which are then coupled with five +distinct machine learning classification algorithms. The experimental results +reveal that the passive aggressive classifier stands out among the various +algorithms, showcasing the highest accuracy not only in text classification but +also in identifying fake reviews. Moreover, the research delves into data +augmentation and implements various deep learning techniques to further enhance +the process of detecting deceptive reviews. The findings shed light on the +efficacy of the proposed machine learning approach and offer valuable insights +into dealing with deceptive reviews in the realm of online businesses. + +
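One plausible instantiation of the pipeline described above (TF-IDF n-grams with a capped vocabulary feeding a Passive Aggressive classifier). The hyperparameters and toy data are illustrative, not the paper's; the study itself uses the Deceptive Opinion Spam Corpus.

```python
# n-gram features with a max_features cap + Passive Aggressive classifier (scikit-learn).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=5000),
    PassiveAggressiveClassifier(max_iter=1000),
)

reviews = [
    "The food was good and the staff were friendly, though the wait was long.",
    "Absolutely the best dining experience of my entire life, perfect in every single way!!!",
]
labels = ["truthful", "deceptive"]   # toy labels for illustration only
pipeline.fit(reviews, labels)
print(pipeline.predict(["Best restaurant ever, everything was flawless and perfect!!!"]))
```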
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Large Language Model Augmented Narrative Driven Recommendations RecSys 2023 + + +
+ Narrative-driven recommendation (NDR) presents an information access problem +where users solicit recommendations with verbose descriptions of their +preferences and context, for example, travelers soliciting recommendations for +points of interest while describing their likes/dislikes and travel +circumstances. These requests are increasingly important with the rise of +natural language-based conversational interfaces for search and recommendation +systems. However, NDR lacks abundant training data for models, and current +platforms commonly do not support these requests. Fortunately, classical +user-item interaction datasets contain rich textual data, e.g., reviews, which +often describe user preferences and context - this may be used to bootstrap +training for NDR models. In this work, we explore using large language models +(LLMs) for data augmentation to train NDR models. We use LLMs for authoring +synthetic narrative queries from user-item interactions with few-shot prompting +and train retrieval models for NDR on synthetic queries and user-item +interaction data. Our experiments demonstrate that this is an effective +strategy for training small-parameter retrieval models that outperform other +retrieval and LLM baselines for narrative-driven recommendation. + +
+
+ comment: RecSys 2023 Camera-ready +
+
+
+
+
+ + ♻ ☆ Editable User Profiles for Controllable Text Recommendation SIGIR-2023 + + +
+ Methods for making high-quality recommendations often rely on learning latent +representations from interaction data. These methods, while performant, do not +provide ready mechanisms for users to control the recommendation they receive. +Our work tackles this problem by proposing LACE, a novel concept value +bottleneck model for controllable text recommendations. LACE represents each +user with a succinct set of human-readable concepts through retrieval given +user-interacted documents and learns personalized representations of the +concepts based on user documents. This concept based user profile is then +leveraged to make recommendations. The design of our model affords control over +the recommendations through a number of intuitive interactions with a +transparent user profile. We first establish the quality of recommendations +obtained from LACE in an offline evaluation on three recommendation tasks +spanning six datasets in warm-start, cold-start, and zero-shot setups. Next, we +validate the controllability of LACE under simulated user interactions. +Finally, we implement LACE in an interactive controllable recommender system +and conduct a user study to demonstrate that users are able to improve the +quality of recommendations they receive through interactions with an editable +user profile. + +
+
+ comment: SIGIR-2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Fast Approximate Nearest Neighbor Search with a Dynamic Exploration + Graph using Continuous Refinement + + +
+ For approximate nearest neighbor search, graph-based algorithms have shown to +offer the best trade-off between accuracy and search time. We propose the +Dynamic Exploration Graph (DEG) which significantly outperforms existing +algorithms in terms of search and exploration efficiency by combining two new +ideas: First, a single undirected even regular graph is incrementally built by +partially replacing existing edges to integrate new vertices and to update old +neighborhoods at the same time. Secondly, an edge optimization algorithm is +used to continuously improve the quality of the graph. Combining this ongoing +refinement with the graph construction process leads to a well-organized graph +structure at all times, resulting in: (1) increased search efficiency, (2) +predictable index size, (3) guaranteed connectivity and therefore reachability +of all vertices, and (4) a dynamic graph structure. In addition we investigate +how well existing graph-based search systems can handle indexed queries where +the seed vertex of a search is the query itself. Such exploration tasks, +despite their good starting point, are not necessarily easy. High efficiency in +approximate nearest neighbor search (ANNS) does not automatically imply good +performance in exploratory search. Extensive experiments show that our new +Dynamic Exploration Graph outperforms existing algorithms significantly for +indexed and unindexed queries. + +
+
+
+
+
+
+
+
+ + Machine Learning 97 + +
+
+
+ + ☆ Differentially Private Heavy Hitter Detection using Federated Analytics + + +
+ In this work, we study practical heuristics to improve the performance of +prefix-tree based algorithms for differentially private heavy hitter detection. +Our model assumes each user has multiple data points and the goal is to learn +as many of the most frequent data points as possible across all users' data +with aggregate and local differential privacy. We propose an adaptive +hyperparameter tuning algorithm that improves the performance of the algorithm +while satisfying computational, communication and privacy constraints. We +explore the impact of different data-selection schemes as well as the impact of +introducing deny lists during multiple runs of the algorithm. We test these +improvements using extensive experimentation on the Reddit +dataset~\cite{caldas2018leaf} on the task of learning the most frequent words. + +
+
+
+
+
+ + ☆ Advancing Ad Auction Realism: Practical Insights & Modeling Implications + + +
+ This paper proposes a learning model of online ad auctions that allows for +the following four key realistic characteristics of contemporary online +auctions: (1) ad slots can have different values and click-through rates +depending on users' search queries, (2) the number and identity of competing +advertisers are unobserved and change with each auction, (3) advertisers only +receive partial, aggregated feedback, and (4) payment rules are only partially +specified. We model advertisers as agents governed by an adversarial bandit +algorithm, independent of auction mechanism intricacies. Our objective is to +simulate the behavior of advertisers for counterfactual analysis, prediction, +and inference purposes. Our findings reveal that, in such richer environments, +"soft floors" can enhance key performance metrics even when bidders are drawn +from the same population. We further demonstrate how to infer advertiser value +distributions from observed bids, thereby affirming the practical efficacy of +our approach even in a more realistic auction setting. + +
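The abstract models each advertiser as an agent driven by an adversarial bandit algorithm; a standard choice for that role is Exp3, sketched below. The paper does not commit to this exact algorithm, and the reward feedback here is a stand-in for the partial, aggregated auction feedback it describes.

```python
# Exp3: exponential-weight adversarial bandit, one instance per simulated advertiser.
import numpy as np

class Exp3:
    def __init__(self, n_arms: int, gamma: float = 0.1, seed: int = 0):
        self.weights = np.ones(n_arms)
        self.gamma = gamma
        self.rng = np.random.default_rng(seed)

    def select(self) -> int:
        w = self.weights / self.weights.sum()
        self.probs = (1 - self.gamma) * w + self.gamma / len(self.weights)
        return int(self.rng.choice(len(self.weights), p=self.probs))

    def update(self, arm: int, reward: float) -> None:
        # Importance-weighted reward estimate keeps the update unbiased.
        x_hat = reward / self.probs[arm]
        self.weights[arm] *= np.exp(self.gamma * x_hat / len(self.weights))

bidder = Exp3(n_arms=5)          # arms = discretised bid levels (an assumption)
env_rng = np.random.default_rng(1)
for _ in range(100):
    bid_level = bidder.select()
    reward = env_rng.uniform()   # placeholder for realised auction utility in [0, 1]
    bidder.update(bid_level, reward)
print(bidder.weights / bidder.weights.sum())
```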
+
+
+
+
+ + ☆ Mitigating Communications Threats in Decentralized Federated Learning + through Moving Target Defense + + +
+ The rise of Decentralized Federated Learning (DFL) has enabled the training +of machine learning models across federated participants, fostering +decentralized model aggregation and reducing dependence on a server. However, +this approach introduces unique communication security challenges that have yet +to be thoroughly addressed in the literature. These challenges primarily +originate from the decentralized nature of the aggregation process, the varied +roles and responsibilities of the participants, and the absence of a central +authority to oversee and mitigate threats. Addressing these challenges, this +paper first delineates a comprehensive threat model, highlighting the potential +risks of DFL communications. In response to these identified risks, this work +introduces a security module designed for DFL platforms to counter +communication-based attacks. The module combines security techniques such as +symmetric and asymmetric encryption with Moving Target Defense (MTD) +techniques, including random neighbor selection and IP/port switching. The +security module is implemented in a DFL platform called Fedstellar, allowing +the deployment and monitoring of the federation. A DFL scenario has been +deployed, involving eight physical devices implementing three security +configurations: (i) a baseline with no security, (ii) an encrypted +configuration, and (iii) a configuration integrating both encryption and MTD +techniques. The effectiveness of the security module is validated through +experiments with the MNIST dataset and eclipse attacks. The results indicated +an average F1 score of 95%, with moderate increases in CPU usage (up to 63.2% ++-3.5%) and network traffic (230 MB +-15 MB) under the most secure +configuration, mitigating the risks posed by eavesdropping or eclipse attacks. + +
+
+
+
+
+ + ☆ Convergence of SGD for Training Neural Networks with Sliced Wasserstein + Losses + + +
+ Optimal Transport has sparked vivid interest in recent years, in particular +thanks to the Wasserstein distance, which provides a geometrically sensible and +intuitive way of comparing probability measures. For computational reasons, the +Sliced Wasserstein (SW) distance was introduced as an alternative to the +Wasserstein distance, and has seen uses for training generative Neural Networks +(NNs). While convergence of Stochastic Gradient Descent (SGD) has been observed +practically in such a setting, there is to our knowledge no theoretical +guarantee for this observation. Leveraging recent works on convergence of SGD +on non-smooth and non-convex functions by Bianchi et al. (2022), we aim to +bridge that knowledge gap, and provide a realistic context under which +fixed-step SGD trajectories for the SW loss on NN parameters converge. More +precisely, we show that the trajectories approach the set of (sub)-gradient +flow equations as the step decreases. Under stricter assumptions, we show a +much stronger convergence result for noised and projected SGD schemes, namely +that the long-run limits of the trajectories approach a set of generalised +critical points of the loss function. + +
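For concreteness, the Sliced Wasserstein distance being optimised, in its usual Monte Carlo form: project both samples onto random directions and average the resulting one-dimensional Wasserstein distances, which reduce to sorting. This is the generic SW_1 estimator, not code from the paper.

```python
# Monte Carlo Sliced Wasserstein (p=1) between two equal-size samples.
import numpy as np

def sliced_wasserstein(x: np.ndarray, y: np.ndarray, n_projections: int = 128,
                       seed: int = 0) -> float:
    """x, y: (n_samples, dim) samples from the two distributions."""
    rng = np.random.default_rng(seed)
    theta = rng.normal(size=(n_projections, x.shape[1]))
    theta /= np.linalg.norm(theta, axis=1, keepdims=True)   # directions on the unit sphere
    x_proj = np.sort(x @ theta.T, axis=0)                   # sorted 1-D projections
    y_proj = np.sort(y @ theta.T, axis=0)
    return float(np.mean(np.abs(x_proj - y_proj)))          # average 1-D W1 distances

a = np.random.default_rng(1).normal(size=(500, 2))
b = np.random.default_rng(2).normal(loc=3.0, size=(500, 2))
print(sliced_wasserstein(a, b))   # roughly reflects the shift between the two means
```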
+
+
+
+
+ + ☆ JoinGym: An Efficient Query Optimization Environment for Reinforcement + Learning + + +
+ In this paper, we present \textsc{JoinGym}, an efficient and lightweight +query optimization environment for reinforcement learning (RL). Join order +selection (JOS) is a classic NP-hard combinatorial optimization problem from +database query optimization and can serve as a practical testbed for the +generalization capabilities of RL algorithms. We describe how to formulate each +of the left-deep and bushy variants of the JOS problem as a Markov Decision +Process (MDP), and we provide an implementation adhering to the standard +Gymnasium API. We highlight that our implementation \textsc{JoinGym} is +completely based on offline traces of all possible joins, which enables RL +practitioners to easily and quickly test their methods on a realistic data +management problem without needing to setup any systems. Moreover, we also +provide all possible join traces on $3300$ novel SQL queries generated from the +IMDB dataset. Upon benchmarking popular RL algorithms, we find that at least +one method can obtain near-optimal performance on train-set queries but their +performance degrades by several orders of magnitude on test-set queries. This +gap motivates further research for RL algorithms that generalize well in +multi-task combinatorial optimization problems. + +
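Not JoinGym's actual interface, just a skeleton of the standard Gymnasium API the abstract says the environment adheres to, with a toy join-order state/action space; the real environment's reward would come from its offline join traces rather than the placeholder cost used here.

```python
# Minimal Gymnasium-compliant environment for a toy join-order task.
import gymnasium as gym
import numpy as np
from gymnasium import spaces

class ToyJoinOrderEnv(gym.Env):
    def __init__(self, num_tables: int = 4):
        super().__init__()
        self.num_tables = num_tables
        self.action_space = spaces.Discrete(num_tables)           # pick the next table to join
        self.observation_space = spaces.MultiBinary(num_tables)   # which tables are already joined

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.joined = np.zeros(self.num_tables, dtype=np.int8)
        return self.joined.copy(), {}

    def step(self, action):
        invalid = bool(self.joined[action])
        self.joined[action] = 1
        # Placeholder cost; a real query optimizer env would return the (negated)
        # intermediate-result cardinality looked up from precomputed traces.
        reward = -10.0 if invalid else -1.0
        terminated = bool(self.joined.all())
        return self.joined.copy(), reward, terminated, False, {}

env = ToyJoinOrderEnv()
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
```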
+
+ comment: We will make all the queries available soon +
+
+
+
+
+ + ☆ Using simulation to calibrate real data acquisition in veterinary + medicine + + +
+ This paper explores the innovative use of simulation environments to enhance +data acquisition and diagnostics in veterinary medicine, focusing specifically +on gait analysis in dogs. The study harnesses the power of Blender and the +Blenderproc library to generate synthetic datasets that reflect diverse +anatomical, environmental, and behavioral conditions. The generated data, +represented in graph form and standardized for optimal analysis, is utilized to +train machine learning algorithms for identifying normal and abnormal gaits. +Two distinct datasets with varying degrees of camera angle granularity are +created to further investigate the influence of camera perspective on model +accuracy. Preliminary results suggest that this simulation-based approach holds +promise for advancing veterinary diagnostics by enabling more precise data +acquisition and more effective machine learning models. By integrating +synthetic and real-world patient data, the study lays a robust foundation for +improving overall effectiveness and efficiency in veterinary medicine. + +
+
+
+
+
+ + ☆ Fast Adaptive Test-Time Defense with Robust Features + + +
+ Adaptive test-time defenses are used to improve the robustness of deep neural
+networks to adversarial examples. However, existing methods significantly
+increase the inference time due to additional optimization on the model
+parameters or the input at test time. In this work, we propose a novel adaptive
+test-time defense strategy that is easy to integrate with any existing (robust)
+training procedure without additional test-time computation. Based on the
+notion of robustness of features that we present, the key idea is to project
+the trained models to the most robust feature space, thereby reducing the
+vulnerability to adversarial attacks in non-robust directions. We theoretically
+show that the top eigenspace of the feature matrix is more robust for a
+generalized additive model, and support our argument for a large-width neural
+network with the Neural Tangent Kernel (NTK) equivalence. We conduct extensive
+experiments on the CIFAR-10 and CIFAR-100 datasets for several robustness
+benchmarks, including the state-of-the-art methods in RobustBench, and observe
+that the proposed method outperforms existing adaptive test-time defenses at
+much lower computation costs.
+
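A toy illustration of the core operation described above, restricting features to the top eigenspace of the feature (covariance) matrix before classification. How the paper selects the eigenspace dimension and where in the network the projection is applied are not shown here.

```python
# Build a projector onto the top-k eigenspace of the feature covariance and apply it.
import numpy as np

def top_eigenspace_projector(features: np.ndarray, k: int) -> np.ndarray:
    """features: (n_samples, dim). Returns a (dim, dim) projector onto the top-k eigenspace."""
    cov = np.cov(features, rowvar=False)
    eigvals, eigvecs = np.linalg.eigh(cov)    # eigenvalues in ascending order
    top = eigvecs[:, -k:]                     # top-k eigenvectors
    return top @ top.T

feats = np.random.default_rng(0).normal(size=(1000, 64))
P = top_eigenspace_projector(feats, k=8)
robust_feats = feats @ P        # features restricted to the "robust" directions
print(robust_feats.shape)       # (1000, 64)
```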
+
+
+
+
+ + ☆ An Efficient Interior-Point Method for Online Convex Optimization + + +
+ A new algorithm for regret minimization in online convex optimization is +described. The regret of the algorithm after $T$ time periods is $O(\sqrt{T +\log T})$ - which is the minimum possible up to a logarithmic term. In +addition, the new algorithm is adaptive, in the sense that the regret bounds +hold not only for the time periods $1,\ldots,T$ but also for every sub-interval +$s,s+1,\ldots,t$. The running time of the algorithm matches that of newly +introduced interior point algorithms for regret minimization: in +$n$-dimensional space, during each iteration the new algorithm essentially +solves a system of linear equations of order $n$, rather than solving some +constrained convex optimization problem in $n$ dimensions and possibly many +constraints. + +
+
+
+
+
+ + ☆ Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts + + +
+ Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have +revolutionized visual representation learning by providing good performance on +downstream datasets. VLMs are 0-shot adapted to a downstream dataset by +designing prompts that are relevant to the dataset. Such prompt engineering +makes use of domain expertise and a validation dataset. Meanwhile, recent +developments in generative pretrained models like GPT-4 mean they can be used +as advanced internet search tools. They can also be manipulated to provide +visual information in any structure. In this work, we show that GPT-4 can be +used to generate text that is visually descriptive and how this can be used to +adapt CLIP to downstream tasks. We show considerable improvements in 0-shot +transfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD +(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt. +We also design a simple few-shot adapter that learns to choose the best +possible sentences to construct generalizable classifiers that outperform the +recently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized +fine-grained datasets. We will release the code, prompts, and auxiliary text +dataset upon acceptance. + +
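A sketch of the zero-shot recipe described above: visually descriptive sentences (which the paper sources from GPT-4) are encoded with CLIP's text tower and averaged into one classifier weight per class. This assumes the openai/CLIP package; the class names and descriptions below are made up for illustration, not taken from the paper.

```python
# Build zero-shot classifier weights from multiple descriptive prompts per class.
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

class_descriptions = {
    "eurasian jay": ["a bird with pinkish-brown plumage and a black-and-blue wing patch",
                     "a medium-sized woodland bird with a streaked crown"],
    "barn owl":     ["a pale owl with a heart-shaped white facial disc",
                     "an owl with golden-buff upperparts and dark eyes"],
}

with torch.no_grad():
    weights = []
    for descs in class_descriptions.values():
        tokens = clip.tokenize(descs).to(device)
        emb = model.encode_text(tokens).float()
        emb = emb / emb.norm(dim=-1, keepdim=True)
        weights.append(emb.mean(dim=0))          # average the per-description embeddings
    text_weights = torch.stack(weights)
    text_weights = text_weights / text_weights.norm(dim=-1, keepdim=True)

# For an image `img` (PIL), classification would then be:
# image_features = model.encode_image(preprocess(img).unsqueeze(0).to(device)).float()
# logits = 100.0 * image_features @ text_weights.T   # argmax over classes
```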
+
+ comment: 10 pages, Pre-print +
+
+
+
+
+ + ☆ Bandits with Deterministically Evolving States + + +
+ We propose a model for learning with bandit feedback while accounting for +deterministically evolving and unobservable states that we call Bandits with +Deterministically Evolving States. The workhorse applications of our model are +learning for recommendation systems and learning for online ads. In both cases, +the reward that the algorithm obtains at each round is a function of the +short-term reward of the action chosen and how ``healthy'' the system is (i.e., +as measured by its state). For example, in recommendation systems, the reward +that the platform obtains from a user's engagement with a particular type of +content depends not only on the inherent features of the specific content, but +also on how the user's preferences have evolved as a result of interacting with +other types of content on the platform. Our general model accounts for the +different rate $\lambda \in [0,1]$ at which the state evolves (e.g., how fast a +user's preferences shift as a result of previous content consumption) and +encompasses standard multi-armed bandits as a special case. The goal of the +algorithm is to minimize a notion of regret against the best-fixed sequence of +arms pulled. We analyze online learning algorithms for any possible +parametrization of the evolution rate $\lambda$. Specifically, the regret rates +obtained are: for $\lambda \in [0, 1/T^2]$: $\widetilde O(\sqrt{KT})$; for +$\lambda = T^{-a/b}$ with $b < a < 2b$: $\widetilde O (T^{b/a})$; for $\lambda +\in (1/T, 1 - 1/\sqrt{T}): \widetilde O (K^{1/3}T^{2/3})$; and for $\lambda \in +[1 - 1/\sqrt{T}, 1]: \widetilde O (K\sqrt{T})$. + +
+
+
+
+
+ + ☆ Scalable Multi-agent Skill Discovery based on Kronecker Graphs NeurIPS 2022 + + +
+ Covering skill (a.k.a., option) discovery has been developed to improve the
+exploration of RL in single-agent scenarios with sparse reward signals, through
+connecting the most distant states in the embedding space provided by the
+Fiedler vector of the state transition graph. Given that the joint state space
+grows exponentially with the number of agents in multi-agent systems, existing
+research still relying on single-agent option discovery either becomes
+prohibitive or fails to directly discover joint options that improve the
+connectivity of the joint state space. In this paper, we show how to directly
+compute multi-agent options with collaborative exploratory behaviors while
+still enjoying the ease of decomposition. Our key idea is to approximate the
+joint state space as a Kronecker graph, based on which we can directly estimate
+its Fiedler vector using the Laplacian spectrum of individual agents'
+transition graphs. Further, considering that directly computing the Laplacian
+spectrum is intractable for tasks with infinite-scale state spaces, we propose
+a deep learning extension of our method by estimating eigenfunctions through
+NN-based representation learning techniques. The evaluation on multi-agent
+tasks built with simulators like Mujoco shows that the proposed algorithm can
+successfully identify multi-agent options, and significantly outperforms the
+state-of-the-art. Codes are available at:
+https://github.itap.purdue.edu/Clan-labs/Scalable_MAOD_via_KP.
+
+
+ comment: Accepted to NeurIPS 2022. arXiv admin note: substantial text overlap + with arXiv:2201.08227 +
+
+
+
+
+ + ☆ Offline Multi-Agent Reinforcement Learning with Implicit Global-to-Local + Value Regularization + + +
+ Offline reinforcement learning (RL) has received considerable attention in
+recent years due to its attractive capability of learning policies from offline
+datasets without environmental interactions. Despite some success in the
+single-agent setting, offline multi-agent RL (MARL) remains a challenge.
+The large joint state-action space and the coupled multi-agent behaviors pose
+extra complexities for offline policy optimization. Most existing offline MARL
+studies simply apply offline data-related regularizations on individual agents,
+without fully considering the multi-agent system at the global level. In this
+work, we present OMIGA, a new offline multi-agent RL algorithm with implicit
+global-to-local value regularization. OMIGA provides a principled framework to
+convert global-level value regularization into equivalent implicit local value
+regularizations and simultaneously enables in-sample learning, thus elegantly
+bridging multi-agent value decomposition and policy learning with offline
+regularizations. Based on comprehensive experiments on the offline multi-agent
+MuJoCo and StarCraft II micro-management tasks, we show that OMIGA achieves
+superior performance over the state-of-the-art offline MARL methods in almost
+all tasks.
+
+
+
+
+
+ + ☆ Robust Fully-Asynchronous Methods for Distributed Training over General + Architecture + + +
+ Perfect synchronization in distributed machine learning problems is +inefficient and even impossible due to the existence of latency, package losses +and stragglers. We propose a Robust Fully-Asynchronous Stochastic Gradient +Tracking method (R-FAST), where each device performs local computation and +communication at its own pace without any form of synchronization. Different +from existing asynchronous distributed algorithms, R-FAST can eliminate the +impact of data heterogeneity across devices and allow for packet losses by +employing a robust gradient tracking strategy that relies on properly designed +auxiliary variables for tracking and buffering the overall gradient vector. +More importantly, the proposed method utilizes two spanning-tree graphs for +communication so long as both share at least one common root, enabling flexible +designs in communication architectures. We show that R-FAST converges in +expectation to a neighborhood of the optimum with a geometric rate for smooth +and strongly convex objectives; and to a stationary point with a sublinear rate +for general non-convex settings. Extensive experiments demonstrate that R-FAST +runs 1.5-2 times faster than synchronous benchmark algorithms, such as +Ring-AllReduce and D-PSGD, while still achieving comparable accuracy, and +outperforms existing asynchronous SOTA algorithms, such as AD-PSGD and OSGP, +especially in the presence of stragglers. + +
+
+
+
+
+ + ☆ Persistent Ballistic Entanglement Spreading with Optimal Control in + Quantum Spin Chains + + +
+ Entanglement propagation provides a key routine to understand quantum +many-body dynamics in and out of equilibrium. In this work, we uncover that the +``variational entanglement-enhancing'' field (VEEF) robustly induces a +persistent ballistic spreading of entanglement in quantum spin chains. The VEEF +is time dependent, and is optimally controlled to maximize the bipartite +entanglement entropy (EE) of the final state. Such a linear growth persists +till the EE reaches the genuine saturation $\tilde{S} = - \log_{2} +2^{-\frac{N}{2}}=\frac{N}{2}$ with $N$ the total number of spins. The EE +satisfies $S(t) = v t$ for the time $t \leq \frac{N}{2v}$, with $v$ the +velocity. These results are in sharp contrast with the behaviors without VEEF, +where the EE generally approaches a sub-saturation known as the Page value +$\tilde{S}_{P} =\tilde{S} - \frac{1}{2\ln{2}}$ in the long-time limit, and the +entanglement growth deviates from being linear before the Page value is +reached. The dependence between the velocity and interactions is explored, with +$v \simeq 2.76$, $4.98$, and $5.75$ for the spin chains with Ising, XY, and +Heisenberg interactions, respectively. We further show that the nonlinear +growth of EE emerges with the presence of long-range interactions. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Learning minimal representations of stochastic processes with + variational autoencoders SP + + +
+ Stochastic processes have found numerous applications in science, as they are +broadly used to model a variety of natural phenomena. Due to their intrinsic +randomness and uncertainty, they are however difficult to characterize. Here, +we introduce an unsupervised machine learning approach to determine the minimal +set of parameters required to effectively describe the dynamics of a stochastic +process. Our method builds upon an extended $\beta$-variational autoencoder +architecture. By means of simulated datasets corresponding to paradigmatic +diffusion models, we showcase its effectiveness in extracting the minimal +relevant parameters that accurately describe these dynamics. Furthermore, the +method enables the generation of new trajectories that faithfully replicate the +expected stochastic behavior. Overall, our approach enables the autonomous +discovery of unknown parameters describing stochastic processes, hence +enhancing our comprehension of complex phenomena across various fields. + +
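For readers unfamiliar with the β-VAE objective this approach extends, a minimal numerical sketch of the loss is below: a reconstruction term plus β times the KL divergence between a Gaussian encoder posterior and a standard normal prior. Pressing unneeded latent dimensions toward the prior is what lets a minimal parameter set be read off. The Gaussian forms, shapes, and β value are illustrative assumptions, not the paper's exact architecture.

```python
# Minimal numerical illustration of the beta-VAE objective: reconstruction + beta * KL.
# All shapes and the value of beta are illustrative assumptions.
import numpy as np

def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
    recon = np.mean((x - x_recon) ** 2)                                   # Gaussian reconstruction error
    kl = 0.5 * np.mean(np.sum(np.exp(logvar) + mu**2 - 1.0 - logvar, axis=1))
    return recon + beta * kl                                              # beta > 1 prunes unused latents

x = np.random.randn(32, 10)                                               # a batch of trajectory features
mu, logvar = np.zeros((32, 3)), np.zeros((32, 3))                         # encoder outputs (3 latent dims)
print(beta_vae_loss(x, x_recon=0.9 * x, mu=mu, logvar=logvar))
```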
+
+ comment: 9 pages, 5 figures, 1 table. Code available at + https://github.com/GabrielFernandezFernandez/SPIVAE +
+
+
+
+
+ + ☆ Finding Optimal Diverse Feature Sets with Alternative Feature Selection + + +
+ Feature selection is popular for obtaining small, interpretable, yet highly +accurate prediction models. Conventional feature-selection methods typically +yield one feature set only, which might not suffice in some scenarios. For +example, users might be interested in finding alternative feature sets with +similar prediction quality, offering different explanations of the data. In +this article, we introduce alternative feature selection and formalize it as an +optimization problem. In particular, we define alternatives via constraints and +enable users to control the number and dissimilarity of alternatives. Next, we +analyze the complexity of this optimization problem and show NP-hardness. +Further, we discuss how to integrate conventional feature-selection methods as +objectives. Finally, we evaluate alternative feature selection with 30 +classification datasets. We observe that alternative feature sets may indeed +have high prediction quality, and we analyze several factors influencing this +outcome. + +
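To make the formulation concrete, here is a brute-force sketch of the optimization view: search for a second feature set whose overlap with the first is constrained while maximizing a simple quality proxy. The correlation-based quality score and the overlap budget are illustrative stand-ins for the paper's objectives and constraints, and the exhaustive search only works at toy scale.

```python
# Sketch of alternative feature selection as a constrained search over small feature sets.
from itertools import combinations
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 8))
y = X[:, 0] + X[:, 1] + 0.5 * X[:, 2] + rng.normal(size=200)

quality = lambda S: sum(abs(np.corrcoef(X[:, j], y)[0, 1]) for j in S)   # illustrative quality proxy

k, max_overlap = 3, 1                                                     # set size and allowed overlap
first = max(combinations(range(8), k), key=quality)                       # conventional "best" set
alternatives = [S for S in combinations(range(8), k)
                if len(set(S) & set(first)) <= max_overlap]               # dissimilarity constraint
second = max(alternatives, key=quality)
print("first:", first, "alternative:", second)
```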
+
+
+
+
+ + ☆ Transferability of Convolutional Neural Networks in Stationary Learning + Tasks + + +
+ Recent advances in hardware and big data acquisition have accelerated the +development of deep learning techniques. For an extended period of time, +increasing the model complexity has led to performance improvements for various +tasks. However, this trend is becoming unsustainable and there is a need for +alternative, computationally lighter methods. In this paper, we introduce a +novel framework for efficient training of convolutional neural networks (CNNs) +for large-scale spatial problems. To accomplish this, we investigate the +properties of CNNs for tasks where the underlying signals are stationary. We +show that a CNN trained on small windows of such signals achieves nearly identical +performance on much larger windows without retraining. This claim is supported +by our theoretical analysis, which provides a bound on the performance +degradation. Additionally, we conduct a thorough experimental analysis on two +tasks: multi-target tracking and mobile infrastructure on demand. Our results +show that the CNN is able to tackle problems with many hundreds of agents after +being trained with fewer than ten. Thus, CNN architectures provide solutions to +these problems at previously computationally intractable scales. + +
+
+ comment: 14 pages, 7 figures, for associated code see + https://github.com/damowerko/mtt +
+
+
+
+
+ + ☆ A Change of Heart: Improving Speech Emotion Recognition through + Speech-to-Text Modality Conversion + + +
+ Speech Emotion Recognition (SER) is a challenging task. In this paper, we +introduce a modality conversion concept aimed at enhancing emotion recognition +performance on the MELD dataset. We assess our approach through two +experiments: first, a method named Modality-Conversion that employs automatic +speech recognition (ASR) systems, followed by a text classifier; second, we +assume perfect ASR output and investigate the impact of modality conversion on +SER; this method is called Modality-Conversion++. Our findings indicate that +the first method yields substantial results, while the second method +outperforms state-of-the-art (SOTA) speech-based approaches in terms of SER +weighted-F1 (WF1) score on the MELD dataset. This research highlights the +potential of modality conversion for tasks that can be conducted in alternative +modalities. + +
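A minimal sketch of the Modality-Conversion pipeline shape (speech, ASR transcript, text emotion classifier) is given below using Hugging Face pipelines; the model checkpoints, the input file, and the classifier are placeholders rather than the systems evaluated in the paper.

```python
# Sketch of the modality-conversion idea: audio -> ASR transcript -> text emotion classifier.
# Model names and the audio path are illustrative placeholders, not the paper's exact setup.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
emo = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

def speech_emotion(wav_path):
    text = asr(wav_path)["text"]   # modality conversion: speech to text
    return emo(text)[0]            # emotion predicted from the transcript

print(speech_emotion("example_utterance.wav"))
```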
+
+
+
+
+ + ☆ FMT: Removing Backdoor Feature Maps via Feature Map Testing in Deep + Neural Networks + + +
+ Deep neural networks have been widely used in many critical applications, +such as autonomous vehicles and medical diagnosis. However, their security is +threatened by backdoor attacks, which are achieved by adding artificial patterns +to specific training data. Existing defense strategies primarily focus on using +reverse engineering to reproduce the backdoor trigger generated by attackers +and subsequently repair the DNN model by adding the trigger into inputs and +fine-tuning the model with ground-truth labels. However, once the trigger +generated by the attackers is complex and invisible, the defender cannot +successfully reproduce the trigger. Consequently, the DNN model will not be +repaired since the trigger is not effectively removed. + In this work, we propose Feature Map Testing~(FMT). Different from existing +defense strategies, which focus on reproducing backdoor triggers, FMT tries to +detect the backdoor feature maps, which are trained to extract backdoor +information from the inputs. After detecting these backdoor feature maps, FMT +will erase them and then fine-tune the model with a secure subset of training +data. Our experiments demonstrate that, first, compared to existing defense +strategies, FMT can effectively reduce the Attack Success Rate (ASR) even +against the most complex and invisible attack triggers. Second, unlike +conventional defense methods that tend to exhibit low Robust Accuracy (i.e., +the model's accuracy on the poisoned data), FMT achieves higher RA, indicating +its superiority in maintaining model performance while mitigating the effects +of backdoor attacks~(e.g., FMT obtains 87.40\% RA in CIFAR10). Third, compared +to existing feature map pruning techniques, FMT can cover more backdoor feature +maps~(e.g., FMT removes 83.33\% of backdoor feature maps from the model in the +CIFAR10 \& BadNet scenario). + +
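As a rough illustration of the "erase the flagged feature maps, then fine-tune" step, the sketch below zeroes selected channels of a convolutional layer via a forward hook. How channels are flagged as backdoor feature maps is the paper's testing procedure and is only stubbed here; the tiny model and the channel indices are illustrative.

```python
# Sketch: suppress suspected backdoor feature maps of one conv layer with a forward hook,
# after which fine-tuning on a clean subset would follow. Model and indices are illustrative.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                      nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10))
flagged = [2, 7, 11]                       # indices of suspected backdoor feature maps (stubbed)

def erase(module, inputs, output):
    output[:, flagged] = 0.0               # zero out the flagged channels
    return output

model[0].register_forward_hook(erase)
x = torch.randn(4, 3, 32, 32)
print(model(x).shape)                      # the model remains usable; fine-tuning restores accuracy
```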
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ A multi-modal representation of El Niño Southern Oscillation Diversity + + +
+ The El Ni\~no-Southern Oscillation (ENSO) is characterized by alternating +periods of warm (El Ni\~no) and cold (La Ni\~na) sea surface temperature +anomalies (SSTA) in the equatorial Pacific. Although El Ni\~no and La Ni\~na +are well-defined climate patterns, no two events are alike. To date, ENSO +diversity has been described primarily in terms of the longitudinal location of +peak SSTA, used to define a bimodal classification of events in Eastern Pacific +(EP) and Central Pacific (CP) types. Here, we use low-dimensional +representations of Pacific SSTAs to argue that binary categorical memberships +are unsuitable to describe ENSO events. Using fuzzy unsupervised clustering, we +recover the four known ENSO categories, along with a fifth category: an Extreme +El Ni\~no. We show that Extreme El Ni\~nos differ both in their intensity and +temporal evolution from canonical EP El Ni\~nos. We also find that CP La +Ni\~nas, EP El Ni\~nos, and Extreme El Ni\~nos contribute the most to +interdecadal ENSO variability. + +
+
+
+
+
+ + ☆ Towards practical reinforcement learning for tokamak magnetic control + + +
+ Reinforcement learning (RL) has shown promising results for real-time control +systems, including the domain of plasma magnetic control. However, there are +still significant drawbacks compared to traditional feedback control approaches +for magnetic confinement. In this work, we address key drawbacks of the RL +method; achieving higher control accuracy for desired plasma properties, +reducing the steady-state error, and decreasing the required time to learn new +tasks. We build on top of \cite{degrave2022magnetic}, and present algorithmic +improvements to the agent architecture and training procedure. We present +simulation results that show up to 65\% improvement in shape accuracy, achieve +substantial reduction in the long-term bias of the plasma current, and +additionally reduce the training time required to learn new tasks by a factor +of 3 or more. We present new experiments using the upgraded RL-based +controllers on the TCV tokamak, which validate the simulation results achieved, +and point the way towards routinely achieving accurate discharges using the RL +approach. + +
+
+
+
+
+ + ☆ Training Latency Minimization for Model-Splitting Allowed Federated Edge + Learning + + +
+ To alleviate the shortage of computing power faced by clients in training +deep neural networks (DNNs) using federated learning (FL), we leverage edge +computing and split learning to propose a model-splitting allowed FL (SFL) +framework, with the aim of minimizing the training latency without loss of test +accuracy. Under the synchronized global update setting, the latency to complete +a round of global training is determined by the maximum latency for the clients +to complete a local training session. Therefore, the training latency +minimization problem (TLMP) is modelled as a min-max problem. To +solve this mixed integer nonlinear programming problem, we first propose a +regression method to fit the quantitative relationship between the cut-layer +and other parameters of an AI model and thus transform the TLMP into a +continuous problem. Considering that the two subproblems involved in the TLMP, +namely, the cut-layer selection problem for the clients and the computing +resource allocation problem for the parameter-server, are relatively independent, +an alternate-optimization-based algorithm with polynomial time complexity is +developed to obtain a high-quality solution to the TLMP. Extensive experiments +are performed on the popular DNN model EfficientNetV2 using the MNIST dataset, and +the results verify the validity and improved performance of the proposed SFL +framework. + +
+
+
+
+
+ + ☆ General regularization in covariate shift adaptation + + +
+ Sample reweighting is one of the most widely used methods for correcting the +error of least squares learning algorithms in reproducing kernel Hilbert spaces +(RKHS) that is caused by future data distributions that are different from the +training data distribution. In practical situations, the sample weights are +determined by values of the estimated Radon-Nikod\'ym derivative of the future +data distribution w.r.t.~the training data distribution. In this work, we +review known error bounds for reweighted kernel regression in RKHS and obtain, +by combination, novel results. We show, under weak smoothness conditions, that +the number of samples needed to achieve the same order of accuracy as in +standard supervised learning without differences in data distributions is +smaller than proven by state-of-the-art analyses. + +
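For concreteness, a small sketch of reweighted kernel ridge regression follows: each training sample is weighted by (an estimate of) the Radon-Nikodym derivative of the test input density with respect to the training input density. Here both densities are known Gaussians so the weights are exact; the kernel, bandwidth, and regularization are illustrative choices.

```python
# Sketch of importance-weighted kernel ridge regression for covariate shift.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
x_tr = rng.normal(0.0, 1.0, 100)
y_tr = np.sin(x_tr) + 0.1 * rng.normal(size=100)

# Weights = dP_test/dP_train evaluated at the training inputs (exact here, estimated in practice).
w = norm.pdf(x_tr, loc=1.0, scale=0.5) / norm.pdf(x_tr, loc=0.0, scale=1.0)

K = np.exp(-0.5 * (x_tr[:, None] - x_tr[None, :]) ** 2)      # RBF kernel
W = np.diag(w)
lam = 1e-2
alpha = np.linalg.solve(W @ K + lam * np.eye(len(x_tr)), W @ y_tr)   # weighted KRR coefficients

x_te = np.linspace(0, 2, 5)
K_te = np.exp(-0.5 * (x_te[:, None] - x_tr[None, :]) ** 2)
print(K_te @ alpha)                                           # predictions in the shifted region
```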
+
+
+
+
+ + ☆ Predict, Refine, Synthesize: Self-Guiding Diffusion Models for + Probabilistic Time Series Forecasting + + +
+ Diffusion models have achieved state-of-the-art performance in generative +modeling tasks across various domains. Prior works on time series diffusion +models have primarily focused on developing conditional models tailored to +specific forecasting or imputation tasks. In this work, we explore the +potential of task-agnostic, unconditional diffusion models for several time +series applications. We propose TSDiff, an unconditionally trained diffusion +model for time series. Our proposed self-guidance mechanism enables +conditioning TSDiff for downstream tasks during inference, without requiring +auxiliary networks or altering the training procedure. We demonstrate the +effectiveness of our method on three different time series tasks: forecasting, +refinement, and synthetic data generation. First, we show that TSDiff is +competitive with several task-specific conditional forecasting methods +(predict). Second, we leverage the learned implicit probability density of +TSDiff to iteratively refine the predictions of base forecasters with reduced +computational overhead over reverse diffusion (refine). Notably, the generative +performance of the model remains intact -- downstream forecasters trained on +synthetic samples from TSDiff outperform forecasters that are trained on +samples from other state-of-the-art generative time series models, occasionally +even outperforming models trained on real data (synthesize). + +
+
+
+
+
+ + ☆ A New Deep State-Space Analysis Framework for Patient Latent State + Estimation and Classification from EHR Time Series Data + + +
+ Many diseases, including cancer and chronic conditions, require extended +treatment periods and long-term strategies. Machine learning and AI research +focusing on electronic health records (EHRs) have emerged to address this need. +Effective treatment strategies require more than capturing sequential changes +in patient test values: they call for an explainable and clinically interpretable +model that captures the patient's internal state over time. + In this study, we propose the "deep state-space analysis framework," using +time-series unsupervised learning of EHRs with a deep state-space model. This +framework enables learning, visualizing, and clustering of temporal changes in +patient latent states related to disease progression. + We evaluated our framework using time-series laboratory data from 12,695 +cancer patients. By estimating latent states, we successfully discover those +related to prognosis. Through visualization and cluster analysis, we identified the +temporal transitions of patient status and the test items involved in the state +transitions characteristic of each anticancer drug. Our framework surpasses +existing methods in capturing an interpretable latent space. It can be expected to +enhance our comprehension of disease progression from EHRs, aiding treatment +adjustments and prognostic determinations. + +
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ A Deep Learning Approach for Overall Survival Analysis with Missing + Values + + +
+ One of the most challenging fields where Artificial Intelligence (AI) can be +applied is lung cancer research, specifically non-small cell lung cancer +(NSCLC). In particular, overall survival (OS) is a vital indicator of patient +status, helping to identify subgroups with diverse survival probabilities, +enabling tailored treatment and improved OS rates. In this analysis, there are +two challenges to take into account. First, few studies effectively exploit the +information available from each patient, leveraging both uncensored (i.e., +dead) and censored (i.e., survivors) patients, considering also the death +times. Second, the handling of incomplete data is a common issue in the medical +field. This problem is typically tackled through the use of imputation methods. +Our objective is to present an AI model able to overcome these limits, +effectively learning from both censored and uncensored patients and their +available features, for the prediction of OS for NSCLC patients. We present a +novel approach to survival analysis in the context of NSCLC, which exploits the +strengths of the transformer architecture accounting for only available +features without requiring any imputation strategy. By making use of ad-hoc +losses for OS, it accounts for both censored and uncensored patients, +considering risks over time. We evaluated the results over a period of 6 years +using different time granularities obtaining a Ct-index, a time-dependent +variant of the C-index, of 71.97, 77.58 and 80.72 for time units of 1 month, 1 +year and 2 years, respectively, outperforming all state-of-the-art methods +regardless of the imputation method used. + +
+
+ comment: 19 pages, 2 figures +
+
+
+
+
+ + ☆ Improve Long-term Memory Learning Through Rescaling the Error Temporally + + +
+ This paper studies the error metric selection for long-term memory learning +in sequence modelling. We examine the bias towards short-term memory in +commonly used errors, including mean absolute/squared error. Our findings show +that all temporally positive-weighted errors are biased towards short-term +memory in learning linear functionals. To reduce this bias and improve +long-term memory learning, we propose the use of a temporally rescaled error. +In addition to reducing the bias towards short-term memory, this approach can +also alleviate the vanishing gradient issue. We conduct numerical experiments +on different long-memory tasks and sequence models to validate our claims. +Numerical results confirm the importance of appropriate temporally rescaled +error for effective long-term memory learning. To the best of our knowledge, +this is the first work that quantitatively analyzes different errors' memory +bias towards short-term memory in sequence modelling. + +
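A minimal sketch of a temporally rescaled error is shown below: later time steps receive larger weights than under plain MSE, counteracting the short-term bias described above. The exponential weighting schedule is an illustrative assumption, not necessarily the rescaling used in the paper.

```python
# Sketch: time-weighted MSE that emphasizes later (longer-memory) steps of a sequence.
import numpy as np

def rescaled_mse(pred, target, gamma=1.05):
    T = pred.shape[-1]
    w = gamma ** np.arange(T)          # weight grows with the time index (gamma=1.0 recovers plain MSE)
    w = w / w.sum()
    return np.sum(w * (pred - target) ** 2, axis=-1).mean()

pred, target = np.random.randn(16, 50), np.random.randn(16, 50)
print(rescaled_mse(pred, target))
print(rescaled_mse(pred, target, gamma=1.0))   # compare with the time-uniform baseline
```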
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Neural Operators for Delay-Compensating Control of Hyperbolic PIDEs + + +
+ The recently introduced DeepONet operator-learning framework for PDE control +is extended from the results for basic hyperbolic and parabolic PDEs to an +advanced hyperbolic class that involves delays on both the state and the system +output or input. The PDE backstepping design produces gain functions that are +outputs of a nonlinear operator, mapping functions on a spatial domain into +functions on a spatial domain, and where this gain-generating operator's inputs +are the PDE's coefficients. The operator is approximated with a DeepONet neural +network to a degree of accuracy that is provably arbitrarily tight. Once we +produce this approximation-theoretic result in infinite dimension, with it we +establish stability in closed loop under feedback that employs approximate +gains. In addition to supplying such results under full-state feedback, we also +develop DeepONet-approximated observers and output-feedback laws and prove +their own stabilizing properties under neural operator approximations. With +numerical simulations we illustrate the theoretical results and quantify the +numerical effort savings, which are of two orders of magnitude, thanks to +replacing the numerical PDE solving with the DeepONet. + +
+
+
+
+
+ + ☆ Batching for Green AI -- An Exploratory Study on Inference + + +
+ The batch size is an essential parameter to tune during the development of +new neural networks. Amongst other quality indicators, it has a large degree of +influence on the model's accuracy, generalisability, training times and +parallelisability. This fact is generally known and commonly studied. However, +during the application phase of a deep learning model, when the model is +utilised by an end-user for inference, we find that there is a disregard for +the potential benefits of introducing a batch size. In this study, we examine +the effect of input batching on the energy consumption and response times of +five fully-trained neural networks for computer vision that were considered +state-of-the-art at the time of their publication. The results suggest that +batching has a significant effect on both of these metrics. Furthermore, we +present a timeline of the energy efficiency and accuracy of neural networks +over the past decade. We find that in general, energy consumption rises at a +much steeper pace than accuracy and question the necessity of this evolution. +Additionally, we highlight one particular network, ShuffleNetV2(2018), that +achieved a competitive performance for its time while maintaining a much lower +energy consumption. Nevertheless, we highlight that the results are model +dependent. + +
+
+ comment: 8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series + on Software Engineering and Advanced Applications (SEAA) 2023 +
+
+
+
+
+ + ☆ An Analysis of Multi-Agent Reinforcement Learning for Decentralized + Inventory Control Systems + + +
+ Most solutions to the inventory management problem assume a centralization of +information that is incompatible with organisational constraints in real supply +chain networks. The inventory management problem is a well-known planning +problem in operations research, concerned with finding the optimal re-order +policy for nodes in a supply chain. While many centralized solutions to the +problem exist, they are not applicable to real-world supply chains made up of +independent entities. The problem can however be naturally decomposed into +sub-problems, each associated with an independent entity, turning it into a +multi-agent system. Therefore, a decentralized data-driven solution to +inventory management problems using multi-agent reinforcement learning is +proposed where each entity is controlled by an agent. Three multi-agent +variations of the proximal policy optimization algorithm are investigated +through simulations of different supply chain networks and levels of +uncertainty. The centralized training decentralized execution framework is +deployed, which relies on offline centralization during simulation-based policy +identification, but enables decentralization when the policies are deployed +online to the real system. Results show that using multi-agent proximal policy +optimization with a centralized critic leads to performance very close to that +of a centralized data-driven solution and outperforms a distributed model-based +solution in most cases while respecting the information constraints of the +system. + +
+
+
+
+
+ + ☆ Attention to Entropic Communication + + +
+ The concept of attention, numerical weights that emphasize the importance of +particular data, has proven to be very relevant in artificial intelligence. +Relative entropy (RE, aka Kullback-Leibler divergence) plays a central role in +communication theory. Here we combine these concepts, attention and RE. RE +guides optimal encoding of messages in bandwidth-limited communication as well +as optimal message decoding via the maximum entropy principle (MEP). In the +coding scenario, RE can be derived from four requirements, namely being +analytical, local, proper, and calibrated. Weighted RE, used for attention +steering in communications, turns out to be improper. To see how proper +attention communication can emerge, we analyze a scenario of a message sender +who wants to ensure that the receiver of the message can perform well-informed +actions. If the receiver decodes the message using the MEP, the sender only +needs to know the receiver's utility function to inform optimally, but not the +receiver's initial knowledge state. If only the curvature of the utility +function's maxima is known, it becomes desirable to accurately communicate an +attention function, in this case a probability function weighted by this curvature +and re-normalized. Entropic attention communication is here proposed as the +desired generalization of entropic communication that permits weighting while +being proper, thereby aiding the design of optimal communication protocols in +technical applications and helping to understand human communication. For +example, our analysis shows how to derive the level of cooperation expected +under misaligned interests of otherwise honest communication partners. + +
+
+ comment: 23 pages, 4 figures, submitted +
+
+
+
+
+ + ☆ Direct and inverse modeling of soft robots by learning a condensed FEM + model + + +
+ The Finite Element Method (FEM) is a powerful modeling tool for predicting +the behavior of soft robots. However, its use for control can be difficult for +non-specialists of numerical computation: it requires an optimization of the +computation to make it real-time. In this paper, we propose a learning-based +approach to obtain a compact but sufficiently rich mechanical representation. +Our choice is based on nonlinear compliance data in the actuator/effector space +provided by a condensation of the FEM model. We demonstrate that this compact +model can be learned with a reasonable amount of data and, at the same time, be +very efficient in terms of modeling, since we can deduce the direct and inverse +kinematics of the robot. We also show how to couple some models learned +individually in particular on an example of a gripper composed of two soft +fingers. Other results are shown by comparing the inverse model derived from +the full FEM model and the one from the compact learned version. This work +opens new perspectives, namely for the embedded control of soft robots, but +also for their design. These perspectives are also discussed in the paper. + +
+
+
+
+
+ + ☆ Probabilistic Modeling of Inter- and Intra-observer Variability in + Medical Image Segmentation + + +
+ Medical image segmentation is a challenging task, particularly due to inter- +and intra-observer variability, even between medical experts. In this paper, we +propose a novel model, called Probabilistic Inter-Observer and iNtra-Observer +variation NetwOrk (Pionono). It captures the labeling behavior of each rater +with a multidimensional probability distribution and integrates this +information with the feature maps of the image to produce probabilistic +segmentation predictions. The model is optimized by variational inference and +can be trained end-to-end. It outperforms state-of-the-art models such as +STAPLE, Probabilistic U-Net, and models based on confusion matrices. +Additionally, Pionono predicts multiple coherent segmentation maps that mimic +the rater's expert opinion, which provides additional valuable information for +the diagnostic process. Experiments on real-world cancer segmentation datasets +demonstrate the high accuracy and efficiency of Pionono, making it a powerful +tool for medical image analysis. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Towards Better Fairness-Utility Trade-off: A Comprehensive + Measurement-Based Reinforcement Learning Framework + + +
+ Machine learning is widely used to make decisions with societal impact such +as bank loan approval, criminal sentencing, and resume filtering. How to +ensure its fairness while maintaining utility is a challenging but crucial +issue. Fairness is a complex and context-dependent concept with over 70 +different measurement metrics. Since existing regulations are often vague in +terms of which metric to use and different organizations may prefer different +fairness metrics, it is important to have means of improving fairness +comprehensively. Existing mitigation techniques often target one specific +fairness metric and have limitations in improving multiple notions of fairness +simultaneously. In this work, we propose CFU (Comprehensive Fairness-Utility), +a reinforcement learning-based framework, to efficiently improve the +fairness-utility trade-off in machine learning classifiers. A comprehensive +measurement that can simultaneously consider multiple fairness notions as well +as utility is established, and new metrics are proposed based on an in-depth +analysis of the relationship between different fairness metrics. The reward +function of CFU is constructed from this comprehensive measurement and the new metrics. +We conduct extensive experiments to evaluate CFU on 6 tasks, 3 machine learning +models, and 15 fairness-utility measurements. The results demonstrate that CFU +can improve the classifier on multiple fairness metrics without sacrificing its +utility. It outperforms all state-of-the-art techniques, achieving a +37.5% improvement on average. + +
+
+
+
+
+ + ☆ LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent + Space + + +
+ Data Augmentation (DA) is a technique to increase the quantity and diversity +of the training data, and thereby alleviate overfitting and improve +generalisation. However, standard DA produces synthetic data for augmentation +with limited diversity. Generative Adversarial Networks (GANs) may unlock +additional information in a dataset by generating synthetic samples having the +appearance of real images. However, these models struggle to simultaneously +address three key requirements: fidelity and high-quality samples; diversity +and mode coverage; and fast sampling. Indeed, GANs generate high-quality +samples rapidly, but have poor mode coverage, limiting their adoption in DA +applications. We propose LatentAugment, a DA strategy that overcomes the low +diversity of GANs, opening it up for use in DA applications. Without external +supervision, LatentAugment modifies latent vectors and moves them into latent +space regions to maximise the synthetic images' diversity and fidelity. It is +also agnostic to the dataset and the downstream task. A wide set of experiments +shows that LatentAugment improves the generalisation of a deep model +translating from MRI to CT, beating both standard DA and GAN-based sampling. +Moreover, still in comparison with GAN-based sampling, LatentAugment synthetic +samples show superior mode coverage and diversity. Code is available at: +https://github.com/ltronchin/LatentAugment. + +
+
+
+
+
+ + ☆ Diverse Offline Imitation via Fenchel Duality + + +
+ There has been significant recent progress in the area of unsupervised skill +discovery, with various works proposing mutual information based objectives, as +a source of intrinsic motivation. Prior works predominantly focused on +designing algorithms that require online access to the environment. In +contrast, we develop an \textit{offline} skill discovery algorithm. Our problem +formulation considers the maximization of a mutual information objective +constrained by a KL-divergence. More precisely, the constraints ensure that the +state occupancy of each skill remains close to the state occupancy of an +expert, within the support of an offline dataset with good state-action +coverage. Our main contribution is to connect Fenchel duality, reinforcement +learning and unsupervised skill discovery, and to give a simple offline +algorithm for learning diverse skills that are aligned with an expert. + +
+
+
+
+
+ + ☆ Random Separating Hyperplane Theorem and Learning Polytopes + + +
+ The Separating Hyperplane theorem is a fundamental result in Convex Geometry +with myriad applications. Our first result, Random Separating Hyperplane +Theorem (RSH), is a strengthening of this for polytopes. RSH asserts that if +the distance between $a$ and a polytope $K$ with $k$ vertices and unit diameter +in $\Re^d$ is at least $\delta$, where $\delta$ is a fixed constant in $(0,1)$, +then a randomly chosen hyperplane separates $a$ and $K$ with probability at +least $1/\mathrm{poly}(k)$ and margin at least $\Omega \left(\delta/\sqrt{d} \right)$. +An immediate consequence of our result is the first near-optimal bound on the +error increase in the reduction from a Separation oracle to an Optimization +oracle over a polytope. + RSH has algorithmic applications in learning polytopes. We consider a +fundamental problem, denoted the ``Hausdorff problem'', of learning a unit +diameter polytope $K$ within Hausdorff distance $\delta$, given an optimization +oracle for $K$. Using RSH, we show that with polynomially many random queries +to the optimization oracle, $K$ can be approximated within error $O(\delta)$. +To our knowledge this is the first provable algorithm for the Hausdorff +problem. Building on this result, we show that if the vertices of $K$ are +well-separated, then an optimization oracle can be used to generate a list of +points, each within Hausdorff distance $O(\delta)$ of $K$, with the property +that the list contains a point close to each vertex of $K$. Further, we show +how to prune this list to generate a (unique) approximation to each vertex of +the polytope. We prove that in many latent variable settings, e.g., topic +modeling, LDA, optimization oracles do exist provided we project to a suitable +SVD subspace. Thus, our work yields the first efficient algorithm for finding +approximations to the vertices of the latent polytope under the +well-separatedness assumption. + +
+
+
+
+
+ + ☆ Bridging the Reality Gap of Reinforcement Learning based Traffic Signal + Control using Domain Randomization and Meta Learning SC 2023 + + +
+ Reinforcement Learning (RL) has been widely explored in Traffic Signal +Control (TSC) applications; however, no such system has yet been deployed in +practice. A key barrier to progress in this area is the reality gap, the +discrepancy that results from differences between simulation models and their +real-world equivalents. In this paper, we address this challenge by first +presenting a comprehensive analysis of potential simulation parameters that +contribute to this reality gap. We then also examine two promising strategies +that can bridge this gap: Domain Randomization (DR) and Model-Agnostic +Meta-Learning (MAML). Both strategies were trained with a traffic simulation +model of an intersection. In addition, the model was embedded in LemgoRL, a +framework that integrates realistic, safety-critical requirements into the +control system. Subsequently, we evaluated the performance of the two methods +on a separate model of the same intersection that was developed with a +different traffic simulator. In this way, we mimic the reality gap. Our +experimental results show that both DR and MAML outperform a state-of-the-art +RL algorithm, therefore highlighting their potential to mitigate the reality +gap in RL-based TSC systems. + +
+
+ comment: Paper was accepted by the ITSC 2023 (26th IEEE International + Conference on Intelligent Transportation Systems) +
+
+
+
+
+ + ☆ What can a Single Attention Layer Learn? A Study Through the Random + Features Lens + + +
+ Attention layers -- which map a sequence of inputs to a sequence of outputs +-- are core building blocks of the Transformer architecture which has achieved +significant breakthroughs in modern artificial intelligence. This paper +presents a rigorous theoretical study on the learning and generalization of a +single multi-head attention layer, with a sequence of key vectors and a +separate query vector as input. We consider the random feature setting where +the attention layer has a large number of heads, with randomly sampled frozen +query and key matrices, and trainable value matrices. We show that such a +random-feature attention layer can express a broad class of target functions +that are permutation invariant to the key vectors. We further provide +quantitative excess risk bounds for learning these target functions from finite +samples, using random feature attention with finitely many heads. + Our results feature several implications unique to the attention structure +compared with existing random features theory for neural networks, such as (1) +Advantages in the sample complexity over standard two-layer random-feature +networks; (2) Concrete and natural classes of functions that can be learned +efficiently by a random-feature attention layer; and (3) The effect of the +sampling distribution of the query-key weight matrix (the product of the query +and key matrix), where Gaussian random weights with a non-zero mean result in +better sample complexities over the zero-mean counterpart for learning certain +natural target functions. Experiments on simulated data corroborate our +theoretical findings and further illustrate the interplay between the sample +size and the complexity of the target function. + +
+
+ comment: 41 pages, 5 figures +
+
+
+
+
+ + ☆ Model-based Offline Reinforcement Learning with Count-based Conservatism ICML 2023 + + +
+ In this paper, we propose a model-based offline reinforcement learning method +that integrates count-based conservatism, named $\texttt{Count-MORL}$. Our +method utilizes the count estimates of state-action pairs to quantify model +estimation error, marking, to the best of our knowledge, the first algorithm to +demonstrate the efficacy of count-based conservatism in model-based offline deep RL. +For our proposed method, we first show that the estimation error is +inversely proportional to the frequency of state-action pairs. Second, we +demonstrate that the learned policy under the count-based conservative model +offers near-optimal performance guarantees. Through extensive numerical +experiments, we validate that $\texttt{Count-MORL}$ with hash code +implementation significantly outperforms existing offline RL algorithms on the +D4RL benchmark datasets. The code is accessible at +$\href{https://github.com/oh-lab/Count-MORL}{https://github.com/oh-lab/Count-MORL}$. + +
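The count-based conservatism idea can be sketched as penalizing a model-predicted reward in inverse proportion to how often (a hash of) the state-action pair appears in the offline data. The discretization-based hashing, penalty scale, and 1/sqrt(n) form below are illustrative assumptions rather than the paper's exact construction.

```python
# Sketch: count-based reward penalty for model-based offline RL (toy hashing and data).
from collections import Counter
import numpy as np

counts = Counter()

def discretize(s, a, bins=10):
    return (tuple(np.floor(np.asarray(s) * bins).astype(int)), int(a))

rng = np.random.default_rng(0)
for _ in range(5000):                                   # fill counts from a toy offline dataset
    counts[discretize(rng.random(2), rng.integers(3))] += 1

def conservative_reward(r_model, s, a, beta=1.0):
    n = counts[discretize(s, a)]
    return r_model - beta / np.sqrt(max(n, 1))          # larger penalty where the pair was rarely seen

print(conservative_reward(1.0, [0.5, 0.5], 1))
```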
+
+ comment: Accepted in ICML 2023 +
+
+
+
+
+ + ☆ Bounded P-values in Parametric Programming-based Selective Inference + + +
+ Selective inference (SI) has been actively studied as a promising framework +for statistical hypothesis testing for data-driven hypotheses. The basic idea +of SI is to make inferences conditional on an event that a hypothesis is +selected. In order to perform SI, this event must be characterized in a +tractable form. When the selection event is too difficult to characterize, +additional conditions are introduced for tractability. These additional +conditions often cause a loss of power, an issue referred to as +over-conditioning. Parametric programming-based SI (PP-based SI) has been +proposed as one way to address the over-conditioning issue. The main problem of +PP-based SI is its high computational cost due to the need to exhaustively +explore the data space. In this study, we introduce a procedure to reduce the +computational cost while guaranteeing the desired precision, by proposing a +method to compute the upper and lower bounds of p-values. We also propose +three types of search strategies that efficiently improve these bounds. We +demonstrate the effectiveness of the proposed method in hypothesis testing +problems for feature selection in linear models and attention region +identification in deep neural networks. + +
+
+ comment: 47 pages, 14 figures +
+
+
+
+
+ + ☆ Improving Transferability of Adversarial Examples via Bayesian Attacks + + +
+ This paper presents a substantial extension of our work published at ICLR. +Our ICLR work advocated for enhancing transferability in adversarial examples +by incorporating a Bayesian formulation into model parameters, which +effectively emulates the ensemble of infinitely many deep neural networks, +while, in this paper, we introduce a novel extension by incorporating the +Bayesian formulation into the model input as well, enabling the joint +diversification of both the model input and model parameters. Our empirical +findings demonstrate that: 1) the combination of Bayesian formulations for both +the model input and model parameters yields significant improvements in +transferability; 2) by introducing advanced approximations of the posterior +distribution over the model input, adversarial transferability achieves further +enhancement, surpassing all state-of-the-arts when attacking without model +fine-tuning. Moreover, we propose a principled approach to fine-tune model +parameters in such an extended Bayesian formulation. The derived optimization +objective inherently encourages flat minima in the parameter space and input +space. Extensive experiments demonstrate that our method achieves a new +state-of-the-art on transfer-based attacks, improving the average success rate +on ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with +our ICLR basic Bayesian method. We will make our code publicly available. + +
+
+
+
+
+ + ☆ Demystifying Local and Global Fairness Trade-offs in Federated Learning + Using Partial Information Decomposition ICML + + +
+ In this paper, we present an information-theoretic perspective to group +fairness trade-offs in federated learning (FL) with respect to sensitive +attributes, such as gender, race, etc. Existing works mostly focus on either +\emph{global fairness} (overall disparity of the model across all clients) or +\emph{local fairness} (disparity of the model at each individual client), +without always considering their trade-offs. There is a lack of understanding +of the interplay between global and local fairness in FL, and if and when one +implies the other. To address this gap, we leverage a body of work in +information theory called partial information decomposition (PID) which first +identifies three sources of unfairness in FL, namely, \emph{Unique Disparity}, +\emph{Redundant Disparity}, and \emph{Masked Disparity}. Using canonical +examples, we demonstrate how these three disparities contribute to global and +local fairness. This decomposition helps us derive fundamental limits and +trade-offs between global or local fairness, particularly under data +heterogeneity, as well as, derive conditions under which one implies the other. +We also present experimental results on benchmark datasets to support our +theoretical findings. This work offers a more nuanced understanding of the +sources of disparity in FL that can inform the use of local disparity +mitigation techniques, and their convergence and effectiveness when deployed in +practice. + +
+
+ comment: Accepted at ICML Workshop on Federated Learning and Analytics in + Practice +
+
+
+
+
+ + ☆ Beyond Convergence: Identifiability of Machine Learning and Deep + Learning Models + + +
+ Machine learning (ML) and deep learning models are extensively used for +parameter optimization and regression problems. However, not all inverse +problems in ML are ``identifiable,'' indicating that model parameters may not +be uniquely determined from the available data and the data model's +input-output relationship. In this study, we investigate the notion of model +parameter identifiability through a case study focused on parameter estimation +from motion sensor data. Utilizing a bipedal-spring mass human walk dynamics +model, we generate synthetic data representing diverse gait patterns and +conditions. Employing a deep neural network, we attempt to estimate +subject-wise parameters, including mass, stiffness, and equilibrium leg length. +The results show that while certain parameters can be identified from the +observation data, others remain unidentifiable, highlighting that +unidentifiability is an intrinsic limitation of the experimental setup, +necessitating a change in data collection and experimental scenarios. Beyond +this specific case study, the concept of identifiability has broader +implications in ML and deep learning. Addressing unidentifiability requires +proven identifiable models (with theoretical support), multimodal data fusion +techniques, and advancements in model-based machine learning. Understanding and +resolving unidentifiability challenges will lead to more reliable and accurate +applications across diverse domains, transcending mere model convergence and +enhancing the reliability of machine learning models. + +
+
+
+
+
+ + ☆ Systematic Adaptation of Communication-focused Machine Learning Models + from Real to Virtual Environments for Human-Robot Collaboration + + +
+ Virtual reality has proved to be useful in applications in several fields +ranging from gaming, medicine, and training to development of interfaces that +enable human-robot collaboration. It empowers designers to explore applications +outside of the constraints posed by the real world environment and develop +innovative solutions and experiences. Hand gestures recognition which has been +a topic of much research and subsequent commercialization in the real world has +been possible because of the creation of large, labelled datasets. In order to +utilize the power of natural and intuitive hand gestures in the virtual domain +for enabling embodied teleoperation of collaborative robots, similarly large +datasets must be created so as to keep the working interface easy to learn and +flexible enough to add more gestures. Depending on the application, this may be +computationally or economically prohibitive. Thus, the adaptation of trained +deep learning models that perform well in the real environment to the virtual +may be a solution to this challenge. This paper presents a systematic framework +for the real to virtual adaptation using limited size of virtual dataset along +with guidelines for creating a curated dataset. Finally, while hand gestures +have been considered as the communication mode, the guidelines and +recommendations presented are generic. These are applicable to other modes such +as body poses and facial expressions which have large datasets available in the +real domain which must be adapted to the virtual one. + +
+
+
+
+
+ + ☆ Analysis of Elephant Movement in Sub-Saharan Africa: Ecological, + Climatic, and Conservation Perspectives + + +
+ The interaction between elephants and their environment has profound +implications for both ecology and conservation strategies. This study presents +an analytical approach to decipher the intricate patterns of elephant movement +in Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal +variations and rainfall patterns. Despite the complexities surrounding these +influential factors, our analysis provides a holistic view of elephant +migratory behavior in the context of the dynamic African landscape. Our +comprehensive approach enables us to predict the potential impact of these +ecological determinants on elephant migration, a critical step in establishing +informed conservation strategies. This projection is particularly crucial given +the impacts of global climate change on seasonal and rainfall patterns, which +could substantially influence elephant movements in the future. The findings of +our work aim to not only advance the understanding of movement ecology but also +foster a sustainable coexistence of humans and elephants in Sub-Saharan Africa. +By predicting potential elephant routes, our work can inform strategies to +minimize human-elephant conflict, effectively manage land use, and enhance +anti-poaching efforts. This research underscores the importance of integrating +movement ecology and climatic variables for effective wildlife management and +conservation planning. + +
+
+ comment: 11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on + Computing and Sustainable Societies (COMPASS 2023) +
+
+
+
+
+ + ☆ XLDA: Linear Discriminant Analysis for Scaling Continual Learning to + Extreme Classification at the Edge ICML 2023 + + +
+ Streaming Linear Discriminant Analysis (LDA), while proven in +Class-incremental Learning deployments at the edge with limited classes (up to +1000), has not been proven for deployment in extreme classification scenarios. +In this paper, we present: (a) XLDA, a framework for Class-IL in edge +deployment where the LDA classifier is proven to be equivalent to an FC layer, +including in extreme classification scenarios, and (b) optimizations to enable +XLDA-based training and inference for edge deployment where there is a +constraint on available compute resources. We show up to a 42x speedup using a +batched training approach and up to a 5x inference speedup with nearest neighbor +search on extreme datasets like AliProducts (50k classes) and Google Landmarks +V2 (81k classes). + +
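For background, a minimal streaming-LDA classifier of the kind XLDA builds on can be sketched with running class means, a running shared covariance, and linear discriminant scores. The shrinkage constant, shapes, and toy data below are illustrative, and none of XLDA's extreme-classification optimizations (batched updates, nearest-neighbor search) are included.

```python
# Sketch of streaming LDA: one-sample-at-a-time updates of class means and a shared covariance.
import numpy as np

d, n_classes = 16, 5
means = np.zeros((n_classes, d))
counts = np.zeros(n_classes)
sigma = np.eye(d)
seen = 0

def update(x, y):
    global seen
    counts[y] += 1
    delta = x - means[y]
    means[y] += delta / counts[y]                              # running class mean
    seen += 1
    sigma[:] += (np.outer(delta, x - means[y]) - sigma) / seen  # running shared (within-class) covariance

def predict(x, shrink=1e-2):
    prec = np.linalg.inv(sigma + shrink * np.eye(d))            # shrinkage keeps the inverse stable
    scores = x @ prec @ means.T - 0.5 * np.sum(means @ prec * means, axis=1)
    return int(np.argmax(scores))                               # linear discriminant decision

rng = np.random.default_rng(0)
for _ in range(2000):
    y = rng.integers(n_classes)
    update(rng.normal(size=d) + 2 * y, y)
print(predict(rng.normal(size=d) + 2 * 3))                      # should usually print 3
```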
+
+ comment: Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop +
+
+
+
+
+ + ☆ Making Pre-trained Language Models both Task-solvers and + Self-calibrators ACL 2023 + + +
+ Pre-trained language models (PLMs) serve as backbones for various real-world +systems. For high-stake applications, it's equally essential to have reasonable +confidence estimations in predictions. While the vanilla confidence scores of +PLMs can already be effectively utilized, PLMs consistently become +overconfident in their wrong predictions, which is not desirable in practice. +Previous work shows that introducing an extra calibration task can mitigate +this issue. The basic idea involves acquiring additional data to train models +in predicting the confidence of their initial predictions. However, it only +demonstrates the feasibility of this kind of method, assuming that there are +abundant extra available samples for the introduced calibration task. In this +work, we consider the practical scenario that we need to effectively utilize +training samples to make PLMs both task-solvers and self-calibrators. Three +challenges are presented, including limited training samples, data imbalance, +and distribution shifts. We first conduct pilot experiments to quantify various +decisive factors in the calibration task. Based on the empirical analysis +results, we propose a training algorithm LM-TOAST to tackle the challenges. +Experimental results show that LM-TOAST can effectively utilize the training +data to make PLMs have reasonable confidence estimations while maintaining the +original task performance. Further, we consider three downstream applications, +namely selective classification, adversarial defense, and model cascading, to +show the practical usefulness of LM-TOAST. The code will be made public at +\url{https://github.com/Yangyi-Chen/LM-TOAST}. + +
+
+ comment: Accepted to Findings of ACL 2023 +
+
+
+
+
+ + ☆ Neuromorphic Online Learning for Spatiotemporal Patterns with a + Forward-only Timeline + + +
+ Spiking neural networks (SNNs) are bio-plausible computing models with high +energy efficiency. The temporal dynamics of neurons and synapses enable them to +detect temporal patterns and generate sequences. While Backpropagation Through +Time (BPTT) is traditionally used to train SNNs, it is not suitable for online +learning of embedded applications due to its high computation and memory cost +as well as extended latency. Previous works have proposed online learning +algorithms, but they often utilize highly simplified spiking neuron models +without synaptic dynamics and reset feedback, resulting in subpar performance. +In this work, we present Spatiotemporal Online Learning for Synaptic Adaptation +(SOLSA), specifically designed for online learning of SNNs composed of Leaky +Integrate and Fire (LIF) neurons with exponentially decayed synapses and soft +reset. The algorithm not only learns the synaptic weight but also adapts the +temporal filters associated to the synapses. Compared to the BPTT algorithm, +SOLSA has much lower memory requirement and achieves a more balanced temporal +workload distribution. Moreover, SOLSA incorporates enhancement techniques such +as scheduled weight update, early stop training and adaptive synapse filter, +which speed up the convergence and enhance the learning performance. When +compared to other non-BPTT based SNN learning, SOLSA demonstrates an average +learning accuracy improvement of 14.2%. Furthermore, compared to BPTT, SOLSA +achieves a 5% higher average learning accuracy with a 72% reduction in memory +cost. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ PI-VEGAN: Physics Informed Variational Embedding Generative Adversarial + Networks for Stochastic Differential Equations + + +
+ We present a new category of physics-informed neural networks called the physics +informed variational embedding generative adversarial network (PI-VEGAN), which +effectively tackles the forward, inverse, and mixed problems of stochastic +differential equations. In these scenarios, the governing equations are known, +but only a limited number of sensor measurements of the system parameters are +available. We integrate the governing physical laws into PI-VEGAN with +automatic differentiation, while introducing a variational encoder for +approximating the latent variables of the actual distribution of the +measurements. These latent variables are integrated into the generator to +facilitate accurate learning of the characteristics of the stochastic partial +differential equations. Our model consists of three components, namely the encoder, +generator, and discriminator, each of which is updated alternately using +the stochastic gradient descent algorithm. We evaluate the effectiveness of +PI-VEGAN in addressing forward, inverse, and mixed problems that require the +concurrent calculation of system parameters and solutions. Numerical results +demonstrate that the proposed method achieves satisfactory stability and +accuracy in comparison with the previous physics-informed generative +adversarial network (PI-WGAN). + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Kernelized Offline Contextual Dueling Bandits + + +
+ Preference-based feedback is important for many applications where direct +evaluation of a reward function is not feasible. A notable recent example +arises in reinforcement learning from human feedback on large language models. +For many of these applications, the cost of acquiring the human feedback can be +substantial or even prohibitive. In this work, we take advantage of the fact +that often the agent can choose contexts at which to obtain human feedback in +order to most efficiently identify a good policy, and introduce the offline +contextual dueling bandit setting. We give an upper-confidence-bound style +algorithm for this setting and prove a regret bound. We also give empirical +confirmation that this method outperforms a similar strategy that uses +uniformly sampled contexts. + +
+
+
+
+
+ + ☆ MAS: Towards Resource-Efficient Federated Multiple-Task Learning ICCV'23 + + +
+ Federated learning (FL) is an emerging distributed machine learning method +that empowers in-situ model training on decentralized edge devices. However, +multiple simultaneous FL tasks could overload resource-constrained devices. In +this work, we propose the first FL system to effectively coordinate and train +multiple simultaneous FL tasks. We first formalize the problem of training +simultaneous FL tasks. Then, we present our new approach, MAS (Merge and +Split), to optimize the performance of training multiple simultaneous FL tasks. +MAS starts by merging FL tasks into an all-in-one FL task with a multi-task +architecture. After training for a few rounds, MAS splits the all-in-one FL +task into two or more FL tasks by using the affinities among tasks measured +during the all-in-one training. It then continues training each split of FL +tasks based on model parameters from the all-in-one training. Extensive +experiments demonstrate that MAS outperforms other methods while reducing +training time by 2x and reducing energy consumption by 40%. We hope this work +will inspire the community to further study and optimize training simultaneous +FL tasks. + +
+
+ comment: ICCV'23. arXiv admin note: substantial text overlap with + arXiv:2207.04202 +
+
+
+
+
+ + ☆ Epsilon*: Privacy Metric for Machine Learning Models + + +
+ We introduce Epsilon*, a new privacy metric for measuring the privacy risk of +a single model instance prior to, during, or after deployment of privacy +mitigation strategies. The metric does not require access to the training data +sampling or model training algorithm. Epsilon* is a function of true positive +and false positive rates in a hypothesis test used by an adversary in a +membership inference attack. We distinguish between quantifying the privacy +loss of a trained model instance and quantifying the privacy loss of the +training mechanism which produces this model instance. Existing approaches in +the privacy auditing literature provide lower bounds for the latter, while our +metric provides a lower bound for the former by relying on an +(${\epsilon}$,${\delta}$)-type of quantification of the privacy of the trained +model instance. We establish a relationship between these lower bounds and show +how to implement Epsilon* to avoid numerical and noise amplification +instability. We further show in experiments on benchmark public data sets that +Epsilon* is sensitive to privacy risk mitigation by training with differential +privacy (DP), where the value of Epsilon* is reduced by up to 800% compared to +the Epsilon* values of non-DP trained baseline models. This metric allows +privacy auditors to be independent of model owners, and enables all +decision-makers to visualize the privacy-utility landscape to make informed +decisions regarding the trade-offs between model privacy and utility. + +
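To illustrate the kind of quantity involved, the sketch below converts a membership-inference operating point (TPR, FPR) into a standard (epsilon, delta)-style lower bound used in privacy auditing, eps >= log((TPR - delta)/FPR). Whether Epsilon* uses exactly this functional form is not claimed here, and the numbers are illustrative.

```python
# Sketch: a generic privacy-loss lower bound from a membership-inference attack's TPR/FPR.
# This is the standard auditing inequality, not necessarily the paper's exact Epsilon* definition.
import numpy as np

def epsilon_lower_bound(tpr, fpr, delta=1e-5):
    if fpr <= 0 or tpr <= delta:
        return 0.0
    return max(0.0, np.log((tpr - delta) / fpr))

# A stronger attack (higher TPR at the same FPR) implies a larger privacy-loss lower bound.
print(epsilon_lower_bound(tpr=0.30, fpr=0.01))
print(epsilon_lower_bound(tpr=0.05, fpr=0.01))
```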
+
+
+
+
+ + ☆ Screening Mammography Breast Cancer Detection + + +
+ Breast cancer is a leading cause of cancer-related deaths, but current +programs are expensive and prone to false positives, leading to unnecessary +follow-up and patient anxiety. This paper proposes a solution to automated +breast cancer detection, to improve the efficiency and accuracy of screening +programs. Different methodologies were tested against the RSNA dataset of +radiographic breast images of roughly 20,000 female patients and yielded an +average validation case pF1 score of 0.56 across methods. + +
+
+ comment: Released @ Apr 2023. For associated project files, see + https://github.com/chakrabortyde/rsna-breast-cancer +
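+ The probabilistic F1 (pF1) reported above is computed from predicted
+probabilities rather than thresholded labels; a minimal sketch of the commonly
+used definition (the toy inputs are illustrative) is:
+
+```python
+import numpy as np
+
+def prob_f1(y_true, y_prob, eps=1e-12):
+    """Probabilistic F1: soft precision/recall from predicted probabilities."""
+    y_true = np.asarray(y_true, dtype=float)
+    y_prob = np.asarray(y_prob, dtype=float)
+    soft_tp = float(np.sum(y_prob * y_true))       # probability mass on true positives
+    precision = soft_tp / (np.sum(y_prob) + eps)   # over all predicted probability mass
+    recall = soft_tp / (np.sum(y_true) + eps)      # over the number of actual positives
+    return 2 * precision * recall / (precision + recall + eps)
+
+print(prob_f1(y_true=[1, 0, 1, 0], y_prob=[0.9, 0.1, 0.6, 0.3]))  # ~0.77
+```
+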
+
+
+
+
+ + ♻ ☆ Tight Bounds for $γ$-Regret via the Decision-Estimation Coefficient + + +
+ In this work, we give a statistical characterization of the $\gamma$-regret +for arbitrary structured bandit problems, the regret which arises when +comparing against a benchmark that is $\gamma$ times the optimal solution. The +$\gamma$-regret emerges in structured bandit problems over a function class +$\mathcal{F}$ where finding an exact optimum of $f \in \mathcal{F}$ is +intractable. Our characterization is given in terms of the $\gamma$-DEC, a +statistical complexity parameter for the class $\mathcal{F}$, which is a +modification of the constrained Decision-Estimation Coefficient (DEC) of Foster +et al., 2023 (and closely related to the original offset DEC of Foster et al., +2021). Our lower bound shows that the $\gamma$-DEC is a fundamental limit for +any model class $\mathcal{F}$: for any algorithm, there exists some $f \in +\mathcal{F}$ for which the $\gamma$-regret of that algorithm scales (nearly) +with the $\gamma$-DEC of $\mathcal{F}$. We provide an upper bound showing that +there exists an algorithm attaining a nearly matching $\gamma$-regret. Due to +significant challenges in applying the prior results on the DEC to the +$\gamma$-regret case, both our lower and upper bounds require novel techniques +and a new algorithm. + +
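+ For orientation, the $\gamma$-regret referred to above is, in the usual
+formulation (the notation here is an assumption), the cumulative shortfall
+against a $\gamma$-discounted optimum:
+
+```latex
+\mathrm{Reg}_{\gamma}(T) \;=\; \sum_{t=1}^{T}\Big(\gamma \cdot \max_{x} f(x) \;-\; f(x_t)\Big),
+\qquad \gamma \in (0, 1],
+```
+
+ where $f \in \mathcal{F}$ is the unknown reward function and $x_t$ is the decision at round $t$.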
+
+
+
+
+ + ♻ ☆ On Provable Copyright Protection for Generative Models ICML 2023 + + +
+ There is a growing concern that learned conditional generative models may +output samples that are substantially similar to some copyrighted data $C$ that +was in their training set. We give a formal definition of $\textit{near +access-freeness (NAF)}$ and prove bounds on the probability that a model +satisfying this definition outputs a sample similar to $C$, even if $C$ is +included in its training set. Roughly speaking, a generative model $p$ is +$\textit{$k$-NAF}$ if for every potentially copyrighted data $C$, the output of +$p$ diverges by at most $k$-bits from the output of a model $q$ that +$\textit{did not access $C$ at all}$. We also give generative model learning +algorithms, which efficiently modify the original generative model learning +algorithm in a black box manner, that output generative models with strong +bounds on the probability of sampling protected content. Furthermore, we +provide promising experiments for both language (transformers) and image +(diffusion) generative models, showing minimal degradation in output quality +while ensuring strong protections against sampling protected content. + +
+
+ comment: Accepted at ICML 2023 +
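+ In symbols, the $k$-NAF condition sketched above amounts to a per-prompt bound
+of the form below; the particular divergence written here (a max-divergence in
+bits) is an assumption for illustration, not necessarily the paper's exact choice:
+
+```latex
+\forall C, \; \forall x: \qquad
+\Delta\big(p(\cdot \mid x) \,\big\|\, q_C(\cdot \mid x)\big) \;\le\; k,
+\qquad \text{e.g.}\;\;
+\Delta(p \,\|\, q) = \log_2 \max_{y} \frac{p(y)}{q(y)},
+```
+
+ where $q_C$ is a "safe" model trained without any access to $C$.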
+
+
+
+
+ + ♻ ☆ A Competitive Learning Approach for Specialized Models: A Solution for + Complex Physical Systems with Distinct Functional Regimes + + +
+ Complex systems in science and engineering sometimes exhibit behavior that +changes across different regimes. Traditional global models struggle to capture +the full range of this complex behavior, limiting their ability to accurately +represent the system. In response to this challenge, we propose a novel +competitive learning approach for obtaining data-driven models of physical +systems. The primary idea behind the proposed approach is to employ dynamic +loss functions for a set of models that are trained concurrently on the data. +Each model competes for each observation during training, allowing for the +identification of distinct functional regimes within the dataset. To +demonstrate the effectiveness of the learning approach, we coupled it with +various regression methods that employ gradient-based optimizers for training. +The proposed approach was tested on various problems involving model discovery +and function approximation, demonstrating its ability to successfully identify +functional regimes, discover true governing equations, and reduce test errors. + +
+
+
+
+
+ + ♻ ☆ A Neural Network Warm-Start Approach for the Inverse Acoustic Obstacle + Scattering Problem + + +
+ We consider the inverse acoustic obstacle problem for sound-soft star-shaped +obstacles in two dimensions wherein the boundary of the obstacle is determined +from measurements of the scattered field at a collection of receivers outside +the object. One of the standard approaches for solving this problem is to +reformulate it as an optimization problem: finding the boundary of the domain +that minimizes the $L^2$ distance between computed values of the scattered +field and the given measurement data. The optimization problem is +computationally challenging since the local set of convexity shrinks with +increasing frequency and results in an increasing number of local minima in the +vicinity of the true solution. In many practical experimental settings, low +frequency measurements are unavailable due to limitations of the experimental +setup or the sensors used for measurement. Thus, obtaining a good initial guess +for the optimization problem plays a vital role in this environment. + We present a neural network warm-start approach for solving the inverse +scattering problem, where an initial guess for the optimization problem is +obtained using a trained neural network. We demonstrate the effectiveness of +our method with several numerical examples. For high frequency problems, this +approach outperforms traditional iterative methods such as Gauss-Newton +initialized without any prior (i.e., initialized using a unit circle), or +initialized using the solution of a direct method such as the linear sampling +method. The algorithm remains robust to noise in the scattered field +measurements and also converges to the true solution for limited aperture data. +However, the number of training samples required to train the neural network +scales exponentially in frequency and the complexity of the obstacles +considered. We conclude with a discussion of this phenomenon and potential +directions for future research. + +
+
+
+
+
+ + ♻ ☆ Embedding Contextual Information through Reward Shaping in Multi-Agent + Learning: A Case Study from Google Football + + +
+ Artificial Intelligence has been used to help humans complete difficult tasks +in complicated environments by providing optimized strategies for +decision-making or by replacing manual labour. In environments including +multiple agents, such as football, the most common methods to train agents are +Imitation Learning and Multi-Agent Reinforcement Learning (MARL). However, the +agents trained by Imitation Learning cannot outperform the expert demonstrator, +so humans can hardly gain new insights from the learnt policy. Besides, +MARL is prone to the credit assignment problem, and in environments with sparse +reward signals it can be inefficient. The objective of our research is +to create a novel reward shaping method that embeds contextual information in +the reward function to solve the aforementioned challenges. We demonstrate this in +the Google Research Football (GRF) environment. We quantify the contextual +information extracted from the game state observation and use this quantification +together with the original sparse reward to create the shaped reward. The +experimental results in the GRF environment show that our reward shaping method +is a useful addition to state-of-the-art MARL algorithms for training agents in +environments with sparse reward signals. + +
+
+
+
+
+ + ♻ ☆ (Ab)using Images and Sounds for Indirect Instruction Injection in + Multi-Modal LLMs + + +
+ We demonstrate how images and sounds can be used for indirect prompt and +instruction injection in multi-modal LLMs. An attacker generates an adversarial +perturbation corresponding to the prompt and blends it into an image or audio +recording. When the user asks the (unmodified, benign) model about the +perturbed image or audio, the perturbation steers the model to output the +attacker-chosen text and/or make the subsequent dialog follow the attacker's +instruction. We illustrate this attack with several proof-of-concept examples +targeting LLaVa and PandaGPT. + +
+
+
+
+
+ + ♻ ☆ Universal consistency of the $k$-NN rule in metric spaces and Nagata + dimension. II + + +
+ We continue to investigate the $k$ nearest neighbour learning rule in +separable metric spaces. Thanks to the results of C\'erou and Guyader (2006) +and Preiss (1983), this rule is known to be universally consistent in every +metric space $X$ that is sigma-finite dimensional in the sense of Nagata. Here +we show that the rule is strongly universally consistent in such spaces in the +absence of ties. Under the tie-breaking strategy applied by Devroye, +Gy\"{o}rfi, Krzy\.{z}ak, and Lugosi (1994) in the Euclidean setting, we manage +to show the strong universal consistency in non-Archimedean metric spaces (that +is, those of Nagata dimension zero). Combining the theorem of C\'erou and +Guyader with results of Assouad and Quentin de Gromard (2006), one deduces that +the $k$-NN rule is universally consistent in metric spaces having finite +dimension in the sense of de Groot. In particular, the $k$-NN rule is +universally consistent in the Heisenberg group, which is not sigma-finite +dimensional in the sense of Nagata, as follows from an example independently +constructed by Kor\'anyi and Reimann (1995) and Sawyer and Wheeden (1992). + +
+
+ comment: Latex 2e, 17 pages. The Heisenberg group is now presented in more + detail, with some proofs and more references added, and a discussion of open + problems added at the end +
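+ A minimal sketch of the $k$-NN rule under discussion, written for an arbitrary
+metric supplied as a function and with uniformly random tie-breaking among
+equidistant points (the names and the toy 0/1 metric are illustrative
+assumptions):
+
+```python
+import numpy as np
+
+def knn_predict(query, points, labels, k, metric, rng=np.random.default_rng(0)):
+    """k-nearest-neighbour rule in a general metric space with random tie-breaking."""
+    d = np.array([metric(query, p) for p in points])
+    order = np.lexsort((rng.random(len(d)), d))  # sort by distance, ties broken at random
+    votes = np.asarray(labels)[order[:k]]
+    values, counts = np.unique(votes, return_counts=True)
+    return values[np.argmax(counts)]
+
+# Toy example with the discrete metric d(x, y) = 1[x != y].
+points = ["aa", "ab", "ba", "bb"]
+labels = [0, 0, 1, 1]
+print(knn_predict("ab", points, labels, k=3, metric=lambda x, y: float(x != y)))
+```
+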
+
+
+
+
+ + ♻ ☆ CALDA: Improving Multi-Source Time Series Domain Adaptation with + Contrastive Adversarial Learning + + +
+ Unsupervised domain adaptation (UDA) provides a strategy for improving +machine learning performance in data-rich (target) domains where ground truth +labels are inaccessible but can be found in related (source) domains. In cases +where meta-domain information such as label distributions is available, weak +supervision can further boost performance. We propose a novel framework, CALDA, +to tackle these two problems. CALDA synergistically combines the principles of +contrastive learning and adversarial learning to robustly support multi-source +UDA (MS-UDA) for time series data. Similar to prior methods, CALDA utilizes +adversarial learning to align source and target feature representations. Unlike +prior approaches, CALDA additionally leverages cross-source label information +across domains. CALDA pulls examples with the same label close to each other, +while pushing apart examples with different labels, reshaping the space through +contrastive learning. Unlike prior contrastive adaptation methods, CALDA +requires neither data augmentation nor pseudo labeling, which may be more +challenging for time series. We empirically validate our proposed approach. +Based on results from human activity recognition, electromyography, and +synthetic datasets, we find utilizing cross-source information improves +performance over prior time series and contrastive methods. Weak supervision +further improves performance, even in the presence of noise, allowing CALDA to +offer generalizable strategies for MS-UDA. Code is available at: +https://github.com/floft/calda + +
+
+ comment: Accepted at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Torchhd: An Open Source Python Library to Support Research on + Hyperdimensional Computing and Vector Symbolic Architectures + + +
+ Hyperdimensional computing (HD), also known as vector symbolic architectures +(VSA), is a framework for computing with distributed representations by +exploiting properties of random high-dimensional vector spaces. The commitment +of the scientific community to aggregate and disseminate research in this +particularly multidisciplinary area has been fundamental for its advancement. +Joining these efforts, we present Torchhd, a high-performance open source +Python library for HD/VSA. Torchhd seeks to make HD/VSA more accessible and +serves as an efficient foundation for further research and application +development. The easy-to-use library builds on top of PyTorch and features +state-of-the-art HD/VSA functionality, clear documentation, and implementation +examples from well-known publications. Comparing publicly available code with +their corresponding Torchhd implementation shows that experiments can run up to +100x faster. Torchhd is available at: +https://github.com/hyperdimensional-computing/torchhd. + +
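+ Torchhd's own API is not reproduced here; purely as a hedged concept
+illustration, the core HD/VSA operations it packages (binding, bundling,
+similarity) can be sketched with plain PyTorch tensors under the common bipolar
+MAP model:
+
+```python
+import torch
+
+torch.manual_seed(0)
+DIM = 10_000
+
+def random_hv():
+    return torch.randint(0, 2, (1, DIM)).float() * 2 - 1   # bipolar {-1, +1} hypervector
+
+def bind(a, b):       # binding: elementwise multiplication (self-inverse)
+    return a * b
+
+def bundle(*hvs):     # bundling: elementwise majority via sum + sign
+    return torch.sign(torch.stack(hvs).sum(dim=0))
+
+def sim(a, b):        # similarity: cosine
+    return torch.nn.functional.cosine_similarity(a, b)
+
+# Encode the record {colour: red, shape: circle}, then query the colour back.
+colour, shape, red, circle = random_hv(), random_hv(), random_hv(), random_hv()
+record = bundle(bind(colour, red), bind(shape, circle))
+noisy = bind(record, colour)                 # unbinding with the key `colour`
+print(sim(noisy, red), sim(noisy, circle))   # high vs. near-zero similarity
+```
+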
+
+
+
+
+ + ♻ ☆ Multiresolution Graph Transformers and Wavelet Positional Encoding for + Learning Hierarchical Structures + + +
+ Contemporary graph learning algorithms are not well-defined for large +molecules since they do not consider the hierarchical interactions among the +atoms, which are essential to determine the molecular properties of +macromolecules. In this work, we propose Multiresolution Graph Transformers +(MGT), the first graph transformer architecture that can learn to represent +large molecules at multiple scales. MGT can learn to produce representations +for the atoms and group them into meaningful functional groups or repeating +units. We also introduce Wavelet Positional Encoding (WavePE), a new positional +encoding method that can guarantee localization in both spectral and spatial +domains. Our proposed model achieves competitive results on two macromolecule +datasets consisting of polymers and peptides, and one drug-like molecule +dataset. Importantly, our model outperforms other state-of-the-art methods and +achieves chemical accuracy in estimating molecular properties (e.g., GAP, HOMO +and LUMO) calculated by Density Functional Theory (DFT) in the polymers +dataset. Furthermore, the visualizations, including clustering results on +macromolecules and low-dimensional spaces of their representations, demonstrate +the capability of our methodology in learning to represent long-range and +hierarchical structures. Our PyTorch implementation is publicly available at +https://github.com/HySonLab/Multires-Graph-Transformer + +
+
+
+
+
+ + ♻ ☆ A Unified Algorithm Framework for Unsupervised Discovery of Skills based + on Determinantal Point Process + + +
+ Learning rich skills through temporal abstractions without supervision of +external rewards is at the frontier of Reinforcement Learning research. +Existing works mainly fall into two distinctive categories: variational and +Laplacian-based skill (a.k.a., option) discovery. The former maximizes the +diversity of the discovered options through a mutual information loss but +overlooks coverage of the state space, while the latter focuses on improving +the coverage of options by increasing connectivity during exploration, but does +not consider diversity. In this paper, we propose a unified framework that +quantifies diversity and coverage through a novel use of the Determinantal +Point Process (DPP) and enables unsupervised option discovery explicitly +optimizing both objectives. Specifically, we define the DPP kernel matrix with +the Laplacian spectrum of the state transition graph and use the expected mode +number in the trajectories as the objective to capture and enhance both +diversity and coverage of the learned options. The proposed option discovery +algorithm is extensively evaluated using challenging tasks built with Mujoco +and Atari, demonstrating that our proposed algorithm substantially outperforms +SOTA baselines from both diversity- and coverage-driven categories. The codes +are available at https://github.com/LucasCJYSDL/ODPP. + +
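+ One convenient DPP fact behind such objectives is that, for an L-ensemble with
+kernel eigenvalues $\lambda_i$, the expected number of selected items is
+$\sum_i \lambda_i / (1 + \lambda_i)$. The sketch below evaluates this on a toy
+graph; the kernel construction from low-frequency Laplacian eigenvectors is an
+illustrative assumption, not the paper's exact design.
+
+```python
+import numpy as np
+
+# Toy state-transition graph: a 6-cycle, and its Laplacian spectrum.
+n = 6
+A = np.zeros((n, n))
+for i in range(n):
+    A[i, (i + 1) % n] = A[(i + 1) % n, i] = 1.0
+L_graph = np.diag(A.sum(axis=1)) - A
+_, eigvecs = np.linalg.eigh(L_graph)
+
+# Illustrative L-ensemble kernel built from low-frequency Laplacian eigenvectors.
+B = eigvecs[:, 1:4]
+K = B @ B.T
+
+lam = np.linalg.eigvalsh(K)
+expected_items = float(np.sum(lam / (1.0 + lam)))   # E[|Y|] of the L-ensemble DPP
+print("expected number of selected states:", expected_items)
+```
+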
+
+
+
+
+ + ♻ ☆ Learning Multi-agent Skills for Tabular Reinforcement Learning using + Factor Graphs + + +
+ Covering skill (a.k.a., option) discovery has been developed to improve the +exploration of reinforcement learning in single-agent scenarios with sparse +reward signals, through connecting the most distant states in the embedding +space provided by the Fiedler vector of the state transition graph. However, +these option discovery methods cannot be directly extended to multi-agent +scenarios, since the joint state space grows exponentially with the number of +agents in the system. Thus, existing research on adopting options in +multi-agent scenarios still relies on single-agent option discovery and fails to +directly discover the joint options that can improve the connectivity of the +joint state space of agents. In this paper, we show that it is indeed possible +to directly compute multi-agent options with collaborative exploratory +behaviors among the agents, while still enjoying the ease of decomposition. Our +key idea is to approximate the joint state space as a Kronecker graph -- the +Kronecker product of individual agents' state transition graphs, based on which +we can directly estimate the Fiedler vector of the joint state space using the +Laplacian spectrum of individual agents' transition graphs. This decomposition +enables us to efficiently construct multi-agent joint options by encouraging +agents to connect the sub-goal joint states which correspond to the +minimum or maximum values of the estimated joint Fiedler vector. The evaluation +based on multi-agent collaborative tasks shows that the proposed algorithm can +successfully identify multi-agent options, and significantly outperforms prior +works using single-agent options or no options, in terms of both faster +exploration and higher cumulative rewards. + +
+
+
+
+
+ + ♻ ☆ Multi-agent Deep Covering Skill Discovery ICML + + +
+ The use of skills (a.k.a., options) can greatly accelerate exploration in +reinforcement learning, especially when only sparse reward signals are +available. While option discovery methods have been proposed for individual +agents, in multi-agent reinforcement learning settings, discovering +collaborative options that can coordinate the behavior of multiple agents and +encourage them to visit the under-explored regions of their joint state space +has not been considered. To address this, we propose Multi-agent Deep Covering +Option Discovery, which constructs the multi-agent options through minimizing +the expected cover time of the multiple agents' joint state space. Also, we +propose a novel framework to adopt the multi-agent options in the MARL process. +In practice, a multi-agent task can usually be divided into some sub-tasks, +each of which can be completed by a sub-group of the agents. Therefore, our +algorithm framework first leverages an attention mechanism to find +collaborative agent sub-groups that would benefit most from coordinated +actions. Then, a hierarchical algorithm, namely HA-MSAC, is developed to learn +the multi-agent options for each sub-group to complete their sub-tasks first, +and then to integrate them through a high-level policy as the solution of the +whole task. This hierarchical option construction allows our framework to +strike a balance between scalability and effective collaboration among the +agents. The evaluation based on multi-agent collaborative tasks shows that the +proposed algorithm can effectively capture the agent interactions with the +attention mechanism, successfully identify multi-agent options, and +significantly outperform prior works using single-agent options or no options, +in terms of both faster exploration and higher task rewards. + +
+
+ comment: This paper was presented in part at the ICML Reinforcement Learning + for Real Life Workshop, July 2021 +
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Medical Image Synthesis + + +
+ The demand for artificial intelligence (AI) in healthcare is rapidly +increasing. However, significant challenges arise from data scarcity and +privacy concerns, particularly in medical imaging. While existing generative +models have achieved success in image synthesis and image-to-image translation +tasks, there remains a gap in the generation of 3D semantic medical images. To +address this gap, we introduce Med-DDPM, a diffusion model specifically +designed for semantic 3D medical image synthesis, effectively tackling data +scarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation +of semantic conditioning, enabling precise control during the image generation +process. Our model outperforms Generative Adversarial Networks (GANs) in terms +of stability and performance, generating diverse and anatomically coherent +images with high visual fidelity. Comparative analysis against state-of-the-art +augmentation techniques demonstrates that Med-DDPM produces comparable results, +highlighting its potential as a data augmentation tool for enhancing model +accuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis +by delivering high-quality and anatomically coherent images. Furthermore, the +integration of semantic conditioning with Med-DDPM holds promise for image +anonymization in the field of biomedical imaging, showcasing the capabilities +of the model in addressing challenges related to data scarcity and privacy +concerns. + +
+
+
+
+
+ + ♻ ☆ SpArX: Sparse Argumentative Explanations for Neural Networks ECAI + + +
+ Neural networks (NNs) have various applications in AI, but explaining their +decisions remains challenging. Existing approaches often focus on explaining +how changing individual inputs affects NNs' outputs. However, an explanation +that is consistent with the input-output behaviour of an NN is not necessarily +faithful to the actual mechanics thereof. In this paper, we exploit +relationships between multi-layer perceptrons (MLPs) and quantitative +argumentation frameworks (QAFs) to create argumentative explanations for the +mechanics of MLPs. Our SpArX method first sparsifies the MLP while maintaining +as much of the original structure as possible. It then translates the sparse +MLP into an equivalent QAF to shed light on the underlying decision process of +the MLP, producing global and/or local explanations. We demonstrate +experimentally that SpArX can give more faithful explanations than existing +approaches, while simultaneously providing deeper insights into the actual +reasoning process of MLPs. + +
+
+ comment: Accepted at the European Conference on Artificial Intelligence (ECAI) + 2023 Conference +
+
+
+
+
+ + ♻ ☆ Reduction of finite sampling noise in quantum neural networks + + +
+ Quantum neural networks (QNNs) use parameterized quantum circuits with +data-dependent inputs and generate outputs through the evaluation of +expectation values. Calculating these expectation values necessitates repeated +circuit evaluations, thus introducing fundamental finite-sampling noise even on +error-free quantum computers. We reduce this noise by introducing the variance +regularization, a technique for reducing the variance of the expectation value +during the quantum model training. This technique requires no additional +circuit evaluations if the QNN is properly constructed. Our empirical findings +demonstrate the reduced variance speeds up the training and lowers the output +noise as well as decreases the number of necessary evaluations of gradient +circuits. This regularization method is benchmarked on the regression of +multiple functions. We show that in our examples, it lowers the variance by an +order of magnitude on average and leads to a significantly reduced noise level +of the QNN. We finally demonstrate QNN training on a real quantum device and +evaluate the impact of error mitigation. Here, the optimization is feasible +only due to the reduced number of necessary shots in the gradient evaluation +resulting from the reduced variance. + +
+
+ comment: 11 pages, 10 figures; refined section 5 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Hyperspectral Inpainting with the Optimisation inspired + Deep Neural Network Prior SC + + +
+ Hyperspectral images (HSIs) cover hundreds or thousands of narrow spectral +bands, conveying a wealth of spatial and spectral information. However, due to +instrumental errors and atmospheric changes, HSIs obtained in +practice are often contaminated by noise and dead pixels (lines), resulting in +missing information that may severely compromise subsequent applications. +We introduce here a novel HSI missing pixel prediction algorithm, called Low +Rank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP +is able to predict missing pixels and bands even when all spectral bands of the +image are missing. The proposed LRS-PnP algorithm is further extended to a +self-supervised model by combining LRS-PnP with the Deep Image Prior (DIP), +called LRS-PnP-DIP. In a series of experiments with real data, it is shown that +LRS-PnP-DIP matches or outperforms state-of-the-art learning-based inpainting +methods. + +
+
+ comment: Presented in ISCS23 +
+
+
+
+
+ + ♻ ☆ Modeling Events and Interactions through Temporal Processes -- A Survey + + +
+ In real-world scenarios, many phenomena produce a collection of events that +occur in continuous time. Point Processes provide a natural mathematical +framework for modeling these sequences of events. In this survey, we +investigate probabilistic models for modeling event sequences through temporal +processes. We revise the notion of event modeling and provide the mathematical +foundations that characterize the literature on the topic. We define an +ontology to categorize the existing approaches in terms of three families: +simple, marked, and spatio-temporal point processes. For each family, we +systematically review the existing approaches based on deep learning. +Finally, we analyze the scenarios where the proposed techniques can be used for +addressing prediction and modeling aspects. + +
+
+ comment: Image replacements +
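+ The models surveyed are typically trained by maximizing the standard temporal
+point process log-likelihood of an event sequence $t_1 < \dots < t_n$ observed
+on $[0, T]$, with $\lambda^{*}(t)$ the conditional intensity given the history:
+
+```latex
+\log \mathcal{L} \;=\; \sum_{i=1}^{n} \log \lambda^{*}(t_i) \;-\; \int_{0}^{T} \lambda^{*}(t)\, dt .
+```
+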
+
+
+
+
+ + ♻ ☆ Learning Neural PDE Solvers with Parameter-Guided Channel Attention ICML2023 + + +
+ Scientific Machine Learning (SciML) is concerned with the development of +learned emulators of physical systems governed by partial differential +equations (PDEs). In application domains such as weather forecasting, molecular +dynamics, and inverse design, ML-based surrogate models are increasingly used +to augment or replace inefficient and often non-differentiable numerical +simulation algorithms. While a number of ML-based methods for approximating the +solutions of PDEs have been proposed in recent years, they typically do not +adapt to the parameters of the PDEs, making it difficult to generalize to PDE +parameters not seen during training. We propose a Channel Attention mechanism +guided by PDE Parameter Embeddings (CAPE) component for neural surrogate models +and a simple yet effective curriculum learning strategy. The CAPE module can be +combined with neural PDE solvers, allowing them to adapt to unseen PDE +parameters. The curriculum learning strategy provides a seamless transition +between teacher-forcing and fully auto-regressive training. We compare CAPE in +conjunction with the curriculum learning strategy using a popular PDE benchmark +and obtain consistent and significant improvements over the baseline models. +The experiments also show several advantages of CAPE, such as its increased +ability to generalize to unseen PDE parameters without large increases in +inference time and parameter count. + +
+
+ comment: accepted for publication in ICML2023 +
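+ The exact CAPE module is not reproduced in this listing; a hedged sketch of the
+general idea (per-channel gates produced from a PDE-parameter embedding and
+applied to the surrogate's feature maps; the module structure and names are
+assumptions) could look like:
+
+```python
+import torch
+import torch.nn as nn
+
+class ParamChannelAttention(nn.Module):
+    """Sketch: a small MLP maps PDE parameters to per-channel gates."""
+    def __init__(self, n_params: int, channels: int, hidden: int = 64):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(n_params, hidden), nn.GELU(),
+            nn.Linear(hidden, channels), nn.Sigmoid(),
+        )
+
+    def forward(self, feats: torch.Tensor, pde_params: torch.Tensor) -> torch.Tensor:
+        # feats: (batch, channels, grid), pde_params: (batch, n_params)
+        gates = self.mlp(pde_params).unsqueeze(-1)   # (batch, channels, 1)
+        return feats * gates
+
+feats = torch.randn(8, 32, 128)   # features of a 1D PDE solution
+params = torch.rand(8, 2)         # e.g. diffusion and advection coefficients
+print(ParamChannelAttention(n_params=2, channels=32)(feats, params).shape)
+```
+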
+
+
+
+
+ + ♻ ☆ Continual Learning for Abdominal Multi-Organ and Tumor Segmentation MICCAI-2023 + + +
+ The ability to dynamically extend a model to new data and classes is critical +for multiple organ and tumor segmentation. However, due to privacy regulations, +accessing previous data and annotations can be problematic in the medical +domain. This poses a significant barrier to preserving the high segmentation +accuracy of the old classes when learning from new classes because of the +catastrophic forgetting problem. In this paper, we first empirically +demonstrate that simply using high-quality pseudo labels can fairly mitigate +this problem in the setting of organ segmentation. Furthermore, we put forward +an innovative architecture designed specifically for continuous organ and tumor +segmentation, which incurs minimal computational overhead. Our proposed design +involves replacing the conventional output layer with a suite of lightweight, +class-specific heads, thereby offering the flexibility to accommodate newly +emerging classes. These heads enable independent predictions for newly +introduced and previously learned classes, effectively minimizing the impact of +new classes on old ones during the course of continual learning. We further +propose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings +into the organ-specific heads. These embeddings encapsulate the semantic +information of each class, informed by extensive image-text co-training. The +proposed method is evaluated on both in-house and public abdominal CT datasets +under organ and tumor segmentation tasks. Empirical results suggest that the +proposed design improves the segmentation performance of a baseline neural +network on newly-introduced and previously-learned classes along the learning +trajectory. + +
+
+ comment: MICCAI-2023 +
+
+
+
+
+ + ♻ ☆ Quantitative CLTs in Deep Neural Networks + + +
+ We study the distribution of a fully connected neural network with random +Gaussian weights and biases in which the hidden layer widths are proportional +to a large constant $n$. Under mild assumptions on the non-linearity, we obtain +quantitative bounds on normal approximations valid at large but finite $n$ and +any fixed network depth. Our theorems show both for the finite-dimensional +distributions and the entire process, that the distance between a random fully +connected network (and its derivatives) to the corresponding infinite width +Gaussian process scales like $n^{-\gamma}$ for $\gamma>0$, with the exponent +depending on the metric used to measure discrepancy. Our bounds are strictly +stronger in terms of their dependence on network width than any previously +available in the literature; in the one-dimensional case, we also prove that +they are optimal, i.e., we establish matching lower bounds. + +
+
+
+
+
+ + ♻ ☆ Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques + + +
+ In the contemporary digital landscape, online reviews have become an +indispensable tool for promoting products and services across various +businesses. Marketers, advertisers, and online businesses have found incentives +to create deceptive positive reviews for their products and negative reviews +for their competitors' offerings. As a result, the writing of deceptive reviews +has become an unavoidable practice for businesses seeking to promote themselves +or undermine their rivals. Detecting such deceptive reviews has become an +intense and ongoing area of research. This research paper proposes a machine +learning model to identify deceptive reviews, with a particular focus on +restaurants. This study examines the results of numerous experiments +conducted on a dataset of restaurant reviews known as the Deceptive Opinion +Spam Corpus. To accomplish this, an n-gram model with a capped feature vocabulary +(max features) is developed to effectively identify deceptive content, with a +particular focus on fake reviews. A benchmark study is undertaken to explore the performance of two +different feature extraction techniques, which are then coupled with five +distinct machine learning classification algorithms. The experimental results +reveal that the passive aggressive classifier stands out among the various +algorithms, showcasing the highest accuracy not only in text classification but +also in identifying fake reviews. Moreover, the research delves into data +augmentation and implements various deep learning techniques to further enhance +the process of detecting deceptive reviews. The findings shed light on the +efficacy of the proposed machine learning approach and offer valuable insights +into dealing with deceptive reviews in the realm of online businesses. + +
+
+ comment: 6 pages, 3 figures +
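+ A minimal sketch of the kind of pipeline described above (n-gram features with
+a capped vocabulary feeding a passive aggressive classifier); the toy reviews
+and parameter values are assumptions, not the study's configuration:
+
+```python
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import PassiveAggressiveClassifier
+from sklearn.pipeline import make_pipeline
+
+# Toy data; the study uses the Deceptive Opinion Spam Corpus instead.
+reviews = ["the food was good and the staff were friendly",
+           "best restaurant ever you absolutely must go right now",
+           "service was slow but the pasta was decent",
+           "unbelievable experience perfect in every single way"]
+labels = [0, 1, 0, 1]   # 0 = truthful, 1 = deceptive (illustrative)
+
+model = make_pipeline(
+    CountVectorizer(ngram_range=(1, 2), max_features=5000),   # n-grams + max features
+    PassiveAggressiveClassifier(max_iter=1000, random_state=0),
+)
+model.fit(reviews, labels)
+print(model.predict(["the pasta was decent and the staff were friendly"]))
+```
+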
+
+
+
+
+ + ♻ ☆ Bayesian taut splines for estimating the number of modes + + +
+ The number of modes in a probability density function is representative of +the model's complexity and can also be viewed as the number of existing +subpopulations. Despite its relevance, little research has been devoted to its +estimation. Focusing on the univariate setting, we propose a novel approach +targeting prediction accuracy inspired by some overlooked aspects of the +problem. We argue for the need for structure in the solutions, the subjective +and uncertain nature of modes, and the convenience of a holistic view blending +global and local density properties. Our method builds upon a combination of +flexible kernel estimators and parsimonious compositional splines. Feature +exploration, model selection and mode testing are implemented in the Bayesian +inference paradigm, providing soft solutions and allowing to incorporate expert +judgement in the process. The usefulness of our proposal is illustrated through +a case study in sports analytics, showcasing multiple companion visualisation +tools. A thorough simulation study demonstrates that traditional +modality-driven approaches paradoxically struggle to provide accurate results. +In this context, our method emerges as a top-tier alternative offering +innovative solutions for analysts. + +
+
+ comment: 20 pages, 8 figures (manuscript) + 19 pages, 16 figures + (supplementary material) +
+
+
+
+
+ + ♻ ☆ Confidence intervals for performance estimates in 3D medical image + segmentation + + +
+ Medical segmentation models are evaluated empirically. As such an evaluation +is based on a limited set of example images, it is unavoidably noisy. Beyond a +mean performance measure, reporting confidence intervals is thus crucial. +However, this is rarely done in medical image segmentation. The width of the +confidence interval depends on the test set size and on the spread of the +performance measure (its standard deviation across the test set). For +classification, many test images are needed to avoid wide confidence intervals. +Segmentation, however, has not been studied in this respect, and it differs in the amount of +information brought by a given test image. In this paper, we study the typical +confidence intervals in medical image segmentation. We carry out experiments on 3D +image segmentation using the standard nnU-net framework, two datasets from the +Medical Decathlon challenge and two performance measures: the Dice accuracy and +the Hausdorff distance. We show that the parametric confidence intervals are +reasonable approximations of the bootstrap estimates for varying test set sizes +and spread of the performance metric. Importantly, we show that the test size +needed to achieve a given precision is often much lower than for classification +tasks. Typically, a 1% wide confidence interval requires about 100-200 test +samples when the spread is low (standard-deviation around 3%). More difficult +segmentation tasks may lead to higher spreads and require over 1000 samples. + +
+
+ comment: 10 pages +
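+ A minimal sketch of the two confidence-interval constructions being compared,
+applied to an array of per-image Dice scores (the synthetic scores are
+illustrative only):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+dice = np.clip(rng.normal(0.85, 0.03, size=150), 0.0, 1.0)   # synthetic per-image Dice
+
+# Parametric (normal-approximation) 95% CI for the mean.
+mean = dice.mean()
+sem = dice.std(ddof=1) / np.sqrt(len(dice))
+parametric_ci = (mean - 1.96 * sem, mean + 1.96 * sem)
+
+# Percentile-bootstrap 95% CI for the mean.
+boot_means = [rng.choice(dice, size=len(dice), replace=True).mean() for _ in range(10_000)]
+bootstrap_ci = tuple(np.percentile(boot_means, [2.5, 97.5]))
+
+print("parametric:", parametric_ci)
+print("bootstrap :", bootstrap_ci)
+```
+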
+
+
+
+
+ + ♻ ☆ SegNetr: Rethinking the local-global interactions and skip connections + in U-shaped networks + + +
+ Recently, U-shaped networks have dominated the field of medical image +segmentation due to their simple and easily tuned structure. However, existing +U-shaped segmentation networks: 1) mostly focus on designing complex +self-attention modules to compensate for the lack of long-term dependence based +on convolution operation, which increases the overall number of parameters and +computational complexity of the network; 2) simply fuse the features of encoder +and decoder, ignoring the connection between their spatial locations. In this +paper, we rethink the above problem and build a lightweight medical image +segmentation network, called SegNetr. Specifically, we introduce a novel +SegNetr block that can perform local-global interactions dynamically at any +stage and with only linear complexity. At the same time, we design a general +information retention skip connection (IRSC) to preserve the spatial location +information of encoder features and achieve accurate fusion with the decoder +features. We validate the effectiveness of SegNetr on four mainstream medical +image segmentation datasets, with 59\% and 76\% fewer parameters and GFLOPs +than vanilla U-Net, while achieving segmentation performance comparable to +state-of-the-art methods. Notably, the components proposed in this paper can be +applied to other U-shaped networks to improve their segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Shortcut Detection with Variational Autoencoders ICML 2023 + + +
+ For real-world applications of machine learning (ML), it is essential that +models make predictions based on well-generalizing features rather than +spurious correlations in the data. The identification of such spurious +correlations, also known as shortcuts, is a challenging problem and has so far +been scarcely addressed. In this work, we present a novel approach to detect +shortcuts in image and audio datasets by leveraging variational autoencoders +(VAEs). The disentanglement of features in the latent space of VAEs allows us +to discover feature-target correlations in datasets and semi-automatically +evaluate them for ML shortcuts. We demonstrate the applicability of our method +on several real-world datasets and identify shortcuts that have not been +discovered before. + +
+
+ comment: Accepted at the ICML 2023 Workshop on Spurious Correlations, + Invariance and Stability +
+
+
+
+
+ + ♻ ☆ MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image + Segmentation MICCAI 2023 + + +
+ There has been exploding interest in embracing Transformer-based +architectures for medical image segmentation. However, the lack of large-scale +annotated medical datasets makes achieving performances equivalent to those in +natural images challenging. Convolutional networks, in contrast, have higher +inductive biases and, consequently, are easily trainable to high performance. +Recently, the ConvNeXt architecture attempted to modernize the standard ConvNet +by mirroring Transformer blocks. In this work, we improve upon this to design a +modernized and scalable convolutional architecture customized to the challenges of +data-scarce medical settings. We introduce MedNeXt, a Transformer-inspired +large kernel segmentation network which introduces - 1) A fully ConvNeXt 3D +Encoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up +and downsampling blocks to preserve semantic richness across scales, 3) A novel +technique to iteratively increase kernel sizes by upsampling small kernel +networks, to prevent performance saturation on limited medical data, 4) +Compound scaling at multiple levels (depth, width, kernel size) of MedNeXt. +This leads to state-of-the-art performance on 4 tasks on CT and MRI modalities +and varying dataset sizes, representing a modernized deep architecture for +medical image segmentation. Our code is made publicly available at: +https://github.com/MIC-DKFZ/MedNeXt. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Is Your Model "MADD"? A Novel Metric to Evaluate Algorithmic Fairness + for Predictive Student Models + + +
+ Predictive student models are increasingly used in learning environments due +to their ability to enhance educational outcomes and support stakeholders in +making informed decisions. However, predictive models can be biased and produce +unfair outcomes, leading to potential discrimination against some students and +possible harmful long-term implications. This has prompted research on fairness +metrics meant to capture and quantify such biases. Nonetheless, so far, +existing fairness metrics used in education are predictive +performance-oriented, focusing on assessing biased outcomes across groups of +students, without considering the behaviors of the models or the severity of +the biases in the outcomes. Therefore, we propose a novel metric, the Model +Absolute Density Distance (MADD), to analyze models' discriminatory behaviors +independently from their predictive performance. We also provide a +complementary visualization-based analysis to enable fine-grained human +assessment of how the models discriminate between groups of students. We +evaluate our approach on the common task of predicting student success in +online courses, using several common predictive classification models on an +open educational dataset. We also compare our metric to the only predictive +performance-oriented fairness metric developed in education, ABROCA. Results on +this dataset show that: (1) fair predictive performance does not guarantee fair +models' behaviors and thus fair outcomes, (2) there is no direct relationship +between data bias and predictive performance bias or discriminatory behavior +bias, and (3) trained on the same data, models exhibit different discriminatory +behaviors, depending also on the sensitive feature considered. We thus recommend +using the MADD on models that show satisfying predictive performance, to gain a +finer-grained understanding of how they behave and to refine model selection +and usage. + +
+
+ comment: 12 pages, conference +
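+ As a hedged sketch only (assuming the metric compares the two groups'
+distributions of predicted probabilities via binned densities; the bin count
+and names are assumptions, see the paper for the exact definition), a MADD-style
+quantity can be computed as:
+
+```python
+import numpy as np
+
+def madd_sketch(probs_group0, probs_group1, n_bins=100):
+    """Absolute difference between the groups' predicted-probability densities."""
+    bins = np.linspace(0.0, 1.0, n_bins + 1)
+    d0, _ = np.histogram(probs_group0, bins=bins)
+    d1, _ = np.histogram(probs_group1, bins=bins)
+    d0 = d0 / max(d0.sum(), 1)   # proportion of each group per bin
+    d1 = d1 / max(d1.sum(), 1)
+    return float(np.abs(d0 - d1).sum())   # 0 = identical behaviour, 2 = disjoint
+
+rng = np.random.default_rng(0)
+print(madd_sketch(rng.beta(5, 2, 1000), rng.beta(2, 5, 1000)))
+```
+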
+
+
+
+
+ + ♻ ☆ Deep learning based Meta-modeling for Multi-objective Technology + Optimization of Electrical Machines + + +
+ Optimization of rotating electrical machines is both time-consuming and +computationally expensive. Because of their different parametrizations, design +optimization is commonly executed separately for each machine technology. In +this paper, we present the application of a variational auto-encoder (VAE) to +optimize two different machine technologies simultaneously, namely an +asynchronous machine and a permanent magnet synchronous machine. After +training, we employ a deep neural network and a decoder as meta-models to +predict global key performance indicators (KPIs) and generate associated new +designs, respectively, through a unified latent space in the optimization loop. +Numerical results demonstrate concurrent parametric multi-objective technology +optimization in the high-dimensional design space. The VAE-based approach is +quantitatively compared to a classical deep learning-based direct approach for +KPI prediction. + +
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ IsoEx: an explainable unsupervised approach to process event logs cyber + investigation + + +
+ 39 seconds. That is the timelapse between two consecutive cyber attacks as of +2023. Meaning that by the time you are done reading this abstract, about 1 or 2 +additional cyber attacks would have occurred somewhere in the world. In this +context of highly increased frequency of cyber threats, Security Operation +Centers (SOC) and Computer Emergency Response Teams (CERT) can be overwhelmed. +In order to relieve the cybersecurity teams in their investigative effort and +help them focus on more added-value tasks, machine learning approaches and +methods started to emerge. This paper introduces a novel method, IsoEx, for +detecting anomalous and potentially problematic command lines during the +investigation of contaminated devices. IsoEx is built around a set of features +that leverages the log structure of the command line, as well as its +parent/child relationship, to achieve a greater accuracy than traditional +methods. To detect anomalies, IsoEx resorts to an unsupervised anomaly +detection technique that is both highly sensitive and lightweight. A key +contribution of the paper is its emphasis on interpretability, achieved through +the features themselves and the application of eXplainable Artificial +Intelligence (XAI) techniques and visualizations. This is critical to ensure +the adoption of the method by SOC and CERT teams, as the paper argues that the +current literature on machine learning for log investigation has not adequately +addressed the issue of explainability. This method was proven efficient in a +real-life environment as it was built to support a company's SOC and CERT. + +
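+ The paper's feature set and detector are not reproduced here; as a hedged
+sketch of the general recipe (hand-crafted command-line features fed to a
+lightweight unsupervised anomaly detector, with an Isolation Forest as an
+assumed stand-in), one could write:
+
+```python
+import numpy as np
+from sklearn.ensemble import IsolationForest
+
+def featurize(cmd: str) -> list:
+    """Toy command-line features: length, token count, flag count, byte entropy.
+    Illustrative stand-ins for the structural/parent-child features in the paper."""
+    tokens = cmd.split()
+    counts = np.unique(np.frombuffer(cmd.encode(), dtype=np.uint8), return_counts=True)[1]
+    p = counts / counts.sum()
+    return [len(cmd), len(tokens), sum(t.startswith("-") for t in tokens),
+            float(-(p * np.log2(p)).sum())]
+
+cmds = ["ls -la /tmp", "cat /etc/passwd", "grep -r error /var/log",
+        "ping -c 4 8.8.8.8", "powershell -enc aQBlAHgAIABjAGEAbABjAA=="]
+X = np.array([featurize(c) for c in cmds])
+scores = IsolationForest(random_state=0).fit(X).score_samples(X)  # lower = more anomalous
+for score, cmd in sorted(zip(scores, cmds)):
+    print(f"{score:7.3f}  {cmd}")
+```
+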
+
+
+
+
+ + ♻ ☆ Sound Demixing Challenge 2023 Music Demixing Track Technical Report: + TFC-TDF-UNet v3 + + +
+ In this report, we present our award-winning solutions for the Music Demixing +Track of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a +time-efficient music source separation model that achieves state-of-the-art +results on the MUSDB benchmark. We then give full details regarding our +solutions for each Leaderboard, including a loss masking approach for +noise-robust training. Code for reproducing model training and final +submissions is available at github.com/kuielab/sdx23. + +
+
+ comment: 5 pages, 4 tables +
+
+
+
+
+ + ♻ ☆ Editable User Profiles for Controllable Text Recommendation SIGIR-2023 + + +
+ Methods for making high-quality recommendations often rely on learning latent +representations from interaction data. These methods, while performant, do not +provide ready mechanisms for users to control the recommendation they receive. +Our work tackles this problem by proposing LACE, a novel concept value +bottleneck model for controllable text recommendations. LACE represents each +user with a succinct set of human-readable concepts through retrieval given +user-interacted documents and learns personalized representations of the +concepts based on user documents. This concept based user profile is then +leveraged to make recommendations. The design of our model affords control over +the recommendations through a number of intuitive interactions with a +transparent user profile. We first establish the quality of recommendations +obtained from LACE in an offline evaluation on three recommendation tasks +spanning six datasets in warm-start, cold-start, and zero-shot setups. Next, we +validate the controllability of LACE under simulated user interactions. +Finally, we implement LACE in an interactive controllable recommender system +and conduct a user study to demonstrate that users are able to improve the +quality of recommendations they receive through interactions with an editable +user profile. + +
+
+ comment: SIGIR-2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces ICCV 2023 + + +
+ Recent advances in face manipulation using StyleGAN have produced impressive +results. However, StyleGAN is inherently limited to cropped aligned faces at a +fixed image resolution it is pre-trained on. In this paper, we propose a simple +and effective solution to this limitation by using dilated convolutions to +rescale the receptive fields of shallow layers in StyleGAN, without altering +any model parameters. This allows fixed-size small features at shallow layers +to be extended into larger ones that can accommodate variable resolutions, +making them more robust in characterizing unaligned faces. To enable real face +inversion and manipulation, we introduce a corresponding encoder that provides +the first-layer feature of the extended StyleGAN in addition to the latent +style code. We validate the effectiveness of our method using unaligned face +inputs of various resolutions in a diverse set of face manipulation tasks, +including facial attribute editing, super-resolution, sketch/mask-to-face +translation, and face toonification. + +
+
+ comment: ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX + Project page: https://www.mmlab-ntu.com/project/styleganex/ +
+
+
+
+
+ + ♻ ☆ Multi-scale Attention Flow for Probabilistic Time Series Forecasting + + +
+ The probabilistic prediction of multivariate time series is a notoriously +challenging but practical task. On the one hand, the challenge is how to +effectively capture the cross-series correlations between interacting time +series, to achieve accurate distribution modeling. On the other hand, we should +consider how to capture the contextual information within time series more +accurately to model multivariate temporal dynamics of time series. In this +work, we propose a novel non-autoregressive deep learning model, called +Multi-scale Attention Normalizing Flow (MANF), where we integrate multi-scale +attention and relative position information, and the multivariate data +distribution is represented by a conditional normalizing flow. Additionally, +compared with autoregressive modeling methods, our model avoids the influence +of cumulative error and does not increase the time complexity. Extensive +experiments demonstrate that our model achieves state-of-the-art performance on +many popular multivariate datasets. + +
+
+
+
+
+ + ♻ ☆ MolFM: A Multimodal Molecular Foundation Model + + +
+ Molecular knowledge resides within three different modalities of information +sources: molecular structures, biomedical documents, and knowledge bases. +Effective incorporation of molecular knowledge from these modalities holds +paramount significance in facilitating biomedical research. However, existing +multimodal molecular foundation models exhibit limitations in capturing +intricate connections between molecular structures and texts, and more +importantly, none of them attempt to leverage a wealth of molecular expertise +derived from knowledge graphs. In this study, we introduce MolFM, a multimodal +molecular foundation model designed to facilitate joint representation learning +from molecular structures, biomedical texts, and knowledge graphs. We propose +cross-modal attention between atoms of molecular structures, neighbors of +molecule entities and semantically related texts to facilitate cross-modal +comprehension. We provide theoretical analysis that our cross-modal +pre-training captures local and global molecular knowledge by minimizing the +distance in the feature space between different modalities of the same +molecule, as well as molecules sharing similar structures or functions. MolFM +achieves state-of-the-art performance on various downstream tasks. On +cross-modal retrieval, MolFM outperforms existing models with 12.13% and 5.04% +absolute gains under the zero-shot and fine-tuning settings, respectively. +Furthermore, qualitative analysis showcases MolFM's implicit ability to provide +grounding from molecular substructures and knowledge graphs. Code and models +are available on https://github.com/BioFM/OpenBioMed. + +
+
+ comment: 31 pages, 15 figures, and 15 tables +
+
+
+
+
+ + ♻ ☆ Is Homophily a Necessity for Graph Neural Networks? + + +
+ Graph neural networks (GNNs) have shown great prowess in learning +representations suitable for numerous graph-based machine learning tasks. When +applied to semi-supervised node classification, GNNs are widely believed to +work well due to the homophily assumption ("like attracts like"), and fail to +generalize to heterophilous graphs where dissimilar nodes connect. Recent works +design new architectures to overcome such heterophily-related limitations, +citing poor baseline performance and new architecture improvements on a few +heterophilous graph benchmark datasets as evidence for this notion. In our +experiments, we empirically find that standard graph convolutional networks +(GCNs) can actually achieve better performance than such carefully designed +methods on some commonly used heterophilous graphs. This motivates us to +reconsider whether homophily is truly necessary for good GNN performance. We +find that this claim is not quite true, and in fact, GCNs can achieve strong +performance on heterophilous graphs under certain conditions. Our work +carefully characterizes these conditions, and provides supporting theoretical +understanding and empirical observations. Finally, we examine existing +heterophilous graphs benchmarks and reconcile how the GCN (under)performs on +them based on this understanding. + +
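+ A quick way to see what is at stake is the edge homophily ratio, the fraction
+of edges whose endpoints share a label; the toy graph below is illustrative:
+
+```python
+import numpy as np
+
+def edge_homophily(edges, labels):
+    """Fraction of edges connecting same-label nodes (1 = homophilous, 0 = heterophilous)."""
+    edges = np.asarray(edges)
+    labels = np.asarray(labels)
+    return float((labels[edges[:, 0]] == labels[edges[:, 1]]).mean())
+
+edges = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)]   # a small 5-cycle
+labels = [0, 0, 1, 1, 0]
+print(edge_homophily(edges, labels))   # 0.6 for this toy graph
+```
+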
+
+
+
+
+ + ♻ ☆ Simplifying Momentum-based Positive-definite Submanifold Optimization + with Applications to Deep Learning ICML 2023 + + +
+ Riemannian submanifold optimization with momentum is computationally +challenging because, to ensure that the iterates remain on the submanifold, we +often need to solve difficult differential equations. Here, we simplify such +difficulties for a class of structured symmetric positive-definite matrices +with the affine-invariant metric. We do so by proposing a generalized version +of the Riemannian normal coordinates that dynamically orthonormalizes the +metric and locally converts the problem into an unconstrained problem in the +Euclidean space. We use our approach to simplify existing approaches for +structured covariances and develop matrix-inverse-free $2^\text{nd}$-order +optimizers for deep learning with low precision by using only matrix +multiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL + +
+
+ comment: An updated version of the ICML 2023 paper. Updated the main text and + added more numerical results for DNNs including a new baseline method and + improving existing baseline methods +
+
+
+
+
+ + ♻ ☆ The activity-weight duality in feed forward neural networks: The + geometric determinants of generalization + + +
+ One of the fundamental problems in machine learning is generalization. In +neural network models with a large number of weights (parameters), many +solutions can be found to fit the training data equally well. The key question +is which solution can describe testing data not in the training set. Here, we +report the discovery of an exact duality (equivalence) between changes in +activities in a given layer of neurons and changes in weights that connect to +the next layer of neurons in a densely connected layer in any feed forward +neural network. The activity-weight (A-W) duality allows us to map variations +in inputs (data) to variations of the corresponding dual weights. By using this +mapping, we show that the generalization loss can be decomposed into a sum of +contributions from different eigen-directions of the Hessian matrix of the loss +function at the solution in weight space. The contribution from a given +eigen-direction is the product of two geometric factors (determinants): the +sharpness of the loss landscape and the standard deviation of the dual weights, +which is found to scale with the weight norm of the solution. Our results +provide a unified framework, which we used to reveal how different +regularization schemes (weight decay, stochastic gradient descent with +different batch sizes and learning rates, dropout), training data size, and +labeling noise affect generalization performance by controlling either one or +both of these two geometric determinants for generalization. These insights can +be used to guide the development of algorithms for finding more generalizable +solutions in overparametrized neural networks. + +
+
+
+
+
+ + ♻ ☆ SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning + + +
+ SecureBoost is a tree-boosting algorithm leveraging homomorphic encryption to +protect data privacy in the vertical federated learning setting. It is widely used +in fields such as finance and healthcare due to its interpretability, +effectiveness, and privacy-preserving capability. However, SecureBoost suffers +from high computational complexity and risk of label leakage. To harness the +full potential of SecureBoost, hyperparameters of SecureBoost should be +carefully chosen to strike an optimal balance between utility, efficiency, and +privacy. Existing methods either set hyperparameters empirically or +heuristically, which are far from optimal. To fill this gap, we propose a +Constrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto-optimal +solutions, each of which is a set of hyperparameters achieving an +optimal tradeoff between utility loss, training cost, and privacy leakage. We +design measurements of the three objectives. In particular, the privacy leakage +is measured using our proposed instance clustering attack. Experimental results +demonstrate that CMOSB yields not only hyperparameters superior to the +baseline but also optimal sets of hyperparameters that can support the flexible +requirements of FL participants. + +
+
+ comment: FL-ICAI'23 +
+
+
+
+
+ + ♻ ☆ Factoring the Matrix of Domination: A Critical Review and Reimagination + of Intersectionality in AI Fairness + + +
+ Intersectionality is a critical framework that, through inquiry and praxis, +allows us to examine how social inequalities persist through domains of +structure and discipline. Given AI fairness' raison d'etre of "fairness", we +argue that adopting intersectionality as an analytical framework is pivotal to +effectively operationalizing fairness. Through a critical review of how +intersectionality is discussed in 30 papers from the AI fairness literature, we +deductively and inductively: 1) map how intersectionality tenets operate within +the AI fairness paradigm and 2) uncover gaps between the conceptualization and +operationalization of intersectionality. We find that researchers +overwhelmingly reduce intersectionality to optimizing for fairness metrics over +demographic subgroups. They also fail to discuss their social context and when +mentioning power, they mostly situate it only within the AI pipeline. We: 3) +outline and assess the implications of these gaps for critical inquiry and +praxis, and 4) provide actionable recommendations for AI fairness researchers +to engage with intersectionality in their work by grounding it in AI +epistemology. + +
+
+ comment: To appear at AIES 2023 +
+
+
+
+
+ + ♻ ☆ Invariant Slot Attention: Object Discovery with Slot-Centric Reference + Frames ICML 2023 + + +
+ Automatically discovering composable abstractions from raw perceptual data is +a long-standing challenge in machine learning. Recent slot-based neural +networks that learn about objects in a self-supervised manner have made +exciting progress in this direction. However, they typically fall short at +adequately capturing spatial symmetries present in the visual world, which +leads to sample inefficiency, such as when entangling object appearance and +pose. In this paper, we present a simple yet highly effective method for +incorporating spatial symmetries via slot-centric reference frames. We +incorporate equivariance to per-object pose transformations into the attention +and generation mechanism of Slot Attention by translating, scaling, and +rotating position encodings. These changes result in little computational +overhead, are easy to implement, and can result in large gains in terms of data +efficiency and overall improvements to object discovery. We evaluate our method +on a wide range of synthetic object discovery benchmarks namely CLEVR, +Tetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising +improvements on the challenging real-world Waymo Open dataset. + +
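One plausible reading of the slot-centric reference frame idea is that an absolute position grid is re-expressed relative to each slot's estimated pose before it is used in position encodings. The sketch below illustrates only that geometric step, with a made-up pose parameterization (position, isotropic scale, rotation angle); the paper's actual formulation may differ.

```python
import numpy as np

def slot_relative_grid(h, w, slot_pos, slot_scale, slot_angle):
    """Re-express an absolute position grid in a slot-centric frame by
    translating, scaling, and rotating it (pose parameterization is assumed)."""
    ys, xs = np.meshgrid(np.linspace(-1, 1, h), np.linspace(-1, 1, w), indexing="ij")
    grid = np.stack([xs, ys], axis=-1)            # (h, w, 2) absolute coordinates
    rel = (grid - slot_pos) / slot_scale          # translate to the slot, then rescale
    c, s = np.cos(slot_angle), np.sin(slot_angle)
    rot = np.array([[c, -s], [s, c]])
    return rel @ rot.T                            # rotate into the slot's orientation

rel = slot_relative_grid(8, 8, slot_pos=np.array([0.25, -0.1]),
                         slot_scale=0.5, slot_angle=np.pi / 6)
print(rel.shape)                                  # (8, 8, 2), fed to position encodings
```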
+
+ comment: Accepted at ICML 2023. Project page: https://invariantsa.github.io/ +
+
+
+
+
+ + ♻ ☆ Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency + Analysis + + +
+ Graph neural networks (GNNs) are among the most powerful tools in deep +learning. They routinely solve complex problems on unstructured networks, such +as node classification, graph classification, or link prediction, with high +accuracy. However, both inference and training of GNNs are complex, and they +uniquely combine the features of irregular graph processing with dense and +regular computations. This complexity makes it very challenging to execute GNNs +efficiently on modern massively parallel architectures. To alleviate this, we +first design a taxonomy of parallelism in GNNs, considering data and model +parallelism, and different forms of pipelining. Then, we use this taxonomy to +investigate the amount of parallelism in numerous GNN models, GNN-driven +machine learning tasks, software frameworks, or hardware accelerators. We use +the work-depth model, and we also assess communication volume and +synchronization. We specifically focus on the sparsity/density of the +associated tensors, in order to understand how to effectively apply techniques +such as vectorization. We also formally analyze GNN pipelining, and we +generalize the established Message-Passing class of GNN models to cover +arbitrary pipeline depths, facilitating future optimizations. Finally, we +investigate different forms of asynchronicity, navigating the path for future +asynchronous parallel GNN pipelines. The outcomes of our analysis are +synthesized in a set of insights that help to maximize GNN performance, and a +comprehensive list of challenges and opportunities for further research into +efficient GNN computations. Our work will help to advance the design of future +GNNs. + +
+
+
+
+
+ + ♻ ☆ Asynchronous Multi-Model Dynamic Federated Learning over Wireless + Networks: Theory, Modeling, and Optimization + + +
+ Federated learning (FL) has emerged as a key technique for distributed +machine learning (ML). Most literature on FL has focused on ML model training +for (i) a single task/model, with (ii) a synchronous scheme for uplink/downlink +transfer of model parameters, and (iii) a static data distribution setting +across devices. These assumptions are often not well representative of +conditions encountered in practical FL environments. To address this, we +develop DMA-FL, which considers dynamic FL with multiple downstream tasks to be +trained over an asynchronous model transmission architecture. We first +characterize the convergence of ML model training under DMA-FL via introducing +a family of scheduling tensors and rectangular functions to capture the +scheduling of devices. Our convergence analysis sheds light on the impact of +resource allocation, device scheduling, and individual model states on the +performance of ML models. We then formulate a non-convex mixed integer +optimization problem for jointly configuring the resource allocation and device +scheduling to strike an efficient trade-off between energy consumption and ML +performance. We develop a solution methodology employing successive convex +approximations with convergence guarantee to a stationary point. Through +numerical simulations, we reveal the advantages of DMA-FL in terms of model +performance and network resource savings. + +
+
+ comment: Submission to IEEE Transactions on Cognitive Communications and + Networking +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ♻ ☆ VERITE: A Robust Benchmark for Multimodal Misinformation Detection + Accounting for Unimodal Bias + + +
+ Multimedia content has become ubiquitous on social media platforms, leading
+to the rise of multimodal misinformation (MM) and the urgent need for effective
+strategies to detect and prevent its spread. In recent years, the challenge of
+multimodal misinformation detection (MMD) has garnered significant attention
+from researchers and has mainly involved the creation of annotated, weakly
+annotated, or synthetically generated training datasets, along with the
+development of various deep learning MMD models. However, the problem of
+unimodal bias in MMD benchmarks -- where biased or unimodal methods outperform
+their multimodal counterparts on an inherently multimodal task -- has been
+overlooked. In this study, we systematically investigate and identify the
+presence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS),
+raising concerns about their suitability for reliable evaluation. To address
+this issue, we introduce the "VERification of Image-TExt pairs" (VERITE)
+benchmark for MMD which incorporates real-world data, excludes "asymmetric
+multimodal misinformation" and utilizes "modality balancing". We conduct an
+extensive comparative study with a Transformer-based architecture that shows
+the ability of VERITE to effectively address unimodal bias, rendering it a
+robust evaluation framework for MMD. Furthermore, we introduce a new method --
+termed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating
+realistic synthetic training data that preserve crossmodal relations between
+legitimate images and false human-written captions. By leveraging CHASMA in the
+training process, we observe consistent and notable improvements in predictive
+performance on VERITE, with a 9.2% increase in accuracy. We release our code
+at: https://github.com/stevejpapad/image-text-verification
+
+
+
+
+
+
+ + ♻ ☆ Sound Demixing Challenge 2023 Music Demixing Track Technical Report: + TFC-TDF-UNet v3 + + +
+ In this report, we present our award-winning solutions for the Music Demixing +Track of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a +time-efficient music source separation model that achieves state-of-the-art +results on the MUSDB benchmark. We then give full details regarding our +solutions for each Leaderboard, including a loss masking approach for +noise-robust training. Code for reproducing model training and final +submissions is available at github.com/kuielab/sdx23. + +
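Loss masking for noise-robust training can take several forms; one simple variant, sketched below purely as an illustration (the chunking, threshold, and L1 choice are assumptions, not the report's exact recipe), drops the highest-error audio chunks from the loss so that corrupted reference segments do not dominate training.

```python
import torch

def masked_l1_loss(estimate, reference, mask_quantile=0.9, chunk=4096):
    """L1 loss where the highest-error chunks (likely corrupted or noisy
    reference segments) are excluded from the average."""
    est = estimate.reshape(-1, chunk)
    ref = reference.reshape(-1, chunk)
    per_chunk = (est - ref).abs().mean(dim=1)            # error per audio chunk
    keep = per_chunk <= torch.quantile(per_chunk, mask_quantile)
    return per_chunk[keep].mean()

est = torch.randn(8 * 4096)
ref = est + 0.01 * torch.randn_like(est)
ref[:4096] = torch.randn(4096)                           # simulate one noisy reference chunk
print(masked_l1_loss(est, ref).item())
```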
+
+ comment: 5 pages, 4 tables +
+
+
+
+
+ + ♻ ☆ Dynamic Storyboard Generation in an Engine-based Virtual Environment for + Video Production + + +
+ Amateurs working on mini-films and short-form videos usually spend lots of
+time and effort on the multi-round complicated process of setting and adjusting
+scenes, plots, and cameras to deliver satisfying video shots. We present
+Virtual Dynamic Storyboard (VDS) to allow users to storyboard shots in virtual
+environments, where the filming staff can easily test the settings of shots
+before the actual filming. VDS runs on a "propose-simulate-discriminate" mode:
+Given a formatted story script and a camera script as input, it generates
+several character animation and camera movement proposals following predefined
+story and cinematic rules to allow an off-the-shelf simulation engine to render
+videos. To pick up the top-quality dynamic storyboard from the candidates, we
+equip it with a shot ranking discriminator based on shot quality criteria
+learned from professional, manually created data. VDS is comprehensively
+validated via extensive experiments and user studies, demonstrating its
+efficiency, effectiveness, and great potential in assisting amateur video
+production.
+
+
+
+ comment: Project page: https://virtualfilmstudio.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 61 + +
+
+
+ + ☆ L-Eval: Instituting Standardized Evaluation for Long Context Language + Models + + +
+ Recently, there has been growing interest in extending the context length of
+instruction-following models in order to effectively process single-turn long
+input (e.g. summarizing a paper) and conversations with more extensive
+histories. While proprietary models such as GPT-4 and Claude have demonstrated
+considerable advancements in handling tens of thousands of tokens of context,
+open-sourced models are still in the early stages of experimentation. It also
+remains unclear whether developing these long context models can offer
+substantial gains on practical downstream tasks over retrieval-based methods or
+models simply trained on chunked contexts. To address this challenge, we
+propose to institute standardized evaluation for long context language models.
+Concretely, we develop L-Eval, which contains 411 long documents and over 2,000
+query-response pairs manually annotated and checked by the authors,
+encompassing areas such as law, finance, school lectures, lengthy
+conversations, news, long-form novels, and meetings. L-Eval also adopts diverse
+evaluation methods and instruction styles, enabling a more reliable assessment
+of Long Context Language Models (LCLMs). Our findings indicate that while
+open-source models typically lag behind their commercial counterparts, they
+still exhibit impressive performance. LLaMA2 achieves the best results (a 45\%
+win rate vs. turbo-16k) on open-ended tasks with only a 4k context length, and
+ChatGLM2 achieves the best results on closed-ended tasks with 8k input tokens.
+We release our new evaluation suite, code, and all generation results,
+including predictions from all open-sourced LCLMs, GPT4-32k, and Claude-100k,
+at {\url{https://github.com/OpenLMLab/LEval}}.
+
+
+
+
+
+
+ + ☆ Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot + Classification + + +
+ Recent work has shown that language models' (LMs) prompt-based learning +capabilities make them well suited for automating data labeling in domains +where manual annotation is expensive. The challenge is that while writing an +initial prompt is cheap, improving a prompt is costly -- practitioners often +require significant labeled data in order to evaluate the impact of prompt +modifications. Our work asks whether it is possible to improve prompt-based +learning without additional labeled data. We approach this problem by +attempting to modify the predictions of a prompt, rather than the prompt +itself. Our intuition is that accurate predictions should also be consistent: +samples which are similar under some feature representation should receive the +same prompt prediction. We propose Embroid, a method which computes multiple +representations of a dataset under different embedding functions, and uses the +consistency between the LM predictions for neighboring samples to identify +mispredictions. Embroid then uses these neighborhoods to create additional +predictions for each sample, and combines these predictions with a simple +latent variable graphical model in order to generate a final corrected +prediction. In addition to providing a theoretical analysis of Embroid, we +conduct a rigorous empirical evaluation across six different LMs and up to 95 +different tasks. We find that (1) Embroid substantially improves performance +over original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also +realizes improvements for more sophisticated prompting strategies (e.g., +chain-of-thought), and (3) can be specialized to domains like law through the +embedding functions. + +
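The core consistency idea can be sketched without the paper's latent variable model. In the simplified stand-in below (the embeddings, k, and the flip rule are illustrative assumptions), each sample's binary LM prediction is compared against the majority label of its nearest neighbours in several embedding spaces, and is flipped only when every space votes for the opposite label.

```python
import numpy as np

def smooth_predictions(preds, embedding_spaces, k=5):
    """Neighbourhood-vote smoothing: flip a prediction only when the k-NN
    majority label disagrees with it in every embedding space."""
    preds = np.asarray(preds)
    votes = []
    for emb in embedding_spaces:                          # one vote per representation
        dists = np.linalg.norm(emb[:, None, :] - emb[None, :, :], axis=-1)
        np.fill_diagonal(dists, np.inf)
        nn_idx = np.argsort(dists, axis=1)[:, :k]
        votes.append((preds[nn_idx].mean(axis=1) > 0.5).astype(int))
    votes = np.stack(votes)                               # (n_spaces, n_samples)
    flip_to_1 = (votes == 1).all(axis=0) & (preds == 0)
    flip_to_0 = (votes == 0).all(axis=0) & (preds == 1)
    return np.where(flip_to_1, 1, np.where(flip_to_0, 0, preds))

rng = np.random.default_rng(0)
emb_a, emb_b = rng.normal(size=(50, 16)), rng.normal(size=(50, 8))
noisy_preds = rng.integers(0, 2, size=50)                 # hypothetical LM predictions
print(smooth_predictions(noisy_preds, [emb_a, emb_b])[:10])
```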
+
+ comment: 38 pages, 22 figures, 8 tables +
+
+
+
+
+ + ☆ "It Felt Like Having a Second Mind": Investigating Human-AI + Co-creativity in Prewriting with Large Language Models SC + + +
+ Prewriting is the process of discovering and developing ideas before a first +draft, which requires divergent thinking and often implies unstructured +strategies such as diagramming, outlining, free-writing, etc. Although large +language models (LLMs) have been demonstrated to be useful for a variety of +tasks including creative writing, little is known about how users would +collaborate with LLMs to support prewriting. The preferred collaborative role +and initiative of LLMs during such a creativity process is also unclear. To +investigate human-LLM collaboration patterns and dynamics during prewriting, we +conducted a three-session qualitative study with 15 participants in two +creative tasks: story writing and slogan writing. The findings indicated that +during collaborative prewriting, there appears to be a three-stage iterative +Human-AI Co-creativity process that includes Ideation, Illumination, and +Implementation stages. This collaborative process champions the human in a +dominant role, in addition to mixed and shifting levels of initiative that +exist between humans and LLMs. This research also reports on collaboration +breakdowns that occur during this process, user perceptions of using existing +LLMs during Human-AI Co-creativity, and discusses design implications to +support this co-creativity process. + +
+
+ comment: Under review at CSCW after a Major Revision +
+
+
+
+
+ + ☆ Investigating the Factual Knowledge Boundary of Large Language Models + with Retrieval Augmentation + + +
+ Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require
+a substantial amount of factual knowledge and often rely on external
+information for assistance. Recently, large language models (LLMs) (e.g.,
+ChatGPT) have demonstrated impressive prowess in solving a wide range of tasks
+with world knowledge, including knowledge-intensive tasks. However, it remains
+unclear how well LLMs are able to perceive their factual knowledge boundaries,
+particularly how they behave when incorporating retrieval augmentation. In this
+study, we present an initial analysis of the factual knowledge boundaries of
+LLMs and how retrieval augmentation affects LLMs on open-domain QA.
+Specifically, we focus on three primary research questions and analyze them by
+examining the QA performance, a priori judgement, and a posteriori judgement of
+LLMs. We show evidence that LLMs possess unwavering confidence in their
+capability to answer questions and in the accuracy of their responses.
+Furthermore, retrieval augmentation proves to be an effective approach in
+enhancing LLMs' awareness of knowledge boundaries, thereby improving their
+judgemental abilities. Additionally, we also find that LLMs have a propensity
+to rely on the provided retrieval results when formulating answers, while the
+quality of these results significantly impacts their reliance. The code to
+reproduce this work is available at
+https://github.com/RUCAIBox/LLM-Knowledge-Boundary.
+
+
+
+
+
+
+ + ☆ Integrating Pretrained ASR and LM to Perform Sequence Generation for + Spoken Language Understanding INTERSPEECH 2023 + + +
+ There has been an increased interest in the integration of pretrained speech
+recognition (ASR) and language models (LM) into the SLU framework. However,
+prior methods often struggle with a vocabulary mismatch between the pretrained
+models, and the LM cannot be directly utilized as it diverges from the NLU
+formulation. In this study, we propose a three-pass end-to-end (E2E) SLU system
+that effectively integrates ASR and LM subnetworks into the SLU formulation for
+sequence generation tasks. In the first pass, our architecture predicts ASR
+transcripts using the ASR subnetwork. This is followed by the LM subnetwork,
+which makes an initial SLU prediction. Finally, in the third pass, the
+deliberation subnetwork conditions on representations from the ASR and LM
+subnetworks to make the final prediction. Our proposed three-pass SLU system
+shows improved performance over cascaded and E2E SLU models on two benchmark
+SLU datasets, SLURP and SLUE, especially on acoustically challenging
+utterances.
+
+
+
+ comment: Accepted at INTERSPEECH 2023 +
+
+
+
+
+ + ☆ MASR: Metadata Aware Speech Representation + + +
+ In recent years, speech representation learning has been constructed
+primarily as a self-supervised learning (SSL) task, using the raw audio signal
+alone, while ignoring the side-information that is often available for a given
+speech recording. In this paper, we propose MASR, a Metadata Aware Speech
+Representation learning framework, which addresses the aforementioned
+limitations. MASR enables the inclusion of multiple external knowledge sources
+to enhance the utilization of meta-data information. The external knowledge
+sources are incorporated in the form of sample-level pair-wise similarity
+matrices that are useful in a hard-mining loss. A key advantage of the MASR
+framework is that it can be combined with any choice of SSL method. Using MASR
+representations, we perform evaluations on several downstream tasks such as
+language identification, speech recognition and other non-semantic tasks such
+as speaker and emotion recognition. In these experiments, we illustrate
+significant performance improvements for the MASR over other established
+benchmarks. We perform a detailed analysis on the language identification task
+to provide insights on how the proposed loss function enables the
+representations to separate closely related languages.
+
+
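A sample-level pairwise similarity matrix built from metadata is straightforward to construct. The toy sketch below (the attribute names, values, and the equal weighting of sources are assumptions) shows the kind of target such a hard-mining loss could consume, without implementing the loss itself.

```python
import numpy as np

def metadata_similarity(meta_values):
    """Sample-level pairwise similarity matrix from one metadata source:
    1 where two recordings share the attribute (e.g. language), else 0."""
    meta = np.asarray(meta_values)
    return (meta[:, None] == meta[None, :]).astype(float)

langs    = ["en", "hi", "en", "ta", "hi", "en"]           # hypothetical metadata
speakers = ["s1", "s2", "s1", "s3", "s2", "s4"]
# Multiple external knowledge sources combined into one target matrix
# (equal weighting is an assumption made for illustration only).
sim = 0.5 * metadata_similarity(langs) + 0.5 * metadata_similarity(speakers)
print(sim)
```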
+
+
+
+
+ + ☆ Identical and Fraternal Twins: Fine-Grained Semantic Contrastive + Learning of Sentence Representations ECAI2023 + + +
+ Contrastive learning has significantly advanced the unsupervised learning of
+sentence representations. This approach clusters the augmented positive
+instance with the anchor instance to create a desired embedding space.
+However, relying solely on the contrastive objective can result in sub-optimal
+outcomes due to its inability to differentiate subtle semantic variations
+between positive pairs. Specifically, common data augmentation techniques
+frequently introduce semantic distortion, leading to a semantic margin between
+the positive pair, while the InfoNCE loss function overlooks this margin and
+prioritizes similarity maximization between positive pairs during training,
+leaving the trained model insensitive to subtle semantic differences. In this
+paper, we introduce a novel Identical and Fraternal Twins of Contrastive
+Learning (named IFTCL) framework, capable of simultaneously adapting to various
+positive pairs generated by different augmentation techniques. We propose a
+\textit{Twins Loss} to preserve the innate margin during training and promote
+the potential of data enhancement in order to overcome the sub-optimal issue.
+We also present proof-of-concept experiments combined with the contrastive
+objective to prove the validity of the proposed Twins Loss. Furthermore, we
+propose a hippocampus queue mechanism to restore and reuse the negative
+instances without additional calculation, which further enhances the efficiency
+and performance of the IFCL. We verify the IFCL framework on nine semantic
+textual similarity tasks with both English and Chinese datasets, and the
+experimental results show that IFCL outperforms state-of-the-art methods.
+
+
+
+ comment: This article has been accepted for publication in European Conference + on Artificial Intelligence (ECAI2023). 9 pages, 4 figures +
+
+
+
+
+ + ☆ MediaGPT : A Large Language Model Target Chinese Media + + +
+ The development of large language models (LLMs) has seen rapid progress in
+recent years. One of the most widely used LLMs is the Generative Pre-trained
+Transformer (GPT) series, which has been applied in various fields, including
+the media domain. However, in practical applications, the differences between
+the media's use cases and the general-purpose applications of LLMs have become
+increasingly apparent, especially for Chinese. As a result, there is a growing
+need to develop LLMs that are specifically tailored to the unique requirements
+of the media domain. In this paper, we present MediaGPT, a large language model
+trained on a variety of media data to address the practical needs of Chinese
+media. We have designed a diverse set of task instruction types to cater to the
+specific requirements of the domain. To further validate the effectiveness of
+our proposed LLM, we have constructed unique datasets that are tailored to the
+media domain and have also developed verification methods that are specifically
+designed for generative-type tasks. By doing so, we aim to bridge the gap
+between general-purpose LLMs and the requirements of the media domain, and to
+pave the way for more effective and efficient use of LLMs in this field. This
+paper aims to explore the challenges and opportunities of developing LLMs for
+media applications and to propose potential solutions for addressing these
+challenges.
+
+
+
+
+
+
+ + ☆ FLASK: Fine-grained Language Model Evaluation based on Alignment Skill + Sets + + +
+ Evaluation of Large Language Models (LLMs) is challenging because aligning to
+human values requires the composition of multiple skills and the required set
+of skills varies depending on the instruction. Recent studies have evaluated
+the performance of LLMs in two ways, (1) automatic evaluation on several
+independent benchmarks and (2) human- or machine-based evaluation giving an
+overall score to the response. However, both settings are coarse-grained
+evaluations, not considering the nature of user instructions that require
+instance-wise skill composition, which limits the interpretation of the true
+capabilities of LLMs. In this paper, we introduce FLASK (Fine-grained Language
+Model Evaluation based on Alignment SKill Sets), a fine-grained evaluation
+protocol that can be used for both model-based and human-based evaluation and
+decomposes coarse-level scoring to an instance-wise skill-set level.
+Specifically, we define 12 fine-grained skills needed for LLMs to follow
+open-ended user instructions and construct an evaluation set by allocating a
+set of skills for each instance. Additionally, by annotating the target domains
+and difficulty level for each instance, FLASK provides a holistic view with a
+comprehensive analysis of a model's performance depending on skill, domain, and
+difficulty. Using FLASK, we compare multiple open-sourced and proprietary LLMs
+and observe highly correlated findings between model-based and human-based
+evaluations. FLASK enables developers to more accurately measure the model
+performance and how it can be improved by analyzing factors that make LLMs
+proficient in particular skills. For practitioners, FLASK can be used to
+recommend suitable models for particular situations through comprehensive
+comparison among various LLMs. We release the evaluation data and code
+implementation at https://github.com/kaistAI/FLASK.
+
+
+
+
+
+
+ + ☆ FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with + Human Feedback + + +
+ Captions are crucial for understanding scientific visualizations and
+documents. Existing captioning methods for scientific figures rely on
+figure-caption pairs extracted from documents for training, many of which fall
+short with respect to metrics like helpfulness, explainability, and
+visual-descriptiveness [15], leading to generated captions being misaligned
+with reader preferences. To enable the generation of high-quality figure
+captions, we introduce FigCaps-HF, a new framework for figure-caption
+generation that can incorporate domain expert feedback in generating captions
+optimized for reader preferences. Our framework comprises 1) an automatic
+method for evaluating quality of figure-caption pairs, and 2) a novel
+reinforcement learning with human feedback (RLHF) method to optimize a
+generative figure-to-caption model for reader preferences. We demonstrate the
+effectiveness of our simple learning framework by improving performance over
+standard fine-tuning across different types of models. In particular, when
+using BLIP as the base model, our RLHF framework achieves a mean gain of 35.7%,
+16.9%, and 9% in ROUGE, BLEU, and Meteor, respectively. Finally, we release a
+large-scale benchmark dataset with human feedback on figure-caption pairs to
+enable further evaluation and development of RLHF techniques for this problem.
+
+
+
+ comment: 19 pages, 4 figures. Benchmark Documentation: + https://figcapshf.github.io/ +
+
+
+
+
+ + ☆ Divide & Bind Your Attention for Improved Generative Semantic Nursing + + +
+ Emerging large-scale text-to-image generative models, e.g., Stable Diffusion +(SD), have exhibited overwhelming results with high fidelity. Despite the +magnificent progress, current state-of-the-art models still struggle to +generate images fully adhering to the input prompt. Prior work, Attend & +Excite, has introduced the concept of Generative Semantic Nursing (GSN), aiming +to optimize cross-attention during inference time to better incorporate the +semantics. It demonstrates promising results in generating simple prompts, +e.g., ``a cat and a dog''. However, its efficacy declines when dealing with +more complex prompts, and it does not explicitly address the problem of +improper attribute binding. To address the challenges posed by complex prompts +or scenarios involving multiple entities and to achieve improved attribute +binding, we propose Divide & Bind. We introduce two novel loss objectives for +GSN: a novel attendance loss and a binding loss. Our approach stands out in its +ability to faithfully synthesize desired objects with improved attribute +alignment from complex prompts and exhibits superior performance across +multiple evaluation benchmarks. More videos and updates can be found on the +project page \url{https://sites.google.com/view/divide-and-bind}. + +
+
+ comment: Project page: \url{https://sites.google.com/view/divide-and-bind} +
+
+
+
+
+ + ☆ Yelp Reviews and Food Types: A Comparative Analysis of Ratings, + Sentiments, and Topics + + +
+ This study examines the relationship between Yelp reviews and food types,
+investigating how ratings, sentiments, and topics vary across different types
+of food. Specifically, we analyze how ratings and sentiments of reviews vary
+across food types, cluster food types based on ratings and sentiments, infer
+review topics using machine learning models, and compare topic distributions
+among different food types. Our analyses reveal that some food types have
+similar rating, sentiment, and topic distributions, while others have distinct
+patterns. We identify four clusters of food types based on ratings and
+sentiments and find that reviewers tend to focus on different topics when
+reviewing certain food types. These findings have important implications for
+understanding user behavior and cultural influence on digital media platforms
+and promoting cross-cultural understanding and appreciation.
+
+
+
+
+
+
+ + ☆ Cross-Corpus Multilingual Speech Emotion Recognition: Amharic vs. Other + Languages + + +
+ In a conventional Speech emotion recognition (SER) task, a classifier for a +given language is trained on a pre-existing dataset for that same language. +However, where training data for a language does not exist, data from other +languages can be used instead. We experiment with cross-lingual and +multilingual SER, working with Amharic, English, German and URDU. For Amharic, +we use our own publicly-available Amharic Speech Emotion Dataset (ASED). For +English, German and Urdu we use the existing RAVDESS, EMO-DB and URDU datasets. +We followed previous research in mapping labels for all datasets to just two +classes, positive and negative. Thus we can compare performance on different +languages directly, and combine languages for training and testing. In +Experiment 1, monolingual SER trials were carried out using three classifiers, +AlexNet, VGGE (a proposed variant of VGG), and ResNet50. Results averaged for +the three models were very similar for ASED and RAVDESS, suggesting that +Amharic and English SER are equally difficult. Similarly, German SER is more +difficult, and Urdu SER is easier. In Experiment 2, we trained on one language +and tested on another, in both directions for each pair: Amharic<->German, +Amharic<->English, and Amharic<->Urdu. Results with Amharic as target suggested +that using English or German as source will give the best result. In Experiment +3, we trained on several non-Amharic languages and then tested on Amharic. The +best accuracy obtained was several percent greater than the best accuracy in +Experiment 2, suggesting that a better result can be obtained when using two or +three non-Amharic languages for training than when using just one non-Amharic +language. Overall, the results suggest that cross-lingual and multilingual +training can be an effective strategy for training a SER classifier when +resources for a language are scarce. + +
+
+ comment: 16 pages, 9 tables, 5 figures +
+
+
+
+
+ + ☆ Meta-Transformer: A Unified Framework for Multimodal Learning + + +
+ Multimodal learning aims to build models that can process and relate +information from multiple modalities. Despite years of development in this +field, it still remains challenging to design a unified network for processing +various modalities ($\textit{e.g.}$ natural language, 2D images, 3D point +clouds, audio, video, time series, tabular data) due to the inherent gaps among +them. In this work, we propose a framework, named Meta-Transformer, that +leverages a $\textbf{frozen}$ encoder to perform multimodal perception without +any paired multimodal training data. In Meta-Transformer, the raw input data +from various modalities are mapped into a shared token space, allowing a +subsequent encoder with frozen parameters to extract high-level semantic +features of the input data. Composed of three main components: a unified data +tokenizer, a modality-shared encoder, and task-specific heads for downstream +tasks, Meta-Transformer is the first framework to perform unified learning +across 12 modalities with unpaired data. Experiments on different benchmarks +reveal that Meta-Transformer can handle a wide range of tasks including +fundamental perception (text, image, point cloud, audio, video), practical +application (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph, +tabular, and time-series). Meta-Transformer indicates a promising future for +developing unified multimodal intelligence with transformers. Code will be +available at https://github.com/invictus717/MetaTransformer + +
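The overall recipe, per-modality tokenizers into a shared token space, a frozen shared encoder, and small trainable task heads, can be sketched in a few lines of PyTorch. The skeleton below is a rough illustration only: the tokenizer designs, layer sizes, and head are placeholders and do not reflect the actual Meta-Transformer architecture.

```python
import torch
import torch.nn as nn

class MetaTransformerSketch(nn.Module):
    """Per-modality tokenizers project raw inputs into a shared token space, a
    frozen shared encoder extracts features, and a lightweight head is trained
    on top. All sizes are placeholders."""
    def __init__(self, dim=256, num_classes=10):
        super().__init__()
        self.tokenizers = nn.ModuleDict({
            "text":  nn.Embedding(30000, dim),            # token ids -> shared space
            "image": nn.Linear(16 * 16 * 3, dim),         # flattened patches -> shared space
        })
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=4)
        for p in self.encoder.parameters():               # the shared encoder stays frozen
            p.requires_grad = False
        self.head = nn.Linear(dim, num_classes)           # only tokenizers + head are trained

    def forward(self, x, modality):
        tokens = self.tokenizers[modality](x)
        return self.head(self.encoder(tokens).mean(dim=1))

model = MetaTransformerSketch()
patches = torch.randn(2, 196, 16 * 16 * 3)                # a batch of image patch sequences
print(model(patches, "image").shape)                       # torch.Size([2, 10])
```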
+
+ comment: Project website: https://kxgong.github.io/meta_transformer/ +
+
+
+
+
+ + ☆ Layer-wise Representation Fusion for Compositional Generalization + + +
+ Despite successes across a broad range of applications, the solutions
+constructed by sequence-to-sequence models are argued to be less compositional
+than human-like generalization. There is mounting evidence that one of the
+reasons hindering compositional generalization is that the representations of
+the encoder and decoder uppermost layers are entangled. In other words, the
+syntactic and semantic representations of sequences are twisted
+inappropriately. However, most previous studies mainly concentrate on enhancing
+token-level semantic information to alleviate the representation entanglement
+problem, rather than composing and using the syntactic and semantic
+representations of sequences appropriately as humans do. In addition, we
+explain why the entanglement problem exists from the perspective of recent
+studies about training deeper Transformers, mainly owing to the ``shallow''
+residual connections and their simple, one-step operations, which fail to fuse
+previous layers' information effectively. Starting from this finding and
+inspired by humans' strategies, we propose \textsc{FuSion} (\textbf{Fu}sing
+\textbf{S}yntactic and Semant\textbf{i}c Representati\textbf{on}s), an
+extension to sequence-to-sequence models that learns to fuse previous layers'
+information back into the encoding and decoding process appropriately by
+introducing a \emph{fuse-attention module} at each encoder and decoder layer.
+\textsc{FuSion} achieves competitive and even \textbf{state-of-the-art} results
+on two realistic benchmarks, which empirically demonstrates the effectiveness
+of our proposal.
+
+
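One way to picture a fuse-attention module is as the current layer attending, position by position, over the stack of all earlier layers' representations, so that lower-level (more syntactic) information can be folded back in. The PyTorch sketch below is a minimal reading of that idea; the dimensions, normalization, and placement are illustrative assumptions rather than the paper's design.

```python
import torch
import torch.nn as nn

class FuseAttention(nn.Module):
    """Each position of the current layer attends over the same position's
    representations from all previous layers, then the result is added back."""
    def __init__(self, dim=512, nhead=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, nhead, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, current, previous_layers):
        # previous_layers: list of (batch, seq, dim) tensors from earlier layers
        b, s, d = current.shape
        mem = torch.stack(previous_layers, dim=2).reshape(b * s, len(previous_layers), d)
        q = current.reshape(b * s, 1, d)
        fused, _ = self.attn(q, mem, mem)                  # attend across layer depth
        return self.norm(current + fused.reshape(b, s, d))

layer_outs = [torch.randn(2, 7, 512) for _ in range(3)]
print(FuseAttention()(torch.randn(2, 7, 512), layer_outs).shape)   # torch.Size([2, 7, 512])
```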
+
+ comment: work in progress. arXiv admin note: substantial text overlap with + arXiv:2305.12169 +
+
+
+
+
+ + ☆ Extreme Multi-Label Skill Extraction Training using Large Language + Models ECML-PKDD 2023 + + +
+ Online job ads serve as a valuable source of information for skill
+requirements, playing a crucial role in labor market analysis and e-recruitment
+processes. Since such ads are typically formatted in free text, natural
+language processing (NLP) technologies are required to automatically process
+them. We specifically focus on the task of detecting skills (mentioned
+literally, or implicitly described) and linking them to a large skill ontology,
+making it a challenging case of extreme multi-label classification (XMLC).
+Given that no sizable labeled (training) dataset is available for this specific
+XMLC task, we propose techniques to leverage general Large Language Models
+(LLMs). We describe a cost-effective approach to generate an accurate, fully
+synthetic labeled dataset for skill extraction, and present a contrastive
+learning strategy that proves effective in the task. Our results across three
+skill extraction benchmarks show a consistent increase of 15 to 25 percentage
+points in \textit{R-Precision@5} compared to previously published results that
+relied solely on distant supervision through literal matches.
+
+
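For reference, the reported metric can be computed as follows under one common definition of R-Precision@k (hits among the top-k predictions divided by min(k, number of gold skills)); the skill names in the example are made up, and the paper's exact evaluation script may differ.

```python
def r_precision_at_k(predicted_skills, gold_skills, k=5):
    """R-Precision@k for one job ad: correct labels among the top-k ranked
    predictions, divided by min(k, |gold|)."""
    top_k = predicted_skills[:k]
    denom = min(k, len(gold_skills)) or 1
    return len(set(top_k) & set(gold_skills)) / denom

preds = ["python", "sql", "teamwork", "docker", "excel"]   # ranked predictions
gold = {"python", "docker", "communication"}
print(r_precision_at_k(preds, gold))                       # 2 / 3 ≈ 0.67
```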
+
+ comment: Accepted to the International workshop on AI for Human Resources and + Public Employment Services (AI4HR&PES) as part of ECML-PKDD 2023 +
+
+
+
+
+ + ☆ Vesper: A Compact and Effective Pretrained Model for Speech Emotion + Recognition + + +
+ This paper presents a paradigm that adapts general large-scale pretrained
+models (PTMs) to the speech emotion recognition task. Although PTMs shed new
+light on artificial general intelligence, they are constructed with general
+tasks in mind, and thus, their efficacy for specific tasks can be further
+improved. Additionally, employing PTMs in practical applications can be
+challenging due to their considerable size. These limitations spawn another
+research direction, namely, optimizing large-scale PTMs for specific tasks to
+generate task-specific PTMs that are both compact and effective. In this paper,
+we focus on the speech emotion recognition task and propose an improved
+emotion-specific pretrained encoder called Vesper. Vesper is pretrained on a
+speech dataset based on WavLM and takes into account emotional characteristics.
+To enhance sensitivity to emotional information, Vesper employs an
+emotion-guided masking strategy to identify the regions that need masking.
+Subsequently, Vesper employs hierarchical and cross-layer self-supervision to
+improve its ability to capture acoustic and semantic representations, both of
+which are crucial for emotion recognition. Experimental results on the IEMOCAP,
+MELD, and CREMA-D datasets demonstrate that Vesper with 4 layers outperforms
+WavLM Base with 12 layers, and the performance of Vesper with 12 layers
+surpasses that of WavLM Large with 24 layers.
+
+
+
+ comment: 13 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ Exploring Perspectives on the Impact of Artificial Intelligence on the + Creativity of Knowledge Work: Beyond Mechanised Plagiarism and Stochastic + Parrots + + +
+ Artificial Intelligence (AI), and in particular generative models, are +transformative tools for knowledge work. They problematise notions of +creativity, originality, plagiarism, the attribution of credit, and copyright +ownership. Critics of generative models emphasise the reliance on large amounts +of training data, and view the output of these models as no more than +randomised plagiarism, remix, or collage of the source data. On these grounds, +many have argued for stronger regulations on the deployment, use, and +attribution of the output of these models. However, these issues are not new or +unique to artificial intelligence. In this position paper, using examples from +literary criticism, the history of art, and copyright law, I show how +creativity and originality resist definition as a notatable or +information-theoretic property of an object, and instead can be seen as the +property of a process, an author, or a viewer. Further alternative views hold +that all creative work is essentially reuse (mostly without attribution), or +that randomness itself can be creative. I suggest that creativity is ultimately +defined by communities of creators and receivers, and the deemed sources of +creativity in a workflow often depend on which parts of the workflow can be +automated. Using examples from recent studies of AI in creative knowledge work, +I suggest that AI shifts knowledge work from material production to critical +integration. This position paper aims to begin a conversation around a more +nuanced approach to the problems of creativity and credit assignment for +generative models, one which more fully recognises the importance of the +creative and curatorial voice of the users of these models and moves away from +simpler notational or information-theoretic views. + +
+
+ comment: Advait Sarkar. 2023. Exploring Perspectives on the Impact of + Artificial Intelligence on the Creativity of Knowledge Work Beyond Mechanised + Plagiarism and Stochastic Parrots. In Annual Symposium on Human-Computer + Interaction for Work 2023 (CHIWORK 2023), June 13-16, 2023, Oldenburg, + Germany. ACM, New York, NY, USA, 17 pages +
+
+
+
+
+ + ☆ Large language models shape and are shaped by society: A survey of arXiv + publication patterns + + +
+ There has been a steep recent increase in the number of large language model +(LLM) papers, producing a dramatic shift in the scientific landscape which +remains largely undocumented through bibliometric analysis. Here, we analyze +388K papers posted on the CS and Stat arXivs, focusing on changes in +publication patterns in 2023 vs. 2018-2022. We analyze how the proportion of +LLM papers is increasing; the LLM-related topics receiving the most attention; +the authors writing LLM papers; how authors' research topics correlate with +their backgrounds; the factors distinguishing highly cited LLM papers; and the +patterns of international collaboration. We show that LLM research increasingly +focuses on societal impacts: there has been an 18x increase in the proportion +of LLM-related papers on the Computers and Society sub-arXiv, and authors newly +publishing on LLMs are more likely to focus on applications and societal +impacts than more experienced authors. LLM research is also shaped by social +dynamics: we document gender and academic/industry disparities in the topics +LLM authors focus on, and a US/China schism in the collaboration network. +Overall, our analysis documents the profound ways in which LLM research both +shapes and is shaped by society, attesting to the necessity of sociotechnical +lenses. + +
+
+ comment: Working paper +
+
+
+
+
+ + ☆ A Dataset and Strong Baselines for Classification of Czech News Texts + + +
+ Pre-trained models for Czech Natural Language Processing are often evaluated +on purely linguistic tasks (POS tagging, parsing, NER) and relatively simple +classification tasks such as sentiment classification or article classification +from a single news source. As an alternative, we present +CZEch~NEws~Classification~dataset (CZE-NEC), one of the largest Czech +classification datasets, composed of news articles from various sources +spanning over twenty years, which allows a more rigorous evaluation of such +models. We define four classification tasks: news source, news category, +inferred author's gender, and day of the week. To verify the task difficulty, +we conducted a human evaluation, which revealed that human performance lags +behind strong machine-learning baselines built upon pre-trained transformer +models. Furthermore, we show that language-specific pre-trained encoder +analysis outperforms selected commercially available large-scale generative +language models. + +
+
+ comment: 12 pages, Accepted to Text, Speech and Dialogue (TSD) 2023 +
+
+
+
+
+ + ☆ Exploring the Landscape of Natural Language Processing Research + + +
+ As an efficient approach to understand, generate, and process natural +language texts, research in natural language processing (NLP) has exhibited a +rapid spread and wide adoption in recent years. Given the increasing amount of +research work in this area, several NLP-related approaches have been surveyed +in the research community. However, a comprehensive study that categorizes +established topics, identifies trends, and outlines areas for future research +remains absent to this day. Contributing to closing this gap, we have +systematically classified and analyzed research papers included in the ACL +Anthology. As a result, we present a structured overview of the research +landscape, provide a taxonomy of fields-of-study in NLP, analyze recent +developments in NLP, summarize our findings, and highlight directions for +future work. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ☆ SciBench: Evaluating College-Level Scientific Problem-Solving Abilities + of Large Language Models + + +
+ Recent advances in large language models (LLMs) have demonstrated notable +progress on many mathematical benchmarks. However, most of these benchmarks +only feature problems grounded in junior and senior high school subjects, +contain only multiple-choice questions, and are confined to a limited scope of +elementary arithmetic operations. To address these issues, this paper +introduces an expansive benchmark suite SciBench that aims to systematically +examine the reasoning capabilities required for complex scientific problem +solving. SciBench contains two carefully curated datasets: an open set +featuring a range of collegiate-level scientific problems drawn from +mathematics, chemistry, and physics textbooks, and a closed set comprising +problems from undergraduate-level exams in computer science and mathematics. +Based on the two datasets, we conduct an in-depth benchmark study of two +representative LLMs with various prompting strategies. The results reveal that +current LLMs fall short of delivering satisfactory performance, with an overall +score of merely 35.80%. Furthermore, through a detailed user study, we +categorize the errors made by LLMs into ten problem-solving abilities. Our +analysis indicates that no single prompting strategy significantly outperforms +others and some strategies that demonstrate improvements in certain +problem-solving skills result in declines in other skills. We envision that +SciBench will catalyze further developments in the reasoning abilities of LLMs, +thereby ultimately contributing to scientific research and discovery. + +
+
+ comment: Work in progress, 18 pages +
+
+
+
+
+ + ☆ Generative Language Models on Nucleotide Sequences of Human Genes + + +
+ Language models, primarily transformer-based ones, have achieved colossal
+success in NLP. To be more precise, studies like BERT in NLU and works such as
+GPT-3 for NLG are very crucial. DNA sequences are very close to natural
+language in terms of structure, so for the DNA-related bioinformatics domain,
+discriminative models, like DNABert, exist. Yet, the generative side of the
+coin is mainly unexplored to the best of our knowledge. Consequently, we
+focused on developing an autoregressive generative language model like GPT-3
+for DNA sequences. Because working with whole DNA sequences is challenging
+without substantial computational resources, we decided to carry out our study
+on a smaller scale, focusing on nucleotide sequences of human genes, unique
+parts in DNA with specific functionalities, instead of the whole DNA. This
+decision did not change the problem structure much, because both DNA and genes
+can be seen as 1D sequences consisting of four different nucleotides without
+losing much information and without oversimplifying. First of all, we
+systematically examined an almost entirely unexplored problem and observed that
+RNNs performed the best while simple techniques like N-grams were also
+promising. Another benefit was learning how to work with generative models on
+languages we do not understand, unlike natural language. We also observed how
+essential it is to use real-life tasks beyond classical metrics such as
+perplexity. Furthermore, we examined whether the data-hungry nature of these
+models can be mitigated by selecting a language with a minimal vocabulary size
+(four, owing to the four different types of nucleotides). However, we observed
+that this did not substantially change the amount of data needed.
+
+
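As a flavor of the simplest baseline family mentioned above, the snippet below trains a tiny add-alpha-smoothed n-gram model over the four-letter nucleotide alphabet on made-up toy sequences; it is illustrative only and unrelated to the paper's actual models or data.

```python
from collections import Counter, defaultdict

def train_ngram(sequences, n=3):
    """Count-based n-gram model over the 4-letter nucleotide alphabet."""
    counts = defaultdict(Counter)
    for seq in sequences:
        for i in range(len(seq) - n + 1):
            context, nxt = seq[i:i + n - 1], seq[i + n - 1]
            counts[context][nxt] += 1
    return counts

def next_probs(counts, context, alphabet="ACGT", alpha=1.0):
    """Add-alpha smoothed distribution over the next nucleotide."""
    c = counts.get(context, Counter())
    total = sum(c.values()) + alpha * len(alphabet)
    return {a: (c[a] + alpha) / total for a in alphabet}

genes = ["ATGGCGTACGT", "ATGCCGTTACG", "ATGGGGTACCA"]   # toy stand-ins for gene sequences
model = train_ngram(genes, n=3)
print(next_probs(model, "AT"))                          # P(next nucleotide | "AT")
```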
+
+
+
+
+ + ☆ Multi-Method Self-Training: Improving Code Generation With Text, And + Vice Versa + + +
+ Large Language Models have many methods for solving the same problem. This +introduces novel strengths (different methods may work well for different +problems) and weaknesses (it may be difficult for users to know which method to +use). In this paper, we introduce Multi-Method Self-Training (MMST), where one +method is trained on the filtered outputs of another, allowing us to augment +the strengths and ameliorate the weaknesses of each method. Using a 176B +parameter model trained on both language and code, we show that MMST can 1) +improve the less performant method (up to 30%) making the model easier to use, +2) improve the more performant method (up to 32.2%) making the model more +performant, and 3) improve the performance of related but distinct tasks (up to +10.3%) by improving the ability of the model to generate rationales. We then +conduct ablation analyses to explore why MMST works. We show that MMST +generates more data than traditional self-training, but the improvement in +performance is driven by the use of multiple methods. We also analyze +prompt-engineering and anti-correlated performance between methods as means of +making MMST more effective. We hope the evidence from our paper motivates +machine learning researchers to explore ways in which advances in language +models allow for new forms of training. + +
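In outline, multi-method self-training builds training data for one method from the verified outputs of another. The toy sketch below captures that loop with made-up stand-ins for the generator and the correctness check (arithmetic expressions in place of real code or text tasks); it is not the paper's pipeline.

```python
def build_mmst_data(problems, generate, check):
    """Keep, per problem, one generated candidate that passes an automatic
    correctness filter, and reuse it as a training target for another method."""
    kept = []
    for prob in problems:
        for candidate in generate(prob, n_samples=4):
            if check(prob, candidate):
                kept.append({"prompt": prob, "target": candidate})
                break                                     # one verified sample per problem
    return kept

# Toy stand-ins: "generation" proposes answers, the checker evaluates the expression.
probs = ["2*3", "10-4"]
gen = lambda p, n_samples: [f"{p} = {eval(p)}", f"{p} = 0"]   # eval is fine for this toy
chk = lambda p, cand: cand.endswith(str(eval(p)))
print(build_mmst_data(probs, gen, chk))
```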
+
+ comment: 23 pages, 3 figures +
+
+
+
+
+ + ☆ A Deep Dive into the Disparity of Word Error Rates Across Thousands of + NPTEL MOOC Videos + + +
+ Automatic speech recognition (ASR) systems are designed to transcribe spoken
+language into written text and find utility in a variety of applications
+including voice assistants and transcription services. However, it has been
+observed that state-of-the-art ASR systems, which deliver impressive benchmark
+results, struggle with speakers of certain regions or demographics due to
+variation in their speech properties. In this work, we describe the curation of
+a massive speech dataset of 8740 hours consisting of $\sim9.8$K technical
+lectures in the English language along with their transcripts delivered by
+instructors representing various parts of Indian demography. The dataset is
+sourced from the very popular NPTEL MOOC platform. We use the curated dataset
+to measure the existing disparity in YouTube Automatic Captions and OpenAI
+Whisper model performance across the diverse demographic traits of speakers in
+India. While there exists disparity due to gender, native region, age and
+speech rate of speakers, disparity based on caste is non-existent. We also
+observe statistically significant disparity across the disciplines of the
+lectures. These results indicate the need for more inclusive and robust ASR
+systems and for more representative datasets for disparity evaluation.
+
+
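Disparity analyses of this kind reduce to computing word error rate per utterance and aggregating by speaker attributes. The sketch below implements the standard edit-distance WER and averages it per group over two made-up records; the dataset fields and group labels are hypothetical.

```python
def wer(ref_words, hyp_words):
    """Word error rate via edit distance (substitutions + insertions + deletions)."""
    d = [[0] * (len(hyp_words) + 1) for _ in range(len(ref_words) + 1)]
    for i in range(len(ref_words) + 1):
        d[i][0] = i
    for j in range(len(hyp_words) + 1):
        d[0][j] = j
    for i in range(1, len(ref_words) + 1):
        for j in range(1, len(hyp_words) + 1):
            cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[-1][-1] / max(len(ref_words), 1)

# Hypothetical per-utterance records: (demographic group, reference, ASR hypothesis).
records = [
    ("group_a", "the fourier transform is linear", "the fourier transform is linear"),
    ("group_b", "the fourier transform is linear", "the four year transform is linear"),
]
by_group = {}
for group, ref, hyp in records:
    by_group.setdefault(group, []).append(wer(ref.split(), hyp.split()))
for group, scores in by_group.items():
    print(group, sum(scores) / len(scores))
```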
+
+
+
+
+ + ☆ Instruction-following Evaluation through Verbalizer Manipulation + + +
+ While instruction-tuned models have shown remarkable success in various
+natural language processing tasks, accurately evaluating their ability to
+follow instructions remains challenging. Existing benchmarks primarily focus on
+common instructions that align well with what the model learned during
+training. However, proficiency in responding to these instructions does not
+necessarily imply strong ability in instruction following. In this paper, we
+propose a novel instruction-following evaluation protocol called verbalizer
+manipulation. It instructs the model to verbalize the task label with words
+aligning with model priors to different extents, adopting verbalizers from
+highly aligned (e.g., outputting ``positive'' for positive sentiment) to
+minimally aligned (e.g., outputting ``negative'' for positive sentiment).
+Verbalizer manipulation can be seamlessly integrated with any classification
+benchmark to examine the model's reliance on priors and its ability to override
+them to accurately follow the instructions. We conduct a comprehensive
+evaluation of four major model families across nine datasets, employing twelve
+sets of verbalizers for each of them. We observe that the instruction-following
+abilities of models, across different families and scales, are significantly
+distinguished by their performance on less natural verbalizers. Even the
+strongest GPT-4 model struggles to perform better than random guessing on the
+most challenging verbalizer, emphasizing the need for continued advancements to
+improve their instruction-following abilities.
+
+
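The manipulation itself can be pictured as a family of label-word mappings, from aligned to flipped, that are swapped into the same classification task. The snippet below shows such mappings and a scoring helper on made-up model outputs; the actual verbalizer sets used in the paper are not reproduced here.

```python
# Three verbalizer settings for a binary sentiment task, from highly aligned
# with typical model priors to minimally aligned (label words flipped).
verbalizer_sets = {
    "aligned": {1: "positive", 0: "negative"},
    "neutral": {1: "foo",      0: "bar"},
    "flipped": {1: "negative", 0: "positive"},
}

def accuracy_under_verbalizer(model_outputs, gold_labels, verbalizer):
    """Score raw model output strings when the task is instructed to answer
    with the given verbalizer's words."""
    word_to_label = {w: y for y, w in verbalizer.items()}
    preds = [word_to_label.get(o.strip().lower(), -1) for o in model_outputs]
    return sum(p == y for p, y in zip(preds, gold_labels)) / len(gold_labels)

gold = [1, 0, 1, 1]
outputs_flipped = ["negative", "positive", "negative", "positive"]   # hypothetical run
print(accuracy_under_verbalizer(outputs_flipped, gold, verbalizer_sets["flipped"]))  # 0.75
```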
+
+
+
+
+ + ☆ Dynamic Large Language Models on Blockchains + + +
+ Training and deploying large language models requires a large amount of
+computational resources because the language models contain billions of
+parameters and the text has thousands of tokens. Another problem is that large
+language models are static: they are fixed after the training process. To
+tackle these issues, in this paper, we propose to train and deploy a dynamic
+large language model on blockchains, which have high computation performance
+and are distributed across a network of computers. A blockchain is a secure,
+decentralized, and transparent system that allows for the creation of a
+tamper-proof ledger for transactions without the need for intermediaries. The
+dynamic large language models can continuously learn from the user input after
+the training process. Our method provides a new way to develop large language
+models and also sheds light on next-generation artificial intelligence
+systems.
+
+
+
+
+
+
+ + ☆ Gender-tuning: Empowering Fine-tuning for Debiasing Pre-trained Language + Models + + +
+ Recent studies have revealed that the widely-used Pre-trained Language Models
+(PLMs) propagate societal biases from the large unmoderated pre-training
+corpora. Existing solutions require additional debiasing training processes and
+datasets, which are resource-intensive and costly. Furthermore, these methods
+hurt the PLMs' performance on downstream tasks. In this study, we propose
+Gender-tuning, which debiases the PLMs through fine-tuning on downstream tasks'
+datasets. To this end, Gender-tuning integrates Masked Language Modeling (MLM)
+training objectives into the fine-tuning process. Comprehensive experiments
+show that Gender-tuning outperforms the state-of-the-art baselines in terms of
+average gender bias scores in PLMs while improving PLMs' performance on
+downstream tasks solely using the downstream tasks' dataset. Also,
+Gender-tuning is a deployable debiasing tool for any PLM that works with
+original fine-tuning.
+
+
+
+
+
+
+ + ☆ Building Socio-culturally Inclusive Stereotype Resources with Community + Engagement + + +
+ With rapid development and deployment of generative language models in global +settings, there is an urgent need to also scale our measurements of harm, not +just in the number and types of harms covered, but also how well they account +for local cultural contexts, including marginalized identities and the social +biases experienced by them. Current evaluation paradigms are limited in their +abilities to address this, as they are not representative of diverse, locally +situated but global, socio-cultural perspectives. It is imperative that our +evaluation resources are enhanced and calibrated by including people and +experiences from different cultures and societies worldwide, in order to +prevent gross underestimations or skews in measurements of harm. In this work, +we demonstrate a socio-culturally aware expansion of evaluation resources in +the Indian societal context, specifically for the harm of stereotyping. We +devise a community engaged effort to build a resource which contains +stereotypes for axes of disparity that are uniquely present in India. The +resultant resource increases the number of stereotypes known for and in the +Indian context by over 1000 stereotypes across many unique identities. We also +demonstrate the utility and effectiveness of such expanded resources for +evaluations of language models. CONTENT WARNING: This paper contains examples +of stereotypes that may be offensive. + +
+
+
+
+
+ + ☆ IvyGPT: InteractiVe Chinese pathwaY language model in medical domain + + +
+ General large language models (LLMs) such as ChatGPT have shown remarkable +success. However, such LLMs have not been widely adopted for medical purposes, +due to poor accuracy and inability to provide medical advice. We propose +IvyGPT, an LLM based on LLaMA that is trained and fine-tuned with high-quality +medical question-answer (QA) instances and Reinforcement Learning from Human +Feedback (RLHF). After supervised fine-tuning, IvyGPT has good multi-turn +conversation capabilities, but it cannot perform like a doctor in other +aspects, such as comprehensive diagnosis. Through RLHF, IvyGPT can output +richer diagnosis and treatment answers that are closer to human. In the +training, we used QLoRA to train 33 billion parameters on a small number of +NVIDIA A100 (80GB) GPUs. Experimental results show that IvyGPT has outperformed +other medical GPT models. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ General Debiasing for Multimodal Sentiment Analysis + + +
+ Existing work on Multimodal Sentiment Analysis (MSA) utilizes multimodal
+information for prediction yet unavoidably suffers from fitting the spurious
+correlations between multimodal features and sentiment labels. For example, if
+most videos with a blue background have positive labels in a dataset, the model
+will rely on such correlations for prediction, while ``blue background'' is not
+a sentiment-related feature. To address this problem, we define a general
+debiasing MSA task, which aims to enhance the Out-Of-Distribution (OOD)
+generalization ability of MSA models by reducing their reliance on spurious
+correlations. To this end, we propose a general debiasing framework based on
+Inverse Probability Weighting (IPW), which adaptively assigns small weights to
+the samples with larger bias (i.e., more severe spurious correlations). The key
+to this debiasing framework is to estimate the bias of each sample, which is
+achieved by two steps: 1) disentangling the robust features and biased features
+in each modality, and 2) utilizing the biased features to estimate the bias.
+Finally, we employ IPW to reduce the effects of large-biased samples,
+facilitating robust feature learning for sentiment prediction. To examine the
+model's generalization ability, we keep the original testing sets on two
+benchmarks and additionally construct multiple unimodal and multimodal OOD
+testing sets. The empirical results demonstrate the superior generalization
+ability of our proposed framework. We have released the code and data to
+facilitate the reproduction.
+
+
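The weighting step can be sketched on its own: given a bias estimate per sample, the per-sample loss is down-weighted in inverse proportion to it. The PyTorch snippet below illustrates that step only; the bias scores are made up, and the paper's feature-disentanglement estimator is not implemented.

```python
import torch

def ipw_weighted_loss(logits, labels, bias_scores):
    """Per-sample cross entropy down-weighted in proportion to an estimated
    bias score (larger bias -> smaller weight), with the scale renormalized."""
    per_sample = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
    weights = 1.0 / (bias_scores + 1e-6)
    weights = weights / weights.sum() * len(weights)      # keep the loss scale stable
    return (weights * per_sample).mean()

logits = torch.randn(4, 3)
labels = torch.tensor([0, 2, 1, 0])
bias = torch.tensor([0.9, 0.1, 0.5, 0.2])                 # hypothetical bias estimates
print(ipw_weighted_loss(logits, labels, bias).item())
```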
+
+
+
+
+ + ☆ A Systematic Evaluation of Federated Learning on Biomedical Natural + Language Processing KDD 2023 + + +
+ Language models (LMs) like BERT and GPT have revolutionized natural language
+processing (NLP). However, privacy-sensitive domains, particularly the medical
+field, face challenges in training LMs due to limited data access and privacy
+constraints imposed by regulations like the Health Insurance Portability and
+Accountability Act (HIPAA) and the General Data Protection Regulation (GDPR).
+Federated learning (FL) offers a decentralized solution that enables
+collaborative learning while ensuring the preservation of data privacy. In this
+study, we systematically evaluate FL in medicine across $2$ biomedical NLP
+tasks using $6$ LMs encompassing $8$ corpora. Our results show that: 1) FL
+models consistently outperform LMs trained on individual clients' data and
+sometimes match the model trained with pooled data; 2) with a fixed total
+amount of data, LMs trained using FL with more clients exhibit inferior
+performance, but pre-trained transformer-based models exhibit greater
+resilience; 3) LMs trained using FL perform nearly on par with the model
+trained on pooled data when clients' data are IID distributed, while exhibiting
+visible gaps with non-IID data. Our code is available at:
+https://github.com/PL97/FedNLP
+
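+ For reference, the aggregation step at the heart of such FL training is
+usually a federated-averaging update; a generic sketch (not the authors' exact
+training code) looks like this:
+
+import numpy as np
+
+def fedavg(client_states, client_sizes):
+    """Average per-parameter client weights, weighted by local dataset size."""
+    total = float(sum(client_sizes))
+    keys = client_states[0].keys()
+    return {
+        k: sum(s[k] * (n / total) for s, n in zip(client_states, client_sizes))
+        for k in keys
+    }
+
+# client_states: list of {param_name: np.ndarray}; client_sizes: list of ints;
+# the server broadcasts the averaged state back to the clients each round.
+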
+
+ comment: Accepted by KDD 2023 Workshop FL4Data-Mining +
+
+
+
+
+ + ☆ Jina Embeddings: A Novel Set of High-Performance Sentence Embedding + Models EMNLP 2023 + + +
+ Jina Embeddings constitutes a set of high-performance sentence embedding +models adept at translating various textual inputs into numerical +representations, thereby capturing the semantic essence of the text. While +these models are not exclusively designed for text generation, they excel in +applications such as dense retrieval and semantic textual similarity. This +paper details the development of Jina Embeddings, starting with the creation of +a high-quality pairwise and triplet dataset. It underlines the crucial role of +data cleaning in dataset preparation, gives in-depth insights into the model +training process, and concludes with a comprehensive performance evaluation +using the Massive Textual Embedding Benchmark (MTEB). + +
+
+ comment: 9 pages, 2 page appendix, EMNLP 2023 Industrial Track +
+
+
+
+
+ + ☆ UMLS-KGI-BERT: Data-Centric Knowledge Integration in Transformers for + Biomedical Entity Recognition + + +
+ Pre-trained transformer language models (LMs) have in recent years become the +dominant paradigm in applied NLP. These models have achieved state-of-the-art +performance on tasks such as information extraction, question answering, +sentiment analysis, document classification and many others. In the biomedical +domain, significant progress has been made in adapting this paradigm to NLP +tasks that require the integration of domain-specific knowledge as well as +statistical modelling of language. In particular, research in this area has +focused on the question of how best to construct LMs that take into account not +only the patterns of token distribution in medical text, but also the wealth of +structured information contained in terminology resources such as the UMLS. +This work contributes a data-centric paradigm for enriching the language +representations of biomedical transformer-encoder LMs by extracting text +sequences from the UMLS. This allows for graph-based learning objectives to be +combined with masked-language pre-training. Preliminary results from +experiments in the extension of pre-trained LMs as well as training from +scratch show that this framework improves downstream performance on multiple +biomedical and clinical Named Entity Recognition (NER) tasks. + +
+
+
+
+
+ + ☆ Applying QNLP to sentiment analysis in finance + + +
+ As an application domain where the slightest qualitative improvements can +yield immense value, finance is a promising candidate for early quantum +advantage. Focusing on the rapidly advancing field of Quantum Natural Language +Processing (QNLP), we explore the practical applicability of the two central +approaches DisCoCat and Quantum-Enhanced Long Short-Term Memory (QLSTM) to the +problem of sentiment analysis in finance. Utilizing a novel ChatGPT-based data +generation approach, we conduct a case study with more than 1000 realistic +sentences and find that QLSTMs can be trained substantially faster than +DisCoCat while also achieving close to classical results for their available +software implementations. + +
+
+
+
+
+ + ☆ LLM Cognitive Judgements Differ From Human + + +
+ Large Language Models (LLMs) have lately been in the spotlight of
+researchers, businesses, and consumers alike. While the linguistic capabilities
+of such models have been studied extensively, there is growing interest in
+investigating them as cognitive subjects. In the present work I examine the
+capabilities of GPT-3 and ChatGPT on a limited-data inductive reasoning task
+from the cognitive science literature. The results suggest that these models'
+cognitive judgements are not human-like.
+
+
+ comment: 7 pages, 1 figure +
+
+
+
+
+ + ☆ Adversarial Conversational Shaping for Intelligent Agents + + +
+ The recent emergence of deep learning methods has enabled the research
+community to achieve state-of-the-art results in several domains, including
+natural language processing. However, current robocall systems remain unstable
+and inaccurate: text generators and chatbots can be tedious and misunderstand
+human-like dialogue. In this work, we study the performance of two models able
+to enhance an intelligent conversational agent through adversarial
+conversational shaping: a generative adversarial network with policy gradient
+(GANPG) and a generative adversarial network with reward for every generation
+step (REGS), based on the REGS model presented in Li et al. [18]. The latter is
+able to assign rewards to both partially and fully generated text sequences. We
+discuss performance with different training details: seq2seq [36] and
+transformers [37] in a reinforcement learning framework.
+
+
+
+
+
+ + ♻ ☆ DialogStudio: Towards Richest and Most Diverse Unified Dataset + Collection for Conversational AI + + +
+ Despite advancements in conversational AI, language models encounter
+challenges in handling diverse conversational tasks, and existing dialogue
+dataset collections often lack diversity and comprehensiveness. To tackle these
+issues, we introduce DialogStudio: the largest and most diverse collection of
+dialogue datasets, unified under a consistent format while preserving their
+original information. Our collection encompasses data from open-domain
+dialogues, task-oriented dialogues, natural language understanding,
+conversational recommendation, dialogue summarization, and knowledge-grounded
+dialogues, making it an incredibly rich and diverse resource for dialogue
+research and model training. To further enhance the utility of DialogStudio, we
+identify the licenses for each dataset and design domain-aware prompts for
+selected dialogues to facilitate instruction-aware fine-tuning. Furthermore, we
+develop conversational AI models using the dataset collection, and our
+experiments in both zero-shot and few-shot learning scenarios demonstrate the
+superiority of DialogStudio. To improve transparency and support dataset and
+task-based research, as well as language model pre-training, all datasets,
+licenses, code, and models associated with DialogStudio are made publicly
+accessible at https://github.com/salesforce/DialogStudio
+
+
+
+
+
+ + ♻ ☆ Mathematical Capabilities of ChatGPT + + +
+ We investigate the mathematical capabilities of two iterations of ChatGPT +(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on +publicly available datasets, as well as hand-crafted ones, using a novel +methodology. In contrast to formal mathematics, where large databases of formal +proofs are available (e.g., the Lean Mathematical Library), current datasets of +natural-language mathematics, used to benchmark language models, either cover +only elementary mathematics or are very small. We address this by publicly +releasing two new datasets: GHOSTS and miniGHOSTS. These are the first +natural-language datasets curated by working researchers in mathematics that +(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of +the mathematical capabilities of language models, and (3) distinguish multiple +dimensions of mathematical reasoning. These datasets also test whether ChatGPT +and GPT-4 can be helpful assistants to professional mathematicians by emulating +use cases that arise in the daily professional activities of mathematicians. We +benchmark the models on a range of fine-grained performance metrics. For +advanced mathematics, this is the most detailed evaluation effort to date. We +find that ChatGPT can be used most successfully as a mathematical assistant for +querying facts, acting as a mathematical search engine and knowledge base +interface. GPT-4 can additionally be used for undergraduate-level mathematics +but fails on graduate-level difficulty. Contrary to many positive reports in +the media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of +selection bias), their overall mathematical performance is well below the level +of a graduate student. Hence, if your goal is to use ChatGPT to pass a +graduate-level math exam, you would be better off copying from your average +peer! + +
+
+ comment: Added further evaluations on another ChatGPT version and on GPT-4. + The GHOSTS and miniGHOSTS datasets are available at + https://github.com/xyfrieder/science-GHOSTS +
+
+
+
+
+ + ♻ ☆ Sabiá: Portuguese Large Language Models + + +
+ As the capabilities of language models continue to advance, it is conceivable
+that a "one-size-fits-all" model will remain the main paradigm. For instance,
+given the vast number of languages worldwide, many of which are low-resource,
+the prevalent practice is to pretrain a single model on multiple languages. In
+this paper, we add to the growing body of evidence that challenges this
+practice, demonstrating that monolingual pretraining on the target language
+significantly improves models already extensively trained on diverse corpora.
+More specifically, we further pretrain GPT-J and LLaMA models on Portuguese
+texts using 3% or less of their original pretraining budget. Few-shot
+evaluations on Poeta, a suite of 14 Portuguese datasets, reveal that our models
+outperform English-centric and multilingual counterparts by a significant
+margin. Our best model, Sabi\'a-65B, performs on par with GPT-3.5-turbo. By
+evaluating on datasets originally conceived in the target language as well as
+translated ones, we study the contributions of language-specific pretraining in
+terms of 1) capturing linguistic nuances and structures inherent to the target
+language, and 2) enriching the model's knowledge about a domain or culture. Our
+results indicate that the majority of the benefits stem from the
+domain-specific knowledge acquired through monolingual pretraining.
+
+
+
+
+
+ + ♻ ☆ MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model CVPR 2023 + + +
+ Multimodal semantic understanding often has to deal with uncertainty, which +means the obtained messages tend to refer to multiple targets. Such uncertainty +is problematic for our interpretation, including inter- and intra-modal +uncertainty. Little effort has studied the modeling of this uncertainty, +particularly in pre-training on unlabeled datasets and fine-tuning in +task-specific downstream datasets. In this paper, we project the +representations of all modalities as probabilistic distributions via a +Probability Distribution Encoder (PDE) by utilizing sequence-level +interactions. Compared to the existing deterministic methods, such uncertainty +modeling can convey richer multimodal semantic information and more complex +relationships. Furthermore, we integrate uncertainty modeling with popular +pre-training frameworks and propose suitable pre-training tasks: +Distribution-based Vision-Language Contrastive learning (D-VLC), +Distribution-based Masked Language Modeling (D-MLM), and Distribution-based +Image-Text Matching (D-ITM). The fine-tuned models are applied to challenging +downstream tasks, including image-text retrieval, visual question answering, +visual reasoning, and visual entailment, and achieve state-of-the-art results. + +
+
+ comment: CVPR 2023 Main Track Long Paper +
+
+
+
+
+ + ♻ ☆ Class-Incremental Learning based on Label Generation ACL 2023 + + +
+ Despite the great success of pre-trained language models, it is still a +challenge to use these models for continual learning, especially for the +class-incremental learning (CIL) setting due to catastrophic forgetting (CF). +This paper reports our finding that if we formulate CIL as a continual label +generation problem, CF is drastically reduced and the generalizable +representations of pre-trained models can be better retained. We thus propose a +new CIL method (VAG) that also leverages the sparsity of vocabulary to focus +the generation and creates pseudo-replay samples by using label semantics. +Experimental results show that VAG outperforms baselines by a large margin. + +
+
+ comment: 12 pages, ACL 2023 Main Conference +
+
+
+
+
+ + ♻ ☆ $α$-$β$-Factorization and the Binary Case of Simon's Congruence + + +
+ In 1991 H\'ebrard introduced a factorization of words that turned out to be a +powerful tool for the investigation of a word's scattered factors (also known +as (scattered) subwords or subsequences). Based on this, first Karandikar and +Schnoebelen introduced the notion of $k$-richness and later on Barker et al. +the notion of $k$-universality. In 2022 Fleischmann et al. presented a +generalization of the arch factorization by intersecting the arch factorization +of a word and its reverse. While the authors merely used this factorization for +the investigation of shortest absent scattered factors, in this work we +investigate this new $\alpha$-$\beta$-factorization as such. We characterize +the famous Simon congruence of $k$-universal words in terms of $1$-universal +words. Moreover, we apply these results to binary words. In this special case, +we obtain a full characterization of the classes and calculate the index of the +congruence. Lastly, we start investigating the ternary case, present a full +list of possibilities for $\alpha\beta\alpha$-factors, and characterize their +congruence. + +
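+ To make the underlying notions concrete, the arch factorization that the
+$\alpha$-$\beta$-factorization builds on can be computed greedily, and the
+number of complete arches gives the universality index. A small illustrative
+script (assuming, unless given explicitly, that the alphabet is the set of
+letters occurring in the word):
+
+def arch_factorization(word, alphabet=None):
+    """Greedy arch factorization: each arch is the shortest factor containing
+    every letter of the alphabet; the remaining suffix is the rest."""
+    alphabet = set(alphabet) if alphabet else set(word)
+    arches, seen, start = [], set(), 0
+    for i, ch in enumerate(word):
+        seen.add(ch)
+        if seen == alphabet:
+            arches.append(word[start:i + 1])
+            seen, start = set(), i + 1
+    return arches, word[start:]
+
+arches, rest = arch_factorization("abcacbba")
+# arches == ['abc', 'acb'], rest == 'ba'; the word is 2-universal over {a,b,c}
+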
+
+
+
+
+ + ♻ ☆ My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models + and Evaluation Benchmarks + + +
+ The research on code-mixed data is limited due to the unavailability of
+dedicated code-mixed datasets and pre-trained language models. In this work, we
+focus on the low-resource Indian language Marathi which lacks any prior work in
+code-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English
+(Mr-En) corpus with 10 million social media sentences for pretraining. We also
+release L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models
+pre-trained on MeCorpus. Furthermore, for benchmarking, we present three
+supervised datasets MeHate, MeSent, and MeLID for downstream tasks like
+code-mixed Mr-En hate speech detection, sentiment analysis, and language
+identification respectively. These evaluation datasets individually consist of
+approximately 12,000 manually annotated Marathi-English code-mixed tweets.
+Ablations show that the models trained on this novel corpus significantly
+outperform the existing state-of-the-art BERT models. This is the first work
+that presents artifacts for code-mixed Marathi research. All datasets and
+models are publicly released at https://github.com/l3cube-pune/MarathiNLP .
+
+
+
+
+
+ + ♻ ☆ RadAdapt: Radiology Report Summarization via Lightweight Domain + Adaptation of Large Language Models ACL + + +
+ We systematically investigate lightweight strategies to adapt large language +models (LLMs) for the task of radiology report summarization (RRS). +Specifically, we focus on domain adaptation via pretraining (on natural +language, biomedical text, or clinical text) and via discrete prompting or +parameter-efficient fine-tuning. Our results consistently achieve best +performance by maximally adapting to the task via pretraining on clinical text +and fine-tuning on RRS examples. Importantly, this method fine-tunes a mere +0.32% of parameters throughout the model, in contrast to end-to-end fine-tuning +(100% of parameters). Additionally, we study the effect of in-context examples +and out-of-distribution (OOD) training before concluding with a radiologist +reader study and qualitative analysis. Our findings highlight the importance of +domain adaptation in RRS and provide valuable insights toward developing +effective natural language processing solutions for clinical tasks. + +
+
+ comment: 12 pages, 10 figures. Published in ACL BioNLP. Compared to v1, v2 + includes minor edits and one additional figure in the appendix. Compared to + v2, v3 includes a link to the project's GitHub repository +
+
+
+
+
+ + ♻ ☆ A Textless Metric for Speech-to-Speech Comparison + + +
+ In this paper, we introduce a new and simple method for comparing speech
+utterances without relying on text transcripts. Our speech-to-speech comparison
+metric utilizes state-of-the-art speech2unit encoders like HuBERT to convert
+speech utterances into discrete acoustic units. We then propose a simple and
+easily replicable neural architecture that learns a speech-based metric that
+closely corresponds to its text-based counterpart. This textless metric has
+numerous potential applications, including evaluating speech-to-speech
+translation for oral languages, for languages without dependable ASR systems,
+or to avoid the need for ASR transcription altogether. This paper also shows
+that for speech-to-speech translation evaluation, ASR-BLEU (which consists of
+automatically transcribing both the speech hypothesis and the reference and
+computing sentence-level BLEU between the transcripts) is a poor proxy for real
+text-BLEU even when the ASR system is strong.
+
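+ For context, the ASR-BLEU baseline that the paper argues against can be
+sketched in a few lines (transcribe() stands in for whatever ASR system is
+available; sacrebleu provides the sentence-level BLEU):
+
+import sacrebleu
+
+def asr_bleu(speech_hyp, speech_ref, transcribe):
+    """ASR-BLEU: transcribe both utterances, then score the transcripts."""
+    hyp_text = transcribe(speech_hyp)   # ASR on the generated speech
+    ref_text = transcribe(speech_ref)   # ASR on the reference speech
+    return sacrebleu.sentence_bleu(hyp_text, [ref_text]).score
+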
+
+ comment: link to supplementary material: + https://github.com/besacier/textless-metric +
+
+
+
+
+ + ♻ ☆ Science in the Era of ChatGPT, Large Language Models and Generative AI: + Challenges for Research Ethics and How to Respond + + +
+ Large language models of artificial intelligence (AI), such as ChatGPT, find
+remarkable but controversial applicability in science and research. This paper
+reviews the epistemological challenges and the ethical and integrity risks in
+the conduct of science brought about by the advent of generative AI, with the
+aim of laying new, timely foundations for a high-quality research ethics
+review. The role of AI language models as a research instrument and subject is
+scrutinized, along with the ethical implications for scientists, participants
+and reviewers. New emerging practices for research ethics review are discussed,
+concluding with ten recommendations that shape a response for more responsible
+research conduct in the era of AI.
+
+
+
+
+
+ + ♻ ☆ ThoughtSource: A central hub for large language model reasoning data + + +
+ Large language models (LLMs) such as GPT-4 have recently demonstrated +impressive results across a wide range of tasks. LLMs are still limited, +however, in that they frequently fail at complex reasoning, their reasoning +processes are opaque, they are prone to 'hallucinate' facts, and there are +concerns about their underlying biases. Letting models verbalize reasoning +steps as natural language, a technique known as chain-of-thought prompting, has +recently been proposed as a way to address some of these issues. Here we +present ThoughtSource, a meta-dataset and software library for chain-of-thought +(CoT) reasoning. The goal of ThoughtSource is to improve future artificial +intelligence systems by facilitating qualitative understanding of CoTs, +enabling empirical evaluations, and providing training data. This first release +of ThoughtSource integrates six scientific/medical, three general-domain and +five math word question answering datasets. + +
+
+ comment: Revision: added datasets, formatting +
+
+
+
+
+ + ♻ ☆ ABNIRML: Analyzing the Behavior of Neural IR Models ACL + + +
+ Pretrained contextualized language models such as BERT and T5 have +established a new state-of-the-art for ad-hoc search. However, it is not yet +well-understood why these methods are so effective, what makes some variants +more effective than others, and what pitfalls they may have. We present a new +comprehensive framework for Analyzing the Behavior of Neural IR ModeLs +(ABNIRML), which includes new types of diagnostic probes that allow us to test +several characteristics -- such as writing styles, factuality, sensitivity to +paraphrasing and word order -- that are not addressed by previous techniques. +To demonstrate the value of the framework, we conduct an extensive empirical +study that yields insights into the factors that contribute to the neural +model's gains, and identify potential unintended biases the models exhibit. +Some of our results confirm conventional wisdom, like that recent neural +ranking models rely less on exact term overlap with the query, and instead +leverage richer linguistic information, evidenced by their higher sensitivity +to word and sentence order. Other results are more surprising, such as that +some models (e.g., T5 and ColBERT) are biased towards factually correct (rather +than simply relevant) texts. Further, some characteristics vary even for the +same base language model, and other characteristics can appear due to random +variations during model training. + +
+
+ comment: TACL version +
+
+
+
+
+ + ♻ ☆ Boosting Language Models Reasoning with Chain-of-Knowledge Prompting + + +
+ Recently, Chain-of-Thought (CoT) prompting has delivered success on complex
+reasoning tasks, which aims at designing a simple prompt like ``Let's think
+step by step'' or multiple in-context exemplars with well-designed rationales
+to elicit Large Language Models (LLMs) to generate intermediate reasoning
+steps. However, the generated rationales often come with mistakes, resulting in
+unfactual and unfaithful reasoning chains. To mitigate this brittleness, we
+propose a novel Chain-of-Knowledge (CoK) prompting, where we aim at eliciting
+LLMs to generate explicit pieces of knowledge evidence in the form of
+structured triples. This is inspired by human behavior, i.e., we can draw a
+mind map or knowledge map as reasoning evidence in our minds before answering a
+complex question. Benefiting from CoK, we additionally introduce an
+F^2-Verification method to estimate the reliability of the reasoning chains in
+terms of factuality and faithfulness. For unreliable responses, the wrong
+evidence can be pointed out to prompt the LLM to rethink. Extensive experiments
+demonstrate that our method further improves the performance of commonsense,
+factual, symbolic, and arithmetic reasoning tasks.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Positive-Augmented Contrastive Learning for Image and Video Captioning + Evaluation CVPR 2023 + + +
+ The CLIP model has been recently proven to be very effective for a variety of +cross-modal tasks, including the evaluation of captions generated from +vision-and-language architectures. In this paper, we propose a new recipe for a +contrastive-based evaluation metric for image captioning, namely +Positive-Augmented Contrastive learning Score (PAC-S), that in a novel way +unifies the learning of a contrastive visual-semantic space with the addition +of generated images and text on curated data. Experiments spanning several +datasets demonstrate that our new metric achieves the highest correlation with +human judgments on both images and videos, outperforming existing +reference-based metrics like CIDEr and SPICE and reference-free metrics like +CLIP-Score. Finally, we test the system-level correlation of the proposed +metric when considering popular image captioning approaches, and assess the +impact of employing different cross-modal features. Our source code and trained +models are publicly available at: https://github.com/aimagelab/pacscore. + +
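+ For context, the reference-free CLIP-Score baseline that PAC-S is compared
+against can be computed along these lines with the Hugging Face CLIP
+implementation (PAC-S itself relies on the authors' positive-augmented
+fine-tuned encoders, which are not reproduced here; the 2.5 rescaling factor
+follows the original CLIPScore formulation):
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def clip_score(image_path, caption, w=2.5):
+    inputs = processor(text=[caption], images=Image.open(image_path),
+                       return_tensors="pt", padding=True)
+    with torch.no_grad():
+        img = model.get_image_features(pixel_values=inputs["pixel_values"])
+        txt = model.get_text_features(input_ids=inputs["input_ids"],
+                                      attention_mask=inputs["attention_mask"])
+    cos = torch.nn.functional.cosine_similarity(img, txt).item()
+    return w * max(cos, 0.0)
+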
+
+ comment: CVPR 2023 (highlight paper) +
+
+
+
+
+ + ♻ ☆ MotionGPT: Human Motion as a Foreign Language + + +
+ Though the advancement of pre-trained large language models unfolds, the +exploration of building a unified model for language and other multi-modal +data, such as motion, remains challenging and untouched so far. Fortunately, +human motion displays a semantic coupling akin to human language, often +perceived as a form of body language. By fusing language data with large-scale +motion models, motion-language pre-training that can enhance the performance of +motion-related tasks becomes feasible. Driven by this insight, we propose +MotionGPT, a unified, versatile, and user-friendly motion-language model to +handle multiple motion-relevant tasks. Specifically, we employ the discrete +vector quantization for human motion and transfer 3D motion into motion tokens, +similar to the generation process of word tokens. Building upon this "motion +vocabulary", we perform language modeling on both motion and text in a unified +manner, treating human motion as a specific language. Moreover, inspired by +prompt learning, we pre-train MotionGPT with a mixture of motion-language data +and fine-tune it on prompt-based question-and-answer tasks. Extensive +experiments demonstrate that MotionGPT achieves state-of-the-art performances +on multiple motion tasks including text-driven motion generation, motion +captioning, motion prediction, and motion in-between. + +
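+ The motion-token step described above boils down to nearest-codebook-entry
+quantization; a minimal sketch (the codebook would be learned with a VQ-VAE,
+which is not shown, and the feature and codebook sizes below are arbitrary):
+
+import numpy as np
+
+def quantize_motion(features, codebook):
+    """features: (T, D) per-frame motion features; codebook: (K, D).
+    Returns (T,) integer motion tokens (indices of nearest codebook entries)."""
+    d = ((features[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)  # (T, K)
+    return d.argmin(axis=1)
+
+tokens = quantize_motion(np.random.randn(64, 256), np.random.randn(512, 256))
+# the discrete tokens can then be modeled jointly with text tokens
+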
+
+ comment: Project Page: https://github.com/OpenMotionLab/MotionGPT +
+
+
+
+
+ + ♻ ☆ PatternGPT :A Pattern-Driven Framework for Large Language Model Text + Generation + + +
+ Large language models (LLMs) have shown excellent text generation
+capabilities and can generate fluent, human-like responses for many downstream
+tasks. However, applying large language models to real-world critical tasks
+remains challenging due to their susceptibility to hallucinations and their
+inability to directly use external knowledge. To cope with these challenges,
+this paper proposes PatternGPT, a pattern-driven text generation framework for
+large language models. First, the framework uses the extraction capability of
+LLMs to generate rich and diversified structured, formalized patterns, which
+facilitates the introduction of external knowledge into the computation. It
+then draws on the idea of federated learning, using multiple agents to share
+patterns and thereby obtain a more diverse pattern set. Finally, judgment
+criteria and optimization algorithms are used to search for high-quality
+patterns, and the selected patterns are used to guide model generation. The
+framework has the advantages of generating diversified patterns, protecting
+data privacy, combining external knowledge, and improving generation quality,
+providing an effective way to optimize the text generation capability of large
+language models and to apply them to intelligent dialogue and content
+generation.
+
+
+
+
+
+ + ♻ ☆ LLMs as Workers in Human-Computational Algorithms? Replicating + Crowdsourcing Pipelines with LLMs + + +
+ LLMs have shown promise in replicating human-like behavior in crowdsourcing +tasks that were previously thought to be exclusive to human abilities. However, +current efforts focus mainly on simple atomic tasks. We explore whether LLMs +can replicate more complex crowdsourcing pipelines. We find that modern LLMs +can simulate some of crowdworkers' abilities in these "human computation +algorithms," but the level of success is variable and influenced by requesters' +understanding of LLM capabilities, the specific skills required for sub-tasks, +and the optimal interaction modality for performing these sub-tasks. We reflect +on human and LLMs' different sensitivities to instructions, stress the +importance of enabling human-facing safeguards for LLMs, and discuss the +potential of training humans and LLMs with complementary skill sets. Crucially, +we show that replicating crowdsourcing pipelines offers a valuable platform to +investigate (1) the relative strengths of LLMs on different tasks (by +cross-comparing their performances on sub-tasks) and (2) LLMs' potential in +complex tasks, where they can complete part of the tasks while leaving others +to humans. + +
+
+
+
+
+ + ♻ ☆ ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF + Synthesis + + +
+ We use prompt engineering to guide ChatGPT in the automation of text mining +of metal-organic frameworks (MOFs) synthesis conditions from diverse formats +and styles of the scientific literature. This effectively mitigates ChatGPT's +tendency to hallucinate information -- an issue that previously made the use of +Large Language Models (LLMs) in scientific fields challenging. Our approach +involves the development of a workflow implementing three different processes +for text mining, programmed by ChatGPT itself. All of them enable parsing, +searching, filtering, classification, summarization, and data unification with +different tradeoffs between labor, speed, and accuracy. We deploy this system +to extract 26,257 distinct synthesis parameters pertaining to approximately 800 +MOFs sourced from peer-reviewed research articles. This process incorporates +our ChemPrompt Engineering strategy to instruct ChatGPT in text mining, +resulting in impressive precision, recall, and F1 scores of 90-99%. +Furthermore, with the dataset built by text mining, we constructed a +machine-learning model with over 86% accuracy in predicting MOF experimental +crystallization outcomes and preliminarily identifying important factors in MOF +crystallization. We also developed a reliable data-grounded MOF chatbot to +answer questions on chemical reactions and synthesis procedures. Given that the +process of using ChatGPT reliably mines and tabulates diverse MOF synthesis +information in a unified format, while using only narrative language requiring +no coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be +very useful across various other chemistry sub-disciplines. + +
+
+ comment: Published on Journal of the American Chemical Society (2023); 102 + pages (18-page manuscript, 84 pages of supporting information) +
+
+
+
+
+ + ♻ ☆ Unifying Token and Span Level Supervisions for Few-Shot Sequence + Labeling + + +
+ Few-shot sequence labeling aims to identify novel classes based on only a few +labeled samples. Existing methods solve the data scarcity problem mainly by +designing token-level or span-level labeling models based on metric learning. +However, these methods are only trained at a single granularity (i.e., either +token level or span level) and have some weaknesses of the corresponding +granularity. In this paper, we first unify token and span level supervisions +and propose a Consistent Dual Adaptive Prototypical (CDAP) network for few-shot +sequence labeling. CDAP contains the token-level and span-level networks, +jointly trained at different granularities. To align the outputs of two +networks, we further propose a consistent loss to enable them to learn from +each other. During the inference phase, we propose a consistent greedy +inference algorithm that first adjusts the predicted probability and then +greedily selects non-overlapping spans with maximum probability. Extensive +experiments show that our model achieves new state-of-the-art results on three +benchmark datasets. + +
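+ The greedy step of the consistent inference procedure can be sketched as
+follows (spans are (start, end, probability) triples; the preceding
+probability-adjustment step is not shown):
+
+def greedy_select(spans):
+    """Greedily keep non-overlapping spans in order of decreasing probability.
+    spans: list of (start, end, prob) tuples with `end` exclusive."""
+    chosen = []
+    for s, e, p in sorted(spans, key=lambda x: x[2], reverse=True):
+        if all(e <= cs or s >= ce for cs, ce, _ in chosen):   # no overlap
+            chosen.append((s, e, p))
+    return sorted(chosen)
+
+print(greedy_select([(0, 3, 0.9), (2, 5, 0.8), (5, 7, 0.6)]))
+# -> [(0, 3, 0.9), (5, 7, 0.6)]
+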
+
+ comment: Accepted by ACM Transactions on Information Systems +
+
+
+
+
+ + ♻ ☆ Performance Comparison of Large Language Models on VNHSGE English + Dataset: OpenAI ChatGPT, Microsoft Bing Chat, and Google Bard + + +
+ This paper presents a performance comparison of three large language models
+(LLMs), namely OpenAI ChatGPT, Microsoft Bing Chat (BingChat), and Google Bard,
+on the VNHSGE English dataset. The performance of BingChat, Bard, and ChatGPT
+(GPT-3.5) is 92.4\%, 86\%, and 79.2\%, respectively. The results show that
+BingChat is better than ChatGPT and Bard. Therefore, BingChat and Bard can
+serve as substitutes for ChatGPT, which is not yet officially available in
+Vietnam. The results also indicate that BingChat, Bard and ChatGPT outperform
+Vietnamese students in English language proficiency. The findings of this study
+contribute to the understanding of the potential of LLMs in English language
+education. The remarkable performance of ChatGPT, BingChat, and Bard
+demonstrates their potential as effective tools for teaching and learning
+English at the high school level.
+
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide + for Simultaneous Speech Translation + + +
+ Attention is the core mechanism of today's most used architectures for
+natural language processing and has been analyzed from many perspectives,
+including its effectiveness for machine translation-related tasks. Among these
+studies, attention has proved to be a useful source of information for gaining
+insights into word alignment, even when the input text is replaced with audio
+segments, as in the case of the speech translation (ST) task. In this paper, we
+propose AlignAtt, a novel policy for simultaneous ST (SimulST) that exploits
+the attention information to generate source-target alignments that guide the
+model during inference. Through experiments on the 8 language pairs of MuST-C
+v1.0, we show that AlignAtt outperforms previous state-of-the-art SimulST
+policies applied to offline-trained models, with BLEU gains of 2 points and
+latency reductions ranging from 0.5s to 0.8s across the 8 languages.
+
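+ A simplified sketch of the policy as we read it from the abstract: a
+candidate target token is emitted only if the audio frame it attends to most is
+not among the last f (most recently received) frames, otherwise the model waits
+for more audio. Details such as which attention heads and layers are used
+follow the paper, not this sketch.
+
+import numpy as np
+
+def alignatt_emit(attn_row, num_frames, f=4):
+    """attn_row: cross-attention weights of the candidate token over the audio
+    frames received so far (length >= num_frames). Emit if the most-attended
+    frame is not within the last f frames."""
+    return int(np.argmax(attn_row[:num_frames])) < num_frames - f
+
+# if alignatt_emit(cross_attention[token_idx], frames_received): emit the token
+# else: wait for the next audio chunk and re-run the decoder
+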
+
+ comment: Accepted at Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Guided Generation for Large Language Models + + +
+ In this article we describe an efficient approach to guiding language model +text generation with regular expressions and context-free grammars. Our +approach adds little to no overhead to the token sequence generation process, +and makes guided generation feasible in practice. An implementation is provided +in the open source Python library Outlines. + +
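+ A naive illustration of regex-guided decoding (not the paper's efficient
+method, which precompiles the constraint into an index over the vocabulary): at
+each step, mask out every token whose addition would make the generated text
+stop being a viable prefix of a string matching the pattern. The third-party
+`regex` package's partial matching is used for the prefix test.
+
+import regex  # pip install regex (supports partial matching)
+
+def allowed_token_ids(prefix, vocab, pattern):
+    """vocab: {token_id: token_string}. Keep tokens that keep `prefix + token`
+    a viable (partial or full) match of the regular expression `pattern`."""
+    ok = []
+    for tid, tok in vocab.items():
+        if regex.fullmatch(pattern, prefix + tok, partial=True) is not None:
+            ok.append(tid)
+    return ok
+
+vocab = {0: "4", 1: "2", 2: ".", 3: "a"}
+print(allowed_token_ids("3.1", vocab, r"\d+\.\d+"))  # -> [0, 1], digits only
+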
+
+
+
+
+ + ♻ ☆ ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization + Using Floating-Point Formats + + +
+ In the complex domain of large language models (LLMs), striking a balance +between computational efficiency and maintaining model quality is a formidable +challenge. Navigating the inherent limitations of uniform quantization, +particularly when dealing with outliers, and motivated by the launch of +NVIDIA's H100 hardware, this study delves into the viability of floating-point +(FP) quantization, particularly focusing on FP8 and FP4, as a potential +solution. Our comprehensive investigation reveals that for LLMs, FP8 activation +consistently outshines its integer (INT8) equivalent, with the performance edge +becoming more noticeable in models possessing parameters beyond one billion. +For weight quantization, our findings indicate that FP4 exhibits comparable, if +not superior, performance to INT4, simplifying deployment on FP-supported +hardware like H100. To mitigate the overhead from precision alignment caused by +the disparity between weights and activations, we propose two scaling +constraints for weight quantization that negligibly impact the performance +compared to the standard W4A8 model. We additionally enhance our quantization +methods by integrating the Low Rank Compensation (LoRC) strategy, yielding +improvements especially in smaller models. The results of our investigation +emphasize the immense potential of FP quantization for LLMs, paving the way for +high-efficiency deployment in resource-limited settings. + +
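+ To illustrate what FP4 weight quantization means in practice, here is a toy
+round-to-nearest simulation onto an e2m1-style 4-bit floating-point grid. The
+representable values below are one common choice and merely stand in for
+whatever format the paper uses; scaling is per-tensor for brevity, whereas real
+deployments typically scale per channel or per group.
+
+import numpy as np
+
+# Positive magnitudes of an e2m1-style FP4 grid (illustrative choice).
+FP4_POS = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
+FP4_GRID = np.concatenate([-FP4_POS[::-1], FP4_POS])
+
+def fake_quant_fp4(w):
+    """Simulated FP4 quantization: scale into the grid range, snap each value
+    to the nearest representable level, then rescale."""
+    scale = np.abs(w).max() / FP4_GRID.max()
+    idx = np.abs(w[..., None] / scale - FP4_GRID).argmin(-1)
+    return FP4_GRID[idx] * scale
+
+w = np.random.randn(4, 8).astype(np.float32)
+print(np.abs(w - fake_quant_fp4(w)).mean())  # mean absolute quantization error
+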
+
+
+
+
+ + ♻ ☆ FAIR: A Causal Framework for Accurately Inferring Judgments Reversals + + +
+ Artificial intelligence researchers have made significant advances in legal
+intelligence in recent years. However, existing studies have not focused on the
+important value embedded in judgment reversals, which limits improvements in
+the efficiency of legal intelligence. In this paper, we propose a causal
+Framework for Accurately Inferring case Reversals (FAIR), which models the
+problem of judgment reversals based on real Chinese judgments. We mine the
+causes of judgment reversals with causal inference methods and inject the
+obtained causal relationships into the neural network as prior knowledge. Our
+framework is then validated on a challenging dataset as a legal judgment
+prediction task. The experimental results show that our framework can identify
+the most critical factors behind judgment reversals, and that the obtained
+causal relationships can effectively improve the neural network's performance.
+In addition, we discuss the generalization ability of large language models for
+legal intelligence tasks, using ChatGPT as an example. Our experiments find
+that the generalization ability of large language models still has defects, and
+that mining causal relationships can effectively improve the accuracy and
+explainability of model predictions.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 136 + +
+
+
+ + ☆ PAPR: Proximity Attention Point Rendering + + +
+ Learning accurate and parsimonious point cloud representations of scene +surfaces from scratch remains a challenge in 3D representation learning. +Existing point-based methods often suffer from the vanishing gradient problem +or require a large number of points to accurately model scene geometry and +texture. To address these limitations, we propose Proximity Attention Point +Rendering (PAPR), a novel method that consists of a point-based scene +representation and a differentiable renderer. Our scene representation uses a +point cloud where each point is characterized by its spatial position, +foreground score, and view-independent feature vector. The renderer selects the +relevant points for each ray and produces accurate colours using their +associated features. PAPR effectively learns point cloud positions to represent +the correct scene geometry, even when the initialization drastically differs +from the target geometry. Notably, our method captures fine texture details +while using only a parsimonious set of points. We also demonstrate four +practical applications of our method: geometry editing, object manipulation, +texture transfer, and exposure control. More results and code are available on +our project website at https://zvict.github.io/papr/. + +
+
+
+
+
+ + ☆ Representation Learning in Anomaly Detection: Successes, Limits and a + Grand Challenge CVPR'23 + + +
+ In this perspective paper, we argue that the dominant paradigm in anomaly
+detection cannot scale indefinitely and will eventually hit fundamental limits.
+This is due to a no-free-lunch principle for anomaly detection. These
+limitations can be overcome when there are strong task priors, as is the case
+for many industrial tasks. When such priors do not exist, the anomaly detection
+task is much harder. We pose two such tasks as grand challenges for anomaly
+detection: i) scientific discovery by anomaly detection ii) a "mini-grand"
+challenge of detecting the most anomalous image in the ImageNet dataset. We
+believe new anomaly detection tools and ideas would need to be developed to
+overcome these challenges.
+
+
+ comment: Keynote talk at the Visual Anomaly and Novelty Detection Workshop, + CVPR'23 +
+
+
+
+
+ + ☆ GLSFormer : Gated - Long, Short Sequence Transformer for Step + Recognition in Surgical Videos MICCAI 2023 + + +
+ Automated surgical step recognition is an important task that can
+significantly improve patient safety and decision-making during surgeries.
+Existing state-of-the-art methods for surgical step recognition either rely on
+separate, multi-stage modeling of spatial and temporal information or operate
+on short-range temporal resolution when learned jointly. However, the benefits
+of jointly modeling spatio-temporal features and long-range information are not
+taken into account. In this paper, we propose a vision transformer-based
+approach to jointly learn spatio-temporal features directly from sequences of
+frame-level patches. Our method incorporates a gated-temporal attention
+mechanism that intelligently combines short-term and long-term spatio-temporal
+feature representations. We extensively evaluate our approach on two cataract
+surgery video datasets, namely Cataract-101 and D99, and demonstrate superior
+performance compared to various state-of-the-art methods. These results
+validate the suitability of our proposed approach for automated surgical step
+recognition. Our code is released at:
+https://github.com/nisargshah1999/GLSFormer
+
+
+ comment: Accepted to MICCAI 2023 (Early Accept) +
+
+
+
+
+ + ☆ AlignDet: Aligning Pre-training and Fine-tuning in Object Detection ICCV 2023 + + +
+ The paradigm of large-scale pre-training followed by downstream fine-tuning +has been widely employed in various object detection algorithms. In this paper, +we reveal discrepancies in data, model, and task between the pre-training and +fine-tuning procedure in existing practices, which implicitly limit the +detector's performance, generalization ability, and convergence speed. To this +end, we propose AlignDet, a unified pre-training framework that can be adapted +to various existing detectors to alleviate the discrepancies. AlignDet +decouples the pre-training process into two stages, i.e., image-domain and +box-domain pre-training. The image-domain pre-training optimizes the detection +backbone to capture holistic visual abstraction, and box-domain pre-training +learns instance-level semantics and task-aware concepts to initialize the parts +out of the backbone. By incorporating the self-supervised pre-trained +backbones, we can pre-train all modules for various detectors in an +unsupervised paradigm. As depicted in Figure 1, extensive experiments +demonstrate that AlignDet can achieve significant improvements across diverse +protocols, such as detection algorithm, model backbone, data setting, and +training schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by +2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs. + +
+
+ comment: Accepted by ICCV 2023. Code and Models are publicly available. + Project Page: https://liming-ai.github.io/AlignDet +
+
+
+
+
+ + ☆ Learning Dense UV Completion for Human Mesh Recovery + + +
+ Human mesh reconstruction from a single image is challenging in the presence
+of occlusion, which can be caused by self-occlusion, objects, or other humans.
+Existing methods either fail to separate human features accurately or lack
+proper supervision for feature completion. In this paper, we propose Dense
+Inpainting Human Mesh Recovery (DIMR), a two-stage method that leverages dense
+correspondence maps to handle occlusion. Our method utilizes a dense
+correspondence map to separate visible human features and completes them on a
+structured UV map of the human body with an attention-based feature completion
+module. We also design a feature inpainting training procedure that guides the
+network to learn from unoccluded features. We evaluate our method on several
+datasets and demonstrate its superior performance under heavily occluded
+scenarios compared to other methods. Extensive experiments show that our method
+clearly outperforms prior SOTA methods on heavily occluded images and achieves
+comparable results on the standard benchmarks (3DPW).
+
+
+
+
+
+ + ☆ OBJECT 3DIT: Language-guided 3D-aware Image Editing + + +
+ Existing image editing tools, while powerful, typically disregard the
+underlying 3D geometry from which the image is projected. As a result, edits
+made using these tools may become detached from the geometry and lighting
+conditions that are at the foundation of the image formation process. In this
+work, we formulate the new task of language-guided 3D-aware editing, where
+objects in an image should be edited according to a language instruction in the
+context of the underlying 3D scene. To promote progress towards this goal, we
+release OBJECT: a dataset consisting of 400K editing examples created from
+procedurally generated 3D scenes. Each example consists of an input image, an
+editing instruction in language, and the edited image. We also introduce 3DIT:
+single- and multi-task models for four editing tasks. Our models show
+impressive abilities to understand the 3D composition of entire scenes,
+factoring in surrounding objects, surfaces, lighting conditions, shadows, and
+physically-plausible object configurations. Surprisingly, although trained only
+on synthetic scenes from OBJECT, the editing capabilities of 3DIT generalize to
+real-world images.
+
+
+
+
+
+ + ☆ CNOS: A Strong Baseline for CAD-based Novel Object Segmentation + + +
+ We propose a simple three-stage approach to segment unseen objects in RGB +images using their CAD models. Leveraging recent powerful foundation models, +DINOv2 and Segment Anything, we create descriptors and generate proposals, +including binary masks for a given input RGB image. By matching proposals with +reference descriptors created from CAD models, we achieve precise object ID +assignment along with modal masks. We experimentally demonstrate that our +method achieves state-of-the-art results in CAD-based novel object +segmentation, surpassing existing approaches on the seven core datasets of the +BOP challenge by 19.8\% AP using the same BOP evaluation protocol. Our source +code is available at https://github.com/nv-nguyen/cnos. + +
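+ The matching step described above amounts to cosine similarity between
+proposal descriptors and descriptors of rendered CAD templates; a minimal
+sketch (descriptor extraction with DINOv2 and mask proposals from Segment
+Anything are assumed to have been computed already):
+
+import numpy as np
+
+def assign_object_ids(proposal_desc, template_desc, template_obj_ids):
+    """proposal_desc: (P, D); template_desc: (T, D) from rendered CAD views;
+    template_obj_ids: (T,) object id per template. Returns, per proposal,
+    the best-matching object id and its similarity score."""
+    a = proposal_desc / np.linalg.norm(proposal_desc, axis=1, keepdims=True)
+    b = template_desc / np.linalg.norm(template_desc, axis=1, keepdims=True)
+    sim = a @ b.T                               # (P, T) cosine similarities
+    best = sim.argmax(axis=1)
+    return template_obj_ids[best], sim[np.arange(len(a)), best]
+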
+
+
+
+
+ + ☆ Driving Policy Prediction based on Deep Learning Models + + +
+ In this project, we implemented an end-to-end system that takes in combined
+visual features of video frames from a normal camera and depth information from
+a point cloud scanner, and predicts driving policies (vehicle speed and
+steering angle). We verified the safety of our system by comparing the
+predicted results with standard behaviors of real-world experienced drivers.
+Our test results show that the predictions can be considered accurate in at
+least half of the testing cases (50%-80%, depending on the model), and that
+using combined features improved performance in most cases compared to using
+video frames only.
+
+
+ comment: 5 pages, 9 figures +
+
+
+
+
+ + ☆ HRFNet: High-Resolution Forgery Network for Localizing Satellite Image + Manipulation ICIP 2023 + + +
+ Existing high-resolution satellite image forgery localization methods rely on +patch-based or downsampling-based training. Both of these training methods have +major drawbacks, such as inaccurate boundaries between pristine and forged +regions, the generation of unwanted artifacts, etc. To tackle the +aforementioned challenges, inspired by the high-resolution image segmentation +literature, we propose a novel model called HRFNet to enable satellite image +forgery localization effectively. Specifically, equipped with shallow and deep +branches, our model can successfully integrate RGB and resampling features in +both global and local manners to localize forgery more accurately. We perform +various experiments to demonstrate that our method achieves the best +performance, while the memory requirement and processing speed are not +compromised compared to existing methods. + +
+
+ comment: ICIP 2023 +
+
+
+
+
+ + ☆ Cascade-DETR: Delving into High-Quality Universal Object Detection ICCV 2023 + + +
+ Object localization in general environments is a fundamental part of vision +systems. While dominating on the COCO benchmark, recent Transformer-based +detection methods are not competitive in diverse domains. Moreover, these +methods still struggle to very accurately estimate the object bounding boxes in +complex environments. + We introduce Cascade-DETR for high-quality universal object detection. We +jointly tackle the generalization to diverse domains and localization accuracy +by proposing the Cascade Attention layer, which explicitly integrates +object-centric information into the detection decoder by limiting the attention +to the previous box prediction. To further enhance accuracy, we also revisit +the scoring of queries. Instead of relying on classification scores, we predict +the expected IoU of the query, leading to substantially more well-calibrated +confidences. Lastly, we introduce a universal object detection benchmark, +UDB10, that contains 10 datasets from diverse domains. While also advancing the +state-of-the-art on COCO, Cascade-DETR substantially improves DETR-based +detectors on all datasets in UDB10, even by over 10 mAP in some cases. The +improvements under stringent quality requirements are even more pronounced. Our +code and models will be released at https://github.com/SysCV/cascade-detr. + +
+
+ comment: Accepted in ICCV 2023. Our code and models will be released at + https://github.com/SysCV/cascade-detr +
+
+
+
+
+ + ☆ Multi-objective point cloud autoencoders for explainable myocardial + infarction prediction + + +
+ Myocardial infarction (MI) is one of the most common causes of death in the +world. Image-based biomarkers commonly used in the clinic, such as ejection +fraction, fail to capture more complex patterns in the heart's 3D anatomy and +thus limit diagnostic accuracy. In this work, we present the multi-objective +point cloud autoencoder as a novel geometric deep learning approach for +explainable infarction prediction, based on multi-class 3D point cloud +representations of cardiac anatomy and function. Its architecture consists of +multiple task-specific branches connected by a low-dimensional latent space to +allow for effective multi-objective learning of both reconstruction and MI +prediction, while capturing pathology-specific 3D shape information in an +interpretable latent space. Furthermore, its hierarchical branch design with +point cloud-based deep learning operations enables efficient multi-scale +feature learning directly on high-resolution anatomy point clouds. In our +experiments on a large UK Biobank dataset, the multi-objective point cloud +autoencoder is able to accurately reconstruct multi-temporal 3D shapes with +Chamfer distances between predicted and input anatomies below the underlying +images' pixel resolution. Our method outperforms multiple machine learning and +deep learning benchmarks for the task of incident MI prediction by 19% in terms +of Area Under the Receiver Operating Characteristic curve. In addition, its +task-specific compact latent space exhibits easily separable control and MI +clusters with clinically plausible associations between subject encodings and +corresponding 3D shapes, thus demonstrating the explainability of the +prediction. + +
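+ The Chamfer distance used above to assess reconstruction quality can be
+written compactly; a symmetric, average-of-nearest-neighbour formulation is
+sketched below (the paper may use a slightly different variant):
+
+import numpy as np
+
+def chamfer_distance(p, q):
+    """p: (N, 3) and q: (M, 3) point clouds. Returns the symmetric average
+    nearest-neighbour distance; lower means a closer reconstruction."""
+    d = np.linalg.norm(p[:, None, :] - q[None, :, :], axis=-1)   # (N, M)
+    return d.min(axis=1).mean() + d.min(axis=0).mean()
+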
+
+
+
+
+ + ☆ Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image ICCV 2023 + + +
+ Reconstructing accurate 3D scenes from images is a long-standing vision task. +Due to the ill-posedness of the single-image reconstruction problem, most +well-established methods are built upon multi-view geometry. State-of-the-art +(SOTA) monocular metric depth estimation methods can only handle a single +camera model and are unable to perform mixed-data training due to the metric +ambiguity. Meanwhile, SOTA monocular methods trained on large mixed datasets +achieve zero-shot generalization by learning affine-invariant depths, which +cannot recover real-world metrics. In this work, we show that the key to a +zero-shot single-view metric depth model lies in the combination of large-scale +data training and resolving the metric ambiguity from various camera models. We +propose a canonical camera space transformation module, which explicitly +addresses the ambiguity problems and can be effortlessly plugged into existing +monocular models. Equipped with our module, monocular models can be stably +trained with over 8 million images with thousands of camera models, resulting +in zero-shot generalization to in-the-wild images with unseen camera settings. +Experiments demonstrate SOTA performance of our method on 7 zero-shot +benchmarks. Notably, our method won the championship in the 2nd Monocular Depth +Estimation Challenge. Our method enables the accurate recovery of metric 3D +structures on randomly collected internet images, paving the way for plausible +single-image metrology. The potential benefits extend to downstream tasks, +which can be significantly improved by simply plugging in our model. For +example, our model relieves the scale drift issues of monocular-SLAM (Fig. 1), +leading to high-quality metric scale dense mapping. The code is available at +https://github.com/YvanYin/Metric3D. + +
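+ One simple way to picture the canonical-camera idea: metric depth trades off
+against focal length, so depth labels (or predictions) can be mapped between a
+real camera and a canonical one by the focal-length ratio. The transform below
+is only an illustration of that ambiguity resolution; the paper's module
+integrates it into training together with other details, and the canonical
+focal length value here is arbitrary.
+
+def to_canonical_depth(depth, fx, fx_canonical=1000.0):
+    """Rescale metric depth as if the image had been captured with the
+    canonical focal length (in pixels)."""
+    return depth * (fx_canonical / fx)
+
+def from_canonical_depth(depth_canonical, fx, fx_canonical=1000.0):
+    """Undo the canonical transform to recover metric depth for the real camera."""
+    return depth_canonical * (fx / fx_canonical)
+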
+
+ comment: Accepted to ICCV 2023. Won the championship in the 2nd Monocular + Depth Estimation Challenge. The code is available at + https://github.com/YvanYin/Metric3D +
+
+
+
+
+ + ☆ Deep Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized
+for image processing tasks and is particularly suitable for deployment on
+neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for
+image processing, which combines the power of Spiking Neural Networks (SNNs)
+with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two
+primary challenges: ensuring high-fidelity information propagation through the
+network via spikes and formulating an effective training strategy. To address
+the issue of information loss, we introduce multi-threshold spiking neurons,
+which improve the efficiency of information transmission within the
+Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning
+pipeline that leverages pre-trained U-Net models. During the conversion
+process, significant variability in data distribution across different parts is
+observed when utilizing skip connections. Therefore, we propose a
+connection-wise normalization method to prevent inaccurate firing rates.
+Furthermore, we adopt a flow-based training method to fine-tune the converted
+models, reducing time steps while preserving performance. Experimental results
+show that, on image segmentation and denoising, our Spiking-UNet achieves
+comparable performance to its non-spiking counterpart, surpassing existing SNN
+methods. Compared with the converted Spiking-UNet without fine-tuning, our
+Spiking-UNet reduces inference time by approximately 90\%. This research
+broadens the application scope of SNNs in image processing and is expected to
+inspire further exploration in the field of neuromorphic engineering. The code
+for our Spiking-UNet implementation is available at
+https://github.com/SNNresearch/Spiking-UNet.
+
+
+ comment: 22 pages, 5 figures +
+
+
+
+
+ + ☆ Spinal nerve segmentation method and dataset construction in endoscopic + surgical scenarios MICCAI 2023 + + +
+ Endoscopic surgery is currently an important treatment method in the field of
+spinal surgery and avoiding damage to the spinal nerves through video guidance
+is a key challenge. This paper presents the first real-time segmentation method
+for spinal nerves in endoscopic surgery, which provides crucial navigational
+information for surgeons. A finely annotated segmentation dataset of
+approximately 10,000 consecutive frames recorded during surgery is constructed
+for the first time for this field, addressing the problem of semantic
+segmentation. Based on this dataset, we propose FUnet (Frame-Unet), which
+achieves state-of-the-art performance by utilizing inter-frame information and
+self-attention mechanisms. We also conduct extended experiments on a similar
+polyp endoscopy video dataset and show that the model has good generalization
+ability with advantageous performance. The dataset and code of this work are
+presented at: https://github.com/zzzzzzpc/FUnet .
+
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ☆ Soft-tissue Driven Craniomaxillofacial Surgical Planning MICCAI 2023 + + +
+ In CMF surgery, the planning of bony movement to achieve a desired facial +outcome is a challenging task. Current bone driven approaches focus on +normalizing the bone with the expectation that the facial appearance will be +corrected accordingly. However, due to the complex non-linear relationship +between bony structure and facial soft-tissue, such bone-driven methods are +insufficient to correct facial deformities. Despite efforts to simulate facial +changes resulting from bony movement, surgical planning still relies on +iterative revisions and educated guesses. To address these issues, we propose a +soft-tissue driven framework that can automatically create and verify surgical +plans. Our framework consists of a bony planner network that estimates the bony +movements required to achieve the desired facial outcome and a facial simulator +network that can simulate the possible facial changes resulting from the +estimated bony movement plans. By combining these two models, we can verify and +determine the final bony movement required for planning. The proposed framework +was evaluated using a clinical dataset, and our experimental results +demonstrate that the soft-tissue driven approach greatly improves the accuracy +and efficacy of surgical planning when compared to the conventional bone-driven +approach. + +
+
+ comment: Early accepted by MICCAI 2023 +
+
+
+
+
+ + ☆ PE-YOLO: Pyramid Enhancement Network for Dark Object Detection ICANN 2023 + + +
+ Although current object detection models have achieved good results on many
+benchmark datasets, detecting objects in dark conditions remains a major
+challenge. To address this issue, we propose a pyramid enhancement network
+(PENet) and combine it with YOLOv3 to build a dark object detection framework
+named PE-YOLO. Firstly, PENet decomposes the image into four components of
+different resolutions using the Laplacian pyramid. Specifically, we propose a
+detail processing module (DPM) to enhance the detail of images, which consists
+of a context branch and an edge branch. In addition, we propose a low-frequency
+enhancement filter (LEF) to capture low-frequency semantics and prevent
+high-frequency noise. PE-YOLO adopts an end-to-end joint training approach and
+only uses the normal detection loss to simplify the training process. We
+conduct experiments on the low-light object detection dataset ExDark to
+demonstrate the effectiveness of our method. The results indicate that,
+compared with other dark detectors and low-light enhancement models, PE-YOLO
+achieves superior results, reaching 78.0% mAP at 53.6 FPS, and can adapt to
+object detection under different low-light conditions. The code is available at
+https://github.com/XiangchenYin/PE-YOLO.
+
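The pyramid decomposition referred to above is the standard Laplacian pyramid; the sketch below shows one way to split an image into four components of different resolutions with OpenCV. The function name and level count are illustrative, and this is not PE-YOLO's code.

```python
import cv2
import numpy as np

def laplacian_pyramid(img, levels=4):
    """Split an image into `levels` components: high-frequency detail bands at
    successively lower resolutions plus one low-frequency residual."""
    pyramid, current = [], img.astype(np.float32)
    for _ in range(levels - 1):
        down = cv2.pyrDown(current)
        up = cv2.pyrUp(down, dstsize=(current.shape[1], current.shape[0]))
        pyramid.append(current - up)   # detail (Laplacian) band at this scale
        current = down
    pyramid.append(current)            # coarsest, low-frequency component
    return pyramid

img = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)
print([band.shape for band in laplacian_pyramid(img)])
```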
+
+ comment: Accepted at ICANN 2023 +
+
+
+
+
+ + ☆ Improving Online Lane Graph Extraction by Object-Lane Clustering ICCV 2023 + + +
+ Autonomous driving requires accurate local scene understanding information. +To this end, autonomous agents deploy object detection and online BEV lane +graph extraction methods as a part of their perception stack. In this work, we +propose an architecture and loss formulation to improve the accuracy of local +lane graph estimates by using 3D object detection outputs. The proposed method +learns to assign the objects to centerlines by considering the centerlines as +cluster centers and the objects as data points to be assigned a probability +distribution over the cluster centers. This training scheme ensures direct +supervision on the relationship between lanes and objects, thus leading to +better performance. The proposed method improves lane graph estimation +substantially over state-of-the-art methods. The extensive ablations show that +our method can achieve significant performance improvements by using the +outputs of existing 3D object detection methods. Since our method uses the +detection outputs rather than detection method intermediate representations, a +single model of our method can use any detection method at test time. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Proxy Anchor-based Unsupervised Learning for Continuous Generalized + Category Discovery ICCV 2023 + + +
+ Recent advances in deep learning have significantly improved the performance +of various computer vision applications. However, discovering novel categories +in an incremental learning scenario remains a challenging problem due to the +lack of prior knowledge about the number and nature of new categories. Existing +methods for novel category discovery are limited by their reliance on labeled +datasets and prior knowledge about the number of novel categories and the +proportion of novel samples in the batch. To address the limitations and more +accurately reflect real-world scenarios, in this paper, we propose a novel +unsupervised class incremental learning approach for discovering novel +categories on unlabeled sets without prior knowledge. The proposed method +fine-tunes the feature extractor and proxy anchors on labeled sets, then splits +samples into old and novel categories and clusters on the unlabeled dataset. +Furthermore, the proxy anchors-based exemplar generates representative category +vectors to mitigate catastrophic forgetting. Experimental results demonstrate +that our proposed approach outperforms the state-of-the-art methods on +fine-grained datasets under real-world scenarios. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ OCTraN: 3D Occupancy Convolutional Transformer Network in Unstructured + Traffic Scenarios CVPR 2023 + + +
+ Modern approaches for vision-centric environment perception for autonomous +navigation make extensive use of self-supervised monocular depth estimation +algorithms that output disparity maps. However, when this disparity map is +projected onto 3D space, the errors in disparity are magnified, resulting in a +depth estimation error that increases quadratically as the distance from the +camera increases. Though Light Detection and Ranging (LiDAR) can solve this +issue, it is expensive and not feasible for many applications. To address the +challenge of accurate ranging with low-cost sensors, we propose, OCTraN, a +transformer architecture that uses iterative-attention to convert 2D image +features into 3D occupancy features and makes use of convolution and transpose +convolution to efficiently operate on spatial information. We also develop a +self-supervised training pipeline to generalize the model to any scene by +eliminating the need for LiDAR ground truth by substituting it with +pseudo-ground truth labels obtained from boosted monocular depth estimation. + +
+
+ comment: This work was accepted as a spotlight presentation at the + Transformers for Vision Workshop @CVPR 2023 +
+
+
+
+
+ + ☆ Modeling 3D cardiac contraction and relaxation with point cloud + deformation networks + + +
+ Global single-valued biomarkers of cardiac function typically used in +clinical practice, such as ejection fraction, provide limited insight on the +true 3D cardiac deformation process and hence, limit the understanding of both +healthy and pathological cardiac mechanics. In this work, we propose the Point +Cloud Deformation Network (PCD-Net) as a novel geometric deep learning approach +to model 3D cardiac contraction and relaxation between the extreme ends of the +cardiac cycle. It employs the recent advances in point cloud-based deep +learning into an encoder-decoder structure, in order to enable efficient +multi-scale feature learning directly on multi-class 3D point cloud +representations of the cardiac anatomy. We evaluate our approach on a large +dataset of over 10,000 cases from the UK Biobank study and find average Chamfer +distances between the predicted and ground truth anatomies below the pixel +resolution of the underlying image acquisition. Furthermore, we observe similar +clinical metrics between predicted and ground truth populations and show that +the PCD-Net can successfully capture subpopulation-specific differences between +normal subjects and myocardial infarction (MI) patients. We then demonstrate +that the learned 3D deformation patterns outperform multiple clinical +benchmarks by 13% and 7% in terms of area under the receiver operating +characteristic curve for the tasks of prevalent MI detection and incident MI +prediction and by 7% in terms of Harrell's concordance index for MI survival +analysis. + +
+
+
+
+
+ + ☆ Confidence intervals for performance estimates in 3D medical image + segmentation + + +
+ Medical segmentation models are evaluated empirically. As such an evaluation
+is based on a limited set of example images, it is unavoidably noisy. Beyond a
+mean performance measure, reporting confidence intervals is thus crucial.
+However, this is rarely done in medical image segmentation. The width of the
+confidence interval depends on the test set size and on the spread of the
+performance measure (its standard deviation across the test set). For
+classification, many test images are needed to avoid wide confidence intervals.
+Segmentation, however, has not been studied in this regard, and it differs in
+the amount of information brought by a given test image. In this paper, we
+study the typical confidence intervals in medical image segmentation. We carry
+out experiments on 3D image segmentation using the standard nnU-net framework,
+two datasets from the Medical Decathlon challenge, and two performance
+measures: the Dice accuracy and the Hausdorff distance. We show that parametric
+confidence intervals are reasonable approximations of the bootstrap estimates
+for varying test set sizes and spread of the performance metric. Importantly,
+we show that the test size needed to achieve a given precision is often much
+lower than for classification tasks. Typically, a 1% wide confidence interval
+requires about 100-200 test samples when the spread is low (standard deviation
+around 3%). More difficult segmentation tasks may lead to higher spreads and
+require over 1000 samples.
+
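As a concrete illustration of the two interval constructions being compared, the snippet below computes a parametric (normal-approximation) and a percentile-bootstrap 95% confidence interval on synthetic per-case Dice scores; the numbers are illustrative only and not taken from the paper.

```python
import numpy as np

rng = np.random.default_rng(0)
dice = rng.normal(loc=0.85, scale=0.03, size=150)   # synthetic per-case Dice scores

# Parametric (normal-approximation) 95% CI of the mean
mean, sem = dice.mean(), dice.std(ddof=1) / np.sqrt(len(dice))
parametric_ci = (mean - 1.96 * sem, mean + 1.96 * sem)

# Percentile-bootstrap 95% CI of the mean
boot_means = [rng.choice(dice, size=dice.size, replace=True).mean() for _ in range(5000)]
bootstrap_ci = tuple(np.percentile(boot_means, [2.5, 97.5]))

print(parametric_ci, bootstrap_ci)
```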
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Intrinsic Appearance Decomposition Using Point Cloud Representation + + +
+ Intrinsic decomposition aims to infer the albedo and shading from an image.
+Since it is a heavily ill-posed problem, previous methods rely on prior
+assumptions from 2D images; however, the exploration of the data representation
+itself is limited. The point cloud is known as a rich format of scene
+representation, which naturally aligns the geometric information and the color
+information of an image. Our proposed method, Point Intrinsic Net (PoInt-Net
+for short), jointly predicts the albedo, light source direction, and shading,
+using a point cloud representation. Experiments reveal the benefits of
+PoInt-Net: in terms of accuracy, it outperforms 2D representation approaches on
+multiple metrics across datasets; in terms of efficiency, it trains on
+small-scale point clouds and performs stably on point clouds of any scale; in
+terms of robustness, it is trained only on a single-object-level dataset and
+demonstrates reasonable generalization to unseen objects and scenes.
+
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ☆ Language-based Action Concept Spaces Improve Video Self-Supervised + Learning + + +
+ Recent contrastive language image pre-training has led to learning highly +transferable and robust image representations. However, adapting these models +to video domains with minimal supervision remains an open problem. We explore a +simple step in that direction, using language tied self-supervised learning to +adapt an image CLIP model to the video domain. A backbone modified for temporal +modeling is trained under self-distillation settings with train objectives +operating in an action concept space. Feature vectors of various action +concepts extracted from a language encoder using relevant textual prompts +construct this space. We introduce two train objectives, concept distillation +and concept alignment, that retain generality of original representations while +enforcing relations between actions and their attributes. Our approach improves +zero-shot and linear probing performance on three action recognition +benchmarks. + +
+
+
+
+
+ + ☆ Revisiting Fine-Tuning Strategies for Self-supervised Medical Imaging + Analysis + + +
+ Despite the rapid progress in self-supervised learning (SSL), end-to-end
+fine-tuning still remains the dominant fine-tuning strategy for medical imaging
+analysis. However, it remains unclear whether this approach is truly optimal
+for effectively utilizing the pre-trained knowledge, especially considering the
+diverse categories of SSL that capture different types of features. In this
+paper, we first establish strong contrastive and restorative SSL baselines that
+outperform SOTA methods across four diverse downstream tasks. Building upon
+these strong baselines, we conduct an extensive fine-tuning analysis across
+multiple pre-training and fine-tuning datasets, as well as various fine-tuning
+dataset sizes. Contrary to the conventional wisdom of fine-tuning only the last
+few layers of a pre-trained network, we show that fine-tuning intermediate
+layers is more effective, with fine-tuning the second quarter (25-50%) of the
+network being optimal for contrastive SSL, whereas fine-tuning the third
+quarter (50-75%) is optimal for restorative SSL. Compared to the de-facto
+standard of end-to-end fine-tuning, our best fine-tuning strategy, which
+fine-tunes a shallower network consisting of the first three quarters (0-75%)
+of the pre-trained network, yields improvements of as much as 5.48%.
+Additionally, using these insights, we propose a simple yet effective method to
+leverage the complementary strengths of multiple SSL models, resulting in
+enhancements of up to 3.57% compared to using the best model alone. Hence, our
+fine-tuning strategies not only enhance the performance of individual SSL
+models, but also enable effective utilization of the complementary strengths
+offered by multiple SSL models, leading to significant improvements in
+self-supervised medical imaging analysis.
+
+
+
+
+
+ + ☆ WeakPolyp: You Only Look Bounding Box for Polyp Segmentation MICCAI 2023 + + +
+ Limited by expensive pixel-level labels, polyp segmentation models are +plagued by data shortage and suffer from impaired generalization. In contrast, +polyp bounding box annotations are much cheaper and more accessible. Thus, to +reduce labeling cost, we propose to learn a weakly supervised polyp +segmentation model (i.e., WeakPolyp) completely based on bounding box +annotations. However, coarse bounding boxes contain too much noise. To avoid +interference, we introduce the mask-to-box (M2B) transformation. By supervising +the outer box mask of the prediction instead of the prediction itself, M2B +greatly mitigates the mismatch between the coarse label and the precise +prediction. But, M2B only provides sparse supervision, leading to non-unique +predictions. Therefore, we further propose a scale consistency (SC) loss for +dense supervision. By explicitly aligning predictions across the same image at +different scales, the SC loss largely reduces the variation of predictions. +Note that our WeakPolyp is a plug-and-play model, which can be easily ported to +other appealing backbones. Besides, the proposed modules are only used during +training, bringing no computation cost to inference. Extensive experiments +demonstrate the effectiveness of our proposed WeakPolyp, which surprisingly +achieves a comparable performance with a fully supervised model, requiring no +mask annotations at all. + +
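One plausible reading of the mask-to-box idea is to compare only the box-shaped envelope of the prediction against the box label; the sketch below builds that envelope via row- and column-wise maxima. It is a hypothetical rendering for illustration, not the released WeakPolyp code.

```python
import torch

def mask_to_box(pred):
    """Project a soft segmentation map onto its axis-aligned box-shaped envelope.

    pred: (B, 1, H, W) probabilities in [0, 1]. Row-wise and column-wise maxima
    give an envelope whose granularity matches a bounding-box label, so it can
    be supervised with the box mask instead of the (unknown) pixel mask.
    """
    row = pred.max(dim=3, keepdim=True).values    # (B, 1, H, 1)
    col = pred.max(dim=2, keepdim=True).values    # (B, 1, 1, W)
    return torch.minimum(row.expand_as(pred), col.expand_as(pred))

pred = torch.rand(2, 1, 64, 64)
print(mask_to_box(pred).shape)   # torch.Size([2, 1, 64, 64])
```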
+
+ comment: accepted by MICCAI 2023, codes are available at + https://github.com/weijun88/WeakPolyp +
+
+
+
+
+ + ☆ Variational Point Encoding Deformation for Dental Modeling + + +
+ Digital dentistry has made significant advancements in recent years, yet +numerous challenges remain to be addressed. In this study, we release a new +extensive dataset of tooth meshes to encourage further research. Additionally, +we propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable +probabilistic learning of point cloud representations. A key challenge in +existing latent variable models for point clouds is the lack of a 1-to-1 +mapping between input points and output points. Instead, they must rely on +optimizing Chamfer distances, a metric that does not have a normalized +distributional counterpart, preventing its usage in probabilistic models. We +demonstrate that explicit minimization of Chamfer distances can be replaced by +a suitable encoder, which allows us to increase computational efficiency while +simplifying the probabilistic extension. Our experimental findings present +empirical evidence demonstrating the superior performance of VF-Net over +existing models in terms of dental scan reconstruction and extrapolation. +Additionally, our investigation highlights the robustness of VF-Net's latent +representations. These results underscore the promising prospects of VF-Net as +an effective and reliable method for point cloud reconstruction and analysis. + +
+
+
+
+
+ + ☆ Human Motion Generation: A Survey + + +
+ Human motion generation aims to generate natural human pose sequences and +shows immense potential for real-world applications. Substantial progress has +been made recently in motion data collection technologies and generation +methods, laying the foundation for increasing interest in human motion +generation. Most research within this field focuses on generating human motions +based on conditional signals, such as text, audio, and scene contexts. While +significant advancements have been made in recent years, the task continues to +pose challenges due to the intricate nature of human motion and its implicit +relationship with conditional signals. In this survey, we present a +comprehensive literature review of human motion generation, which, to the best +of our knowledge, is the first of its kind in this field. We begin by +introducing the background of human motion and generative models, followed by +an examination of representative methods for three mainstream sub-tasks: +text-conditioned, audio-conditioned, and scene-conditioned human motion +generation. Additionally, we provide an overview of common datasets and +evaluation metrics. Lastly, we discuss open problems and outline potential +future research directions. We hope that this survey could provide the +community with a comprehensive glimpse of this rapidly evolving field and +inspire novel ideas that address the outstanding challenges. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ The popularity of point cloud deep models for safety-critical purposes has +increased, but the reliability and security of these models can be compromised +by intentional or naturally occurring point cloud noise. To combat this issue, +we present a novel point cloud outlier removal method called PointCVaR, which +empowers standard-trained models to eliminate additional outliers and restore +the data. Our approach begins by conducting attribution analysis to determine +the influence of each point on the model output, which we refer to as point +risk. We then optimize the process of filtering high-risk points using +Conditional Value at Risk (CVaR) as the objective. The rationale for this +approach is based on the observation that noise points in point clouds tend to +cluster in the tail of the risk distribution, with a low frequency but a high +level of risk, resulting in significant interference with classification +results. Despite requiring no additional training effort, our method produces +exceptional results in various removal-and-classification experiments for noisy +point clouds, which are corrupted by random noise, adversarial noise, and +backdoor trigger noise. Impressively, it achieves 87% accuracy in defense +against the backdoor attack by removing triggers. Overall, the proposed +PointCVaR effectively eliminates noise points and enhances point cloud +classification, making it a promising plug-in module for various models in +different scenarios. + +
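A simplified stand-in for the tail-risk filtering described above: score each point, treat the top-risk tail as outliers, and report the tail's mean risk (the CVaR). The attribution scores here are random placeholders, and the fixed quantile cut is a simplification of optimizing a CVaR objective.

```python
import numpy as np

def filter_by_cvar(points, risk, alpha=0.1):
    """Keep points below the (1 - alpha) risk quantile; report the tail's mean risk.

    `risk` would come from an attribution analysis of the classifier; here it is
    just an array of per-point scores, and alpha is the fraction treated as the
    risky tail.
    """
    var = np.quantile(risk, 1.0 - alpha)      # Value-at-Risk threshold
    cvar = risk[risk >= var].mean()           # mean risk of the tail = CVaR
    keep = risk < var
    return points[keep], cvar

rng = np.random.default_rng(0)
pts, risk = rng.random((1024, 3)), rng.random(1024)
clean, tail_risk = filter_by_cvar(pts, risk)
print(clean.shape, round(float(tail_risk), 3))
```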
+
+
+
+
+ + ☆ Conservative Estimation of Perception Relevance of Dynamic Objects for + Safe Trajectories in Automotive Scenarios + + +
+ Having efficient testing strategies is a core challenge that needs to be
+overcome for the release of automated driving. This necessitates clear
+requirements as well as suitable methods for testing. In this work, the
+requirements for perception modules are considered with respect to relevance.
+The concept of relevance currently remains insufficiently defined and
+specified. In this paper, we propose a novel methodology to overcome this
+challenge through an exemplary application to collision safety in the highway
+domain. Using this general system and use case specification, a corresponding
+concept for relevance is derived. Irrelevant objects are thus defined as
+objects which do not limit the set of safe actions available to the ego vehicle
+under consideration of all uncertainties. As an initial step, the use case is
+decomposed into functional scenarios with respect to collision relevance. For
+each functional scenario, possible actions of both the ego vehicle and any
+other dynamic object are formalized as equations. This set of possible actions
+is constrained by traffic rules, yielding relevance criteria. As a result, we
+present a conservative estimation of which dynamic objects are relevant for
+perception and need to be considered for a complete evaluation. The estimation
+provides requirements which are applicable for offline testing and validation
+of perception components. A visualization is presented for examples from the
+highD dataset, showing the plausibility of the results. Finally, a possibility
+for a future validation of the presented relevance concept is outlined.
+
+
+
+
+
+ + ☆ FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with + Human Feedback + + +
+ Captions are crucial for understanding scientific visualizations and
+documents. Existing captioning methods for scientific figures rely on
+figure-caption pairs extracted from documents for training, many of which fall
+short with respect to metrics like helpfulness, explainability, and
+visual-descriptiveness [15], leading to generated captions being misaligned
+with reader preferences. To enable the generation of high-quality figure
+captions, we introduce FigCaps-HF, a new framework for figure-caption
+generation that can incorporate domain expert feedback in generating captions
+optimized for reader preferences. Our framework comprises 1) an automatic
+method for evaluating quality of figure-caption pairs, and 2) a novel
+reinforcement learning with human feedback (RLHF) method to optimize a
+generative figure-to-caption model for reader preferences. We demonstrate the
+effectiveness of our simple learning framework by improving performance over
+standard fine-tuning across different types of models. In particular, when
+using BLIP as the base model, our RLHF framework achieves a mean gain of 35.7%,
+16.9%, and 9% in ROUGE, BLEU, and Meteor, respectively. Finally, we release a
+large-scale benchmark dataset with human feedback on figure-caption pairs to
+enable further evaluation and development of RLHF techniques for this problem.
+
+
+ comment: 19 pages, 4 figures. Benchmark Documentation: + https://figcapshf.github.io/ +
+
+
+
+
+ + ☆ Divide & Bind Your Attention for Improved Generative Semantic Nursing + + +
+ Emerging large-scale text-to-image generative models, e.g., Stable Diffusion +(SD), have exhibited overwhelming results with high fidelity. Despite the +magnificent progress, current state-of-the-art models still struggle to +generate images fully adhering to the input prompt. Prior work, Attend & +Excite, has introduced the concept of Generative Semantic Nursing (GSN), aiming +to optimize cross-attention during inference time to better incorporate the +semantics. It demonstrates promising results in generating simple prompts, +e.g., ``a cat and a dog''. However, its efficacy declines when dealing with +more complex prompts, and it does not explicitly address the problem of +improper attribute binding. To address the challenges posed by complex prompts +or scenarios involving multiple entities and to achieve improved attribute +binding, we propose Divide & Bind. We introduce two novel loss objectives for +GSN: a novel attendance loss and a binding loss. Our approach stands out in its +ability to faithfully synthesize desired objects with improved attribute +alignment from complex prompts and exhibits superior performance across +multiple evaluation benchmarks. More videos and updates can be found on the +project page \url{https://sites.google.com/view/divide-and-bind}. + +
+
+ comment: Project page: \url{https://sites.google.com/view/divide-and-bind} +
+
+
+
+
+ + ☆ BlendFace: Re-designing Identity Encoders for Face-Swapping ICCV2023 + + +
+ The great advancements of generative adversarial networks and face
+recognition models in computer vision have made it possible to swap identities
+on images from single sources. Although many studies seem to have proposed
+almost satisfactory solutions, we notice that previous methods still suffer
+from an identity-attribute entanglement that causes undesired attribute
+swapping, because widely used identity encoders, e.g., ArcFace, have crucial
+attribute biases owing to their pretraining on face recognition tasks. To
+address this issue, we design BlendFace, a novel identity encoder for
+face-swapping. The key idea behind BlendFace is that training face recognition
+models on blended images, whose attributes are replaced with those of another
+person, mitigates inter-personal biases such as hairstyles. BlendFace feeds
+disentangled identity features into generators and guides them properly as an
+identity loss function. Extensive experiments demonstrate that BlendFace
+improves the identity-attribute disentanglement in face-swapping models while
+maintaining a comparable quantitative performance to previous methods.
+
+
+ comment: ICCV2023. Code: https://github.com/mapooon/BlendFace, Webpage: + https://mapooon.github.io/BlendFacePage/ +
+
+
+
+
+ + ☆ Exploring Effective Priors and Efficient Models for Weakly-Supervised + Change Detection + + +
+ Weakly-supervised change detection (WSCD) aims to detect pixel-level changes
+with only image-level annotations. Owing to its label efficiency, WSCD is
+drawing increasing attention recently. However, current WSCD methods often
+encounter the challenge of change missing and fabricating, i.e., the
+inconsistency between image-level annotations and pixel-level predictions.
+Specifically, change missing refers to the situation in which the WSCD model
+fails to predict any changed pixels even though the image-level label indicates
+changed, and vice versa for change fabricating. To address this challenge, in
+this work, we leverage global-scale and local-scale priors in WSCD and propose
+two components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.
+The DP decoder decodes samples with the changed image-level label, skips
+samples with the unchanged label, and replaces them with an all-unchanged
+pixel-level label. The LG constraint is derived from the correspondence between
+changed representations and image-level labels, penalizing the model when it
+mispredicts the change status. Additionally, we develop TransWCD, a simple yet
+powerful transformer-based model, showcasing the potential of weakly-supervised
+learning in change detection. By integrating the DP decoder and LG constraint
+into TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL
+achieve significant +6.33% and +9.55% F1 score improvements over the
+state-of-the-art methods on the WHU-CD dataset, respectively. Some performance
+metrics even exceed several fully-supervised change detection (FSCD)
+competitors. Code will be available at
+https://github.com/zhenghuizhao/TransWCD.
+
+
+
+
+
+ + ☆ Self-paced Weight Consolidation for Continual Learning + + +
+ Continual learning algorithms, which keep the parameters of new tasks close
+to those of previous tasks, are popular in preventing catastrophic forgetting
+in sequential task learning settings. However, 1) the performance of the new
+continual learner will be degraded without distinguishing the contributions of
+previously learned tasks; 2) the computational cost will be greatly increased
+with the number of tasks, since most existing algorithms need to regularize all
+previous tasks when learning new tasks. To address the above challenges, we
+propose a self-paced Weight Consolidation (spWC) framework to attain robust
+continual learning via evaluating the discriminative contributions of previous
+tasks. To be specific, we develop a self-paced regularization to reflect the
+priorities of past tasks via measuring difficulty based on a key performance
+indicator (i.e., accuracy). When encountering a new task, all previous tasks
+are sorted from "difficult" to "easy" based on the priorities. Then the
+parameters of the new continual learner will be learned via selectively
+maintaining the knowledge amongst more difficult past tasks, which could well
+overcome catastrophic forgetting with less computational cost. We adopt an
+alternative convex search to iteratively update the model parameters and
+priority weights in the bi-convex formulation. The proposed spWC framework is
+plug-and-play, which is applicable to most continual learning algorithms (e.g.,
+EWC, MAS and RCIL) in different directions (e.g., classification and
+segmentation). Experimental results on several public benchmark datasets
+demonstrate that our proposed framework can effectively improve performance
+when compared with other popular continual learning algorithms.
+
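As a rough illustration of turning per-task difficulty into regularization priorities, the snippet below maps past-task accuracies to normalized weights; the exponential form and the temperature `lam` are assumptions for illustration, not the paper's exact self-paced regularizer.

```python
import numpy as np

def self_paced_priorities(task_accuracies, lam=2.0):
    """Map past-task accuracies to normalized priority weights.

    Lower accuracy is read as higher difficulty, and harder previously learned
    tasks receive larger regularization weights.
    """
    difficulty = 1.0 - np.asarray(task_accuracies, dtype=float)
    w = np.exp(lam * difficulty)
    return w / w.sum()

# three previously learned tasks with accuracies 92%, 75%, 88%
print(self_paced_priorities([0.92, 0.75, 0.88]))
```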
+
+
+
+
+ + ☆ Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals + for GPM: A U-Net Convolutional LSTM Architecture + + +
+ This paper presents a deep learning architecture for nowcasting of
+precipitation almost globally every 30 min with a 4-hour lead time. The
+architecture fuses a U-Net and a convolutional long short-term memory (LSTM)
+neural network and is trained using data from the Integrated Multi-satellitE
+Retrievals for GPM (IMERG) and a few key precipitation drivers from the Global
+Forecast System (GFS). The impacts of different training loss functions,
+including the mean-squared error (regression) and the focal loss
+(classification), on the quality of precipitation nowcasts are studied. The
+results indicate that the regression network performs well in capturing light
+precipitation (below 1.6 mm/hr), but the classification network can outperform
+the regression network for nowcasting of precipitation extremes (>8 mm/hr), in
+terms of the critical success index (CSI). Using the Wasserstein distance, it
+is shown that the precipitation predicted by the classification network has a
+closer class probability distribution to the IMERG than the regression network.
+It is uncovered that the inclusion of the physical variables can improve
+precipitation nowcasting, especially at longer lead times in both networks.
+Taking IMERG as a relative reference, a multi-scale analysis in terms of the
+fractions skill score (FSS) shows that the nowcasting machine remains skillful
+(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For
+precipitation rates greater than 4 mm/hr, only the classification network
+remains FSS-skillful on scales greater than 50 km within a 2-hour lead time.
+
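The fractions skill score used in the multi-scale analysis has a standard definition; a small implementation on synthetic rain-rate fields is sketched below (the threshold, window size, and data are illustrative, not the paper's configuration).

```python
import numpy as np
from scipy.ndimage import uniform_filter

def fractions_skill_score(forecast, observed, threshold, window):
    """Fractions skill score (FSS) at one threshold and neighbourhood size.

    Standard verification definition: compare neighbourhood exceedance
    fractions; FSS > 0.5 is the usual "skilful" cut-off quoted in the abstract.
    Inputs are 2-D rain-rate fields on the same grid.
    """
    f = uniform_filter((forecast >= threshold).astype(float), size=window)
    o = uniform_filter((observed >= threshold).astype(float), size=window)
    mse = np.mean((f - o) ** 2)
    ref = np.mean(f ** 2) + np.mean(o ** 2)
    return 1.0 - mse / ref if ref > 0 else np.nan

rng = np.random.default_rng(0)
fcst, obs = rng.gamma(1.0, 2.0, (128, 128)), rng.gamma(1.0, 2.0, (128, 128))
print(fractions_skill_score(fcst, obs, threshold=4.0, window=5))
```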
+
+
+
+
+ + ☆ Label Calibration for Semantic Segmentation Under Domain Shift ICLR 2023 + + +
+ Performance of a pre-trained semantic segmentation model is likely to +substantially decrease on data from a new domain. We show a pre-trained model +can be adapted to unlabelled target domain data by calculating soft-label +prototypes under the domain shift and making predictions according to the +prototype closest to the vector with predicted class probabilities. The +proposed adaptation procedure is fast, comes almost for free in terms of +computational resources and leads to considerable performance improvements. We +demonstrate the benefits of such label calibration on the highly-practical +synthetic-to-real semantic segmentation problem. + +
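A minimal sketch of prototype-based calibration in the spirit of the abstract: average the model's predicted probability vectors per pseudo-class to form soft-label prototypes, then re-label each pixel by its nearest prototype. The function names and the nearest-prototype rule are assumptions for illustration, not the paper's exact procedure.

```python
import numpy as np

def soft_label_prototypes(probs, preds, num_classes):
    """Average predicted probability vectors per (pseudo-)class to form prototypes."""
    protos = []
    for c in range(num_classes):
        sel = probs[preds == c]
        protos.append(sel.mean(axis=0) if len(sel) else np.full(probs.shape[1], 1.0 / num_classes))
    return np.stack(protos)

def calibrated_predict(probs, protos):
    """Assign each pixel to the class whose prototype is closest to its probability vector."""
    d = ((probs[:, None, :] - protos[None, :, :]) ** 2).sum(-1)   # (N, C) squared distances
    return d.argmin(axis=1)

rng = np.random.default_rng(0)
probs = rng.dirichlet(np.ones(5), size=1000)       # stand-in for softmax outputs on target data
protos = soft_label_prototypes(probs, probs.argmax(axis=1), 5)
print(calibrated_predict(probs, protos)[:10])
```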
+
+ comment: ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for + Trustworthy ML +
+
+
+
+
+ + ☆ Parse and Recall: Towards Accurate Lung Nodule Malignancy Prediction + like Radiologists MICCAI 2023 + + +
+ Lung cancer is a leading cause of death worldwide and early screening is +critical for improving survival outcomes. In clinical practice, the contextual +structure of nodules and the accumulated experience of radiologists are the two +core elements related to the accuracy of identification of benign and malignant +nodules. Contextual information provides comprehensive information about +nodules such as location, shape, and peripheral vessels, and experienced +radiologists can search for clues from previous cases as a reference to enrich +the basis of decision-making. In this paper, we propose a radiologist-inspired +method to simulate the diagnostic process of radiologists, which is composed of +context parsing and prototype recalling modules. The context parsing module +first segments the context structure of nodules and then aggregates contextual +information for a more comprehensive understanding of the nodule. The prototype +recalling module utilizes prototype-based learning to condense previously +learned cases as prototypes for comparative analysis, which is updated online +in a momentum way during training. Building on the two modules, our method +leverages both the intrinsic characteristics of the nodules and the external +knowledge accumulated from other nodules to achieve a sound diagnosis. To meet +the needs of both low-dose and noncontrast screening, we collect a large-scale +dataset of 12,852 and 4,029 nodules from low-dose and noncontrast CTs +respectively, each with pathology- or follow-up-confirmed labels. Experiments +on several datasets demonstrate that our method achieves advanced screening +performance on both low-dose and noncontrast scenarios. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ☆ Gradient-Semantic Compensation for Incremental Semantic Segmentation + + +
+ Incremental semantic segmentation aims to continually learn the segmentation +of new coming classes without accessing the training data of previously learned +classes. However, most current methods fail to address catastrophic forgetting +and background shift since they 1) treat all previous classes equally without +considering different forgetting paces caused by imbalanced gradient +back-propagation; 2) lack strong semantic guidance between classes. To tackle +the above challenges, in this paper, we propose a Gradient-Semantic +Compensation (GSC) model, which surmounts incremental semantic segmentation +from both gradient and semantic perspectives. Specifically, to address +catastrophic forgetting from the gradient aspect, we develop a step-aware +gradient compensation that can balance forgetting paces of previously seen +classes via re-weighting gradient backpropagation. Meanwhile, we propose a +soft-sharp semantic relation distillation to distill consistent inter-class +semantic relations via soft labels for alleviating catastrophic forgetting from +the semantic aspect. In addition, we develop a prototypical pseudo re-labeling +that provides strong semantic guidance to mitigate background shift. It +produces high-quality pseudo labels for old classes in the background by +measuring distances between pixels and class-wise prototypes. Extensive +experiments on three public datasets, i.e., Pascal VOC 2012, ADE20K, and +Cityscapes, demonstrate the effectiveness of our proposed GSC model. + +
+
+
+
+
+ + ☆ BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained + Diffusion ICCV 2023 + + +
+ Recent text-to-image diffusion models have demonstrated an astonishing +capacity to generate high-quality images. However, researchers mainly studied +the way of synthesizing images with only text prompts. While some works have +explored using other modalities as conditions, considerable paired data, e.g., +box/mask-image pairs, and fine-tuning time are required for nurturing models. +As such paired data is time-consuming and labor-intensive to acquire and +restricted to a closed set, this potentially becomes the bottleneck for +applications in an open world. This paper focuses on the simplest form of +user-provided conditions, e.g., box or scribble. To mitigate the aforementioned +problem, we propose a training-free method to control objects and contexts in +the synthesized images adhering to the given spatial conditions. Specifically, +three spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints, +are designed and seamlessly integrated into the denoising step of diffusion +models, requiring no additional training and massive annotated layout data. +Extensive results show that the proposed constraints can control what and where +to present in the images while retaining the ability of the Stable Diffusion +model to synthesize with high fidelity and diverse concept coverage. The code +is publicly available at https://github.com/Sierkinhane/BoxDiff. + +
+
+ comment: Accepted by ICCV 2023. The paper is still being revised for better + organization and comparison +
+
+
+
+
+ + ☆ Perceptual Quality Assessment of Omnidirectional Audio-visual Signals + + +
+ Omnidirectional videos (ODVs) play an increasingly important role in the +application fields of medical, education, advertising, tourism, etc. Assessing +the quality of ODVs is significant for service-providers to improve the user's +Quality of Experience (QoE). However, most existing quality assessment studies +for ODVs only focus on the visual distortions of videos, while ignoring that +the overall QoE also depends on the accompanying audio signals. In this paper, +we first establish a large-scale audio-visual quality assessment dataset for +omnidirectional videos, which includes 375 distorted omnidirectional +audio-visual (A/V) sequences generated from 15 high-quality pristine +omnidirectional A/V contents, and the corresponding perceptual audio-visual +quality scores. Then, we design three baseline methods for full-reference +omnidirectional audio-visual quality assessment (OAVQA), which combine existing +state-of-the-art single-mode audio and video QA models via multimodal fusion +strategies. We validate the effectiveness of the A/V multimodal fusion method +for OAVQA on our dataset, which provides a new benchmark for omnidirectional +QoE evaluation. Our dataset is available at https://github.com/iamazxl/OAVQA. + +
+
+ comment: 12 pages, 5 figures, to be published in CICAI2023 +
+
+
+
+
+ + ☆ Meta-Transformer: A Unified Framework for Multimodal Learning + + +
+ Multimodal learning aims to build models that can process and relate +information from multiple modalities. Despite years of development in this +field, it still remains challenging to design a unified network for processing +various modalities ($\textit{e.g.}$ natural language, 2D images, 3D point +clouds, audio, video, time series, tabular data) due to the inherent gaps among +them. In this work, we propose a framework, named Meta-Transformer, that +leverages a $\textbf{frozen}$ encoder to perform multimodal perception without +any paired multimodal training data. In Meta-Transformer, the raw input data +from various modalities are mapped into a shared token space, allowing a +subsequent encoder with frozen parameters to extract high-level semantic +features of the input data. Composed of three main components: a unified data +tokenizer, a modality-shared encoder, and task-specific heads for downstream +tasks, Meta-Transformer is the first framework to perform unified learning +across 12 modalities with unpaired data. Experiments on different benchmarks +reveal that Meta-Transformer can handle a wide range of tasks including +fundamental perception (text, image, point cloud, audio, video), practical +application (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph, +tabular, and time-series). Meta-Transformer indicates a promising future for +developing unified multimodal intelligence with transformers. Code will be +available at https://github.com/invictus717/MetaTransformer + +
+
+ comment: Project website: https://kxgong.github.io/meta_transformer/ +
+
+
+
+
+ + ☆ HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and + Retarget Faces ICCV 2023 + + +
+ In this paper, we present our method for neural face reenactment, called
+HyperReenact, that aims to generate realistic talking head images of a source
+identity, driven by a target facial pose. Existing state-of-the-art face
+reenactment methods train controllable generative models that learn to
+synthesize realistic facial images, yet they produce reenacted faces that are
+prone to significant visual artifacts, especially under the challenging
+condition of extreme head pose changes, or require expensive few-shot
+fine-tuning to better preserve the source identity characteristics. We propose
+to address these limitations by leveraging the photorealistic generation
+ability and the disentangled properties of a pretrained StyleGAN2 generator, by
+first inverting the real images into its latent space and then using a
+hypernetwork to perform: (i) refinement of the source identity characteristics
+and (ii) facial pose re-targeting, thereby eliminating the dependence on
+external editing methods that typically produce artifacts. Our method operates
+under the one-shot setting (i.e., using a single source frame) and allows for
+cross-subject reenactment, without requiring any subject-specific fine-tuning.
+We compare our method both quantitatively and qualitatively against several
+state-of-the-art techniques on the standard benchmarks of VoxCeleb1 and
+VoxCeleb2, demonstrating the superiority of our approach in producing
+artifact-free images, exhibiting remarkable robustness even under extreme head
+pose changes. We make the code and the pretrained models publicly available at:
+https://github.com/StelaBou/HyperReenact .
+
+
+ comment: Accepted for publication in ICCV 2023. Project page: + https://stelabou.github.io/hyperreenact.github.io/ Code: + https://github.com/StelaBou/HyperReenact +
+
+
+
+
+ + ☆ Optimizing PatchCore for Few/many-shot Anomaly Detection + + +
+ Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and
+tries to distinguish between normal and anomalous data using only a few
+selected samples. While newly proposed few-shot AD methods do compare against
+pre-existing algorithms developed for the full-shot domain as baselines, they
+do not dedicatedly optimize them for the few-shot setting. It thus remains
+unclear if the performance of such pre-existing algorithms can be further
+improved. We address said question in this work. Specifically, we present a
+study on the AD/anomaly segmentation (AS) performance of PatchCore, the current
+state-of-the-art full-shot AD/AS algorithm, in both the few-shot and the
+many-shot settings. We hypothesize that further performance improvements can be
+realized by (I) optimizing its various hyperparameters, and by (II)
+transferring techniques known to improve few-shot supervised learning to the AD
+domain. Exhaustive experiments on the public VisA and MVTec AD datasets reveal
+that (I) significant performance improvements can be realized by optimizing
+hyperparameters such as the underlying feature extractor, and that (II)
+image-level augmentations can, but are not guaranteed to, improve performance.
+Based on these findings, we achieve a new state of the art in few-shot AD on
+VisA, further demonstrating the merit of adapting pre-existing AD/AS methods to
+the few-shot setting. Last, we identify the investigation of feature extractors
+with a strong inductive bias as a potential future research direction for
+(few-shot) AD/AS.
+
+
+
+
+
+ + ☆ Behavioral Analysis of Vision-and-Language Navigation Agents CVPR2023 + + +
+ To be successful, Vision-and-Language Navigation (VLN) agents must be able to +ground instructions to actions based on their surroundings. In this work, we +develop a methodology to study agent behavior on a skill-specific basis -- +examining how well existing agents ground instructions about stopping, turning, +and moving towards specified objects or rooms. Our approach is based on +generating skill-specific interventions and measuring changes in agent +predictions. We present a detailed case study analyzing the behavior of a +recent agent and then compare multiple agents in terms of skill-specific +competency scores. This analysis suggests that biases from training have +lasting effects on agent behavior and that existing models are able to ground +simple referring expressions. Our comparisons between models show that +skill-specific scores correlate with improvements in overall VLN task +performance. + +
+
+ comment: accepted to CVPR2023 +
+
+
+
+
+ + ☆ Feed-Forward Source-Free Domain Adaptation via Class Prototypes ECCV 2022 + + +
+ Source-free domain adaptation has become popular because of its practical +usefulness and no need to access source data. However, the adaptation process +still takes a considerable amount of time and is predominantly based on +optimization that relies on back-propagation. In this work we present a simple +feed-forward approach that challenges the need for back-propagation based +adaptation. Our approach is based on computing prototypes of classes under the +domain shift using a pre-trained model. It achieves strong improvements in +accuracy compared to the pre-trained model and requires only a small fraction +of time of existing domain adaptation methods. + +
+
+ comment: ECCV 2022 Workshop on Out of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ SMURF: Spatial Multi-Representation Fusion for 3D Object Detection with + 4D Imaging Radar + + +
+ The 4D Millimeter wave (mmWave) radar is a promising technology for vehicle +sensing due to its cost-effectiveness and operability in adverse weather +conditions. However, the adoption of this technology has been hindered by +sparsity and noise issues in radar point cloud data. This paper introduces +spatial multi-representation fusion (SMURF), a novel approach to 3D object +detection using a single 4D imaging radar. SMURF leverages multiple +representations of radar detection points, including pillarization and density +features of a multi-dimensional Gaussian mixture distribution through kernel +density estimation (KDE). KDE effectively mitigates measurement inaccuracy +caused by limited angular resolution and multi-path propagation of radar +signals. Additionally, KDE helps alleviate point cloud sparsity by capturing +density features. Experimental evaluations on View-of-Delft (VoD) and +TJ4DRadSet datasets demonstrate the effectiveness and generalization ability of +SMURF, outperforming recently proposed 4D imaging radar-based +single-representation models. Moreover, while using 4D imaging radar only, +SMURF still achieves comparable performance to the state-of-the-art 4D imaging +radar and camera fusion-based method, with an increase of 1.22% in the mean +average precision on bird's-eye view of TJ4DRadSet dataset and 1.32% in the 3D +mean average precision on the entire annotated area of VoD dataset. Our +proposed method demonstrates impressive inference time and addresses the +challenges of real-time detection, with the inference time no more than 0.05 +seconds for most scans on both datasets. This research highlights the benefits +of 4D mmWave radar and is a strong benchmark for subsequent works regarding 3D +object detection with 4D imaging radar. + +
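The kernel-density feature mentioned above can be prototyped in a few lines with SciPy: fit a Gaussian KDE to the radar points and evaluate it back at each point to obtain a per-point density channel. This is only a sketch of the idea, not SMURF's implementation.

```python
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
points = rng.normal(size=(3, 200))   # 200 radar points in (x, y, z); gaussian_kde expects (dims, N)
kde = gaussian_kde(points)           # smooth multi-dimensional density estimate over the sparse cloud
density = kde(points)                # per-point density, usable as an extra feature alongside pillars
print(density.shape, density.mean())
```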
+
+
+
+
+ + ☆ See More and Know More: Zero-shot Point Cloud Segmentation via + Multi-modal Visual Data ICCV 2023 + + +
+ Zero-shot point cloud segmentation aims to make deep models capable of +recognizing novel objects in point cloud that are unseen in the training phase. +Recent trends favor the pipeline which transfers knowledge from seen classes +with labels to unseen classes without labels. They typically align visual +features with semantic features obtained from word embedding by the supervision +of seen classes' annotations. However, point cloud contains limited information +to fully match with semantic features. In fact, the rich appearance information +of images is a natural complement to the textureless point cloud, which is not +well explored in previous literature. Motivated by this, we propose a novel +multi-modal zero-shot learning method to better utilize the complementary +information of point clouds and images for more accurate visual-semantic +alignment. Extensive experiments are performed in two popular benchmarks, i.e., +SemanticKITTI and nuScenes, and our method outperforms current SOTA methods +with 52% and 49% improvement on average for unseen class mIoU, respectively. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Learned Thresholds Token Merging and Pruning for Vision Transformers ICML + + +
+ Vision transformers have demonstrated remarkable success in a wide range of +computer vision tasks over the last years. However, their high computational +costs remain a significant barrier to their practical deployment. In +particular, the complexity of transformer models is quadratic with respect to +the number of input tokens. Therefore techniques that reduce the number of +input tokens that need to be processed have been proposed. This paper +introduces Learned Thresholds token Merging and Pruning (LTMP), a novel +approach that leverages the strengths of both token merging and token pruning. +LTMP uses learned threshold masking modules that dynamically determine which +tokens to merge and which to prune. We demonstrate our approach with extensive +experiments on vision transformers on the ImageNet classification task. Our +results demonstrate that LTMP achieves state-of-the-art accuracy across +reduction rates while requiring only a single fine-tuning epoch, which is an +order of magnitude faster than previous methods. Code is available at +https://github.com/Mxbonn/ltmp . + +
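A rough sketch of a learned threshold masking module: a per-layer threshold is learned, a sigmoid provides soft keep-probabilities for gradients, and a hard mask is applied via a straight-through estimator. The importance-score definition and the merge/prune split are omitted, so this is an assumption-laden illustration rather than the LTMP implementation.

```python
import torch
import torch.nn as nn

class LearnedThresholdMask(nn.Module):
    """Keep tokens whose importance score exceeds a learned threshold."""
    def __init__(self, temperature=10.0):
        super().__init__()
        self.threshold = nn.Parameter(torch.tensor(0.5))
        self.temperature = temperature

    def forward(self, scores):
        soft = torch.sigmoid(self.temperature * (scores - self.threshold))
        hard = (scores > self.threshold).float()
        # straight-through: hard mask in the forward pass, soft gradient in the backward pass
        return hard + soft - soft.detach()

masker = LearnedThresholdMask()
scores = torch.rand(1, 197)          # e.g., attention-derived importance per token
mask = masker(scores)
print(mask.shape, int(mask.sum().item()), "tokens kept")
```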
+
+ comment: Paper to be presented at Efficient Systems for Foundation Models + Workshop at the International Conference on Machine Learning (ICML) 2023 +
+
+
+
+
+ + ☆ Urban Radiance Field Representation with Deformable Neural Mesh + Primitives ICCV2023 + + +
+ Neural Radiance Fields (NeRFs) have achieved great success in the past few +years. However, most current methods still require intensive resources due to +ray marching-based rendering. To construct urban-level radiance fields +efficiently, we design Deformable Neural Mesh Primitive~(DNMP), and propose to +parameterize the entire scene with such primitives. The DNMP is a flexible and +compact neural variant of classic mesh representation, which enjoys both the +efficiency of rasterization-based rendering and the powerful neural +representation capability for photo-realistic image synthesis. Specifically, a +DNMP consists of a set of connected deformable mesh vertices with paired vertex +features to parameterize the geometry and radiance information of a local area. +To constrain the degree of freedom for optimization and lower the storage +budgets, we enforce the shape of each primitive to be decoded from a relatively +low-dimensional latent space. The rendering colors are decoded from the vertex +features (interpolated with rasterization) by a view-dependent MLP. The DNMP +provides a new paradigm for urban-level scene representation with appealing +properties: $(1)$ High-quality rendering. Our method achieves leading +performance for novel view synthesis in urban scenarios. $(2)$ Low +computational costs. Our representation enables fast rendering (2.07ms/1k +pixels) and low peak memory usage (110MB/1k pixels). We also present a +lightweight version that can run 33$\times$ faster than vanilla NeRFs, and +comparable to the highly-optimized Instant-NGP (0.61 vs 0.71ms/1k pixels). +Project page: \href{https://dnmp.github.io/}{https://dnmp.github.io/}. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of + Working Memory + + +
+ Working memory (WM), a fundamental cognitive process facilitating the +temporary storage, integration, manipulation, and retrieval of information, +plays a vital role in reasoning and decision-making tasks. Robust benchmark +datasets that capture the multifaceted nature of WM are crucial for the +effective development and evaluation of AI WM models. Here, we introduce a +comprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM +comprises 10 tasks and a total of 1 million trials, assessing 4 +functionalities, 3 domains, and 11 behavioral and neural characteristics of WM. +We jointly trained and tested state-of-the-art recurrent neural networks and +transformers on all these tasks. We also include human behavioral benchmarks as +an upper bound for comparison. Our results suggest that AI models replicate +some characteristics of WM in the brain, most notably primacy and recency +effects, and neural clusters and correlates specialized for different domains +and functionalities of WM. In the experiments, we also reveal some limitations +in existing models to approximate human behavior. This dataset serves as a +valuable resource for communities in cognitive psychology, neuroscience, and +AI, offering a standardized framework to compare and enhance WM models, +investigate WM's neural underpinnings, and develop WM models with human-like +capabilities. Our source code and data are available at +https://github.com/ZhangLab-DeepNeuroCogLab/WorM. + +
+
+
+
+
+ + ☆ MSQNet: Actor-agnostic Action Recognition with Multi-modal Query + + +
+ Existing action recognition methods are typically actor-specific due to the +intrinsic topological and apparent differences among the actors. This requires +actor-specific pose estimation (e.g., humans vs. animals), leading to +cumbersome model design complexity and high maintenance costs. Moreover, they +often focus on learning the visual modality alone and single-label +classification whilst neglecting other available information sources (e.g., +class name text) and the concurrent occurrence of multiple actions. To overcome +these limitations, we propose a new approach called 'actor-agnostic multi-modal +multi-label action recognition,' which offers a unified solution for various +types of actors, including humans and animals. We further formulate a novel +Multi-modal Semantic Query Network (MSQNet) model in a transformer-based object +detection framework (e.g., DETR), characterized by leveraging visual and +textual modalities to represent the action classes better. The elimination of +actor-specific model designs is a key advantage, as it removes the need for +actor pose estimation altogether. Extensive experiments on five publicly +available benchmarks show that our MSQNet consistently outperforms the prior +arts of actor-specific alternatives on human and animal single- and multi-label +action recognition tasks by up to 50%. Code will be released at +https://github.com/mondalanindya/MSQNet. + +
+
+
+
+
+ + ☆ LBL: Logarithmic Barrier Loss Function for One-class Classification + + +
+ One-class classification (OCC) aims to train a classifier only with the
+target class data and attracts great attention for its strong applicability in
+real-world applications. Although many advances have been made in OCC,
+effective OCC loss functions for deep learning are still lacking. In this
+paper, a novel logarithmic barrier function based OCC loss (LBL), which assigns
+large gradients to the margin samples and thus derives a more compact
+hypersphere, is first proposed by approximating the OCC objective smoothly.
+However, the optimization of LBL may be unstable, especially when samples lie
+on the boundary, leading to an infinite loss. To address this issue, a
+unilateral relaxation Sigmoid function is introduced into LBL, and a novel OCC
+loss named LBLSig is proposed. LBLSig can be seen as a fusion of the mean
+square error (MSE) and the cross entropy (CE), and its optimization is smoother
+owing to the unilateral relaxation Sigmoid function. The effectiveness of the
+proposed LBL and LBLSig is experimentally demonstrated in comparisons with
+several state-of-the-art OCC algorithms on different network structures. The
+source code can be found at https://github.com/ML-HDU/LBL_LBLSig.
+
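For intuition, a textbook logarithmic-barrier penalty for the one-class constraint ||f(x) - c||^2 <= R^2 is sketched below: samples near the hypersphere boundary receive steep gradients, samples deep inside almost none, and the loss diverges on the boundary (clipped here), which is exactly the instability the Sigmoid relaxation is meant to cure. The exact LBL/LBLSig formulations may differ from this sketch.

```python
import torch

def log_barrier_occ_loss(features, center, radius, t=1.0):
    """Generic log-barrier term for keeping embeddings inside a hypersphere.

    features: (N, D) embeddings, center: (D,), radius: scalar tensor.
    The clamp mimics the need for a relaxation near the boundary, where the
    raw barrier would blow up.
    """
    d2 = ((features - center) ** 2).sum(dim=1)
    slack = (radius ** 2 - d2).clamp_min(1e-6)
    return (-(1.0 / t) * torch.log(slack)).mean() + radius ** 2  # barrier + radius penalty

feats = torch.randn(32, 16) * 0.1
center = torch.zeros(16)
print(log_barrier_occ_loss(feats, center, radius=torch.tensor(1.0)))
```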
+
+
+
+
+ + ☆ EdgeAL: An Edge Estimation Based Active Learning Approach for OCT + Segmentation + + +
+ Active learning algorithms have become increasingly popular for training +models with limited data. However, selecting data for annotation remains a +challenging problem due to the limited information available on unseen data. To +address this issue, we propose EdgeAL, which utilizes the edge information of +unseen images as {\it a priori} information for measuring uncertainty. The +uncertainty is quantified by analyzing the divergence and entropy in model +predictions across edges. This measure is then used to select superpixels for +annotation. We demonstrate the effectiveness of EdgeAL on multi-class Optical +Coherence Tomography (OCT) segmentation tasks, where we achieved a 99% dice +score while reducing the annotation label cost to 12%, 2.3%, and 3%, +respectively, on three publicly available datasets (Duke, AROI, and UMN). The +source code is available at \url{https://github.com/Mak-Ta-Reque/EdgeAL} + +
+
+ comment: This version of the contribution has been accepted for publication, + after peer review (when applicable) but is not the Version of Record and does + not reflect post-acceptance improvements, or any corrections. Use of this + Accepted Version is subject to the publisher's Accepted Manuscript terms of + use + https://www.springernature.com/gp/open-research/policies/accepted-manuscript-terms +
+
+
+
+
+ + ☆ Kick Back & Relax: Learning to Reconstruct the World by Watching SlowTV ICCV2023 + + +
+ Self-supervised monocular depth estimation (SS-MDE) has the potential to +scale to vast quantities of data. Unfortunately, existing approaches limit +themselves to the automotive domain, resulting in models incapable of +generalizing to complex environments such as natural or indoor settings. + To address this, we propose a large-scale SlowTV dataset curated from +YouTube, containing an order of magnitude more data than existing automotive +datasets. SlowTV contains 1.7M images from a rich diversity of environments, +such as worldwide seasonal hiking, scenic driving and scuba diving. Using this +dataset, we train an SS-MDE model that provides zero-shot generalization to a +large collection of indoor/outdoor datasets. The resulting model outperforms +all existing SSL approaches and closes the gap on supervised SoTA, despite +using a more efficient architecture. + We additionally introduce a collection of best-practices to further maximize +performance and zero-shot generalization. This includes 1) aspect ratio +augmentation, 2) camera intrinsic estimation, 3) support frame randomization +and 4) flexible motion estimation. Code is available at +https://github.com/jspenmar/slowtv_monodepth. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of + Diffusion Probabilistic Models + + +
+ Existing customization methods require access to multiple reference examples +to align pre-trained diffusion probabilistic models (DPMs) with user-provided +concepts. This paper aims to address the challenge of DPM customization when +the only available supervision is a differentiable metric defined on the +generated contents. Since the sampling procedure of DPMs involves recursive +calls to the denoising UNet, na\"ive gradient backpropagation requires storing +the intermediate states of all iterations, resulting in extremely high memory +consumption. To overcome this issue, we propose a novel method AdjointDPM, +which first generates new samples from diffusion models by solving the +corresponding probability-flow ODEs. It then uses the adjoint sensitivity +method to backpropagate the gradients of the loss to the models' parameters +(including conditioning signals, network weights, and initial noises) by +solving another augmented ODE. To reduce numerical errors in both the forward +generation and gradient backpropagation processes, we further reparameterize +the probability-flow ODE and augmented ODE as simple non-stiff ODEs using +exponential integration. Finally, we demonstrate the effectiveness of +AdjointDPM on three interesting tasks: converting visual effects into +identification text embeddings, finetuning DPMs for specific types of +stylization, and optimizing initial noise to generate adversarial samples for +security auditing. + +
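A sketch of the memory-efficient adjoint idea using the third-party `torchdiffeq` package, not the authors' code. AdjointDPM works on the probability-flow ODE of a diffusion model and reparameterizes it into a non-stiff form with exponential integration; the drift below is a stand-in and the denoiser is treated as a black box.

```python
import torch
from torchdiffeq import odeint_adjoint

class ProbabilityFlow(torch.nn.Module):
    def __init__(self, denoiser):
        super().__init__()
        self.denoiser = denoiser  # e.g. a UNet; hypothetical interface

    def forward(self, t, x):
        # dx/dt for a (hypothetical) probability-flow ODE; the true drift
        # depends on the diffusion model's noise schedule.
        return -0.5 * self.denoiser(x, t)

def sample_and_backprop(denoiser, x_T, loss_fn, t0=1.0, t1=0.0):
    ode = ProbabilityFlow(denoiser)
    ts = torch.tensor([t0, t1])
    x_T.requires_grad_(True)  # so gradients also reach the initial noise
    # odeint_adjoint avoids storing every intermediate state and recovers
    # gradients by solving an augmented ODE backwards in time.
    x_0 = odeint_adjoint(ode, x_T, ts)[-1]
    loss = loss_fn(x_0)
    loss.backward()           # gradients flow to denoiser params and x_T
    return x_0, loss
```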
+
+
+
+
+ + ☆ TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and + Lane Segmentation in Self-Driving Cars + + +
+ Semantic segmentation is a common task in autonomous driving to understand
+the surrounding environment. Driveable Area Segmentation and Lane Detection are
+particularly important for safe and efficient navigation on the road. However,
+original semantic segmentation models are computationally expensive and require
+high-end hardware, which is not feasible for embedded systems in autonomous
+vehicles. This paper proposes a lightweight model for driveable area and lane
+line segmentation. TwinLiteNet is designed to be computationally cheap yet
+achieves accurate and efficient segmentation results. We evaluate TwinLiteNet
+on the BDD100K dataset and compare it with modern models. Experimental results
+show that our TwinLiteNet performs similarly to existing approaches while
+requiring significantly fewer computational resources. Specifically,
+TwinLiteNet achieves a mIoU score of 91.3% for the Drivable Area task and
+31.08% IoU for the Lane Detection task with only 0.4 million parameters, and
+achieves 415 FPS on an RTX A5000 GPU. Furthermore, TwinLiteNet can run in real
+time on embedded devices with limited computing power, achieving 60 FPS on a
+Jetson Xavier NX, making it an ideal solution for self-driving vehicles. Code
+is available at \url{https://github.com/chequanghuy/TwinLiteNet}.
+
+
+
+
+
+ + ☆ Reverse Knowledge Distillation: Training a Large Model using a Small One + for Retinal Image Matching on Limited Data + + +
+ Retinal image matching plays a crucial role in monitoring disease progression
+and treatment response. However, datasets with matched keypoints between
+temporally separated pairs of images are not available in abundance to train
+transformer-based models. We propose a novel approach based on reverse
+knowledge distillation to train large models with limited data while
+preventing overfitting. Firstly, we propose architectural modifications to a
+CNN-based semi-supervised method called SuperRetina that help us improve its
+results on a publicly available dataset. Then, we train a computationally
+heavier model based on a vision transformer encoder using the lighter
+CNN-based model, which is counter-intuitive in the field of knowledge
+distillation, where training lighter models based on heavier ones is the norm.
+Surprisingly, such reverse knowledge distillation improves generalization even
+further. Our experiments suggest that high-dimensional fitting in
+representation space may prevent overfitting, unlike training directly to
+match the final output. We also provide a public dataset with annotations for
+retinal image keypoint detection and matching to help the research community
+develop algorithms for retinal image applications.
+
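A rough sketch of the reverse-distillation idea (assumed names and loss, not the paper's exact training loop): a large transformer "student" is fit to the representation space of a smaller, already-trained CNN "teacher", the opposite of the usual distillation direction.

```python
import torch
import torch.nn.functional as F

def reverse_kd_step(vit_student, cnn_teacher, images, optimizer, proj=None):
    cnn_teacher.eval()
    with torch.no_grad():
        target = cnn_teacher(images)      # descriptors from the small teacher
    pred = vit_student(images)            # descriptors from the large student
    if proj is not None:                  # optional projection if dims differ
        pred = proj(pred)
    # Fitting in representation space (rather than on final outputs) is the
    # regularization effect the abstract highlights.
    loss = F.mse_loss(pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```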
+
+
+
+
+ + ☆ SqueezerFaceNet: Reducing a Small Face Recognition CNN Even More Via + Filter Pruning + + +
+ The widespread use of mobile devices for various digital services has created
+a need for reliable and real-time person authentication. In this context,
+facial recognition technologies have emerged as a dependable method for
+verifying users due to the prevalence of cameras in mobile devices and their
+integration into everyday applications. The rapid advancement of deep
+Convolutional Neural Networks (CNNs) has led to numerous face verification
+architectures. However, these models are often large and impractical for mobile
+applications, reaching sizes of hundreds of megabytes with millions of
+parameters. We address this issue by developing SqueezerFaceNet, a light face
+recognition network with fewer than 1M parameters. This is achieved by applying
+a network pruning method based on Taylor scores, where filters with small
+importance scores are removed iteratively. Starting from an already small
+network (of 1.24M parameters) based on SqueezeNet, we show that it can be
+further reduced (by up to 40%) without an appreciable loss in performance. To
+the best of our knowledge, we are the first to evaluate network pruning methods
+for the task of face recognition.
+
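A minimal sketch of a first-order Taylor importance score for convolutional filters, the kind of criterion the abstract refers to; the exact scoring and pruning schedule in the paper may differ.

```python
import torch

def taylor_filter_scores(activations, gradients):
    # activations, gradients: (N, C, H, W) tensors captured with forward and
    # backward hooks on the layer whose filters are being ranked.
    # Importance of channel c ~ |sum over batch/space of activation * gradient|,
    # i.e. a first-order estimate of the loss change if the filter is removed.
    scores = (activations * gradients).sum(dim=(0, 2, 3)).abs()
    return scores  # iteratively prune the filters with the smallest scores
```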
+
+ comment: Published at VIII International Workshop on Artificial Intelligence + and Pattern Recognition, IWAIPR 2023 +
+
+
+
+
+ + ☆ SLPD: Slide-level Prototypical Distillation for WSIs MICCAI + + +
+ Improving the feature representation ability is the foundation of many whole +slide pathological image (WSIs) tasks. Recent works have achieved great success +in pathological-specific self-supervised learning (SSL). However, most of them +only focus on learning patch-level representations, thus there is still a gap +between pretext and slide-level downstream tasks, e.g., subtyping, grading and +staging. Aiming towards slide-level representations, we propose Slide-Level +Prototypical Distillation (SLPD) to explore intra- and inter-slide semantic +structures for context modeling on WSIs. Specifically, we iteratively perform +intra-slide clustering for the regions (4096x4096 patches) within each WSI to +yield the prototypes and encourage the region representations to be closer to +the assigned prototypes. By representing each slide with its prototypes, we +further select similar slides by the set distance of prototypes and assign the +regions by cross-slide prototypes for distillation. SLPD achieves +state-of-the-art results on multiple slide-level benchmarks and demonstrates +that representation learning of semantic structures of slides can make a +suitable proxy task for WSI analysis. Code will be available at +https://github.com/Carboxy/SLPD. + +
+
+ comment: International Conference on Medical Image Computing and Computer + Assisted Intervention (MICCAI) +
+
+
+
+
+ + ☆ Self2Self+: Single-Image Denoising with Self-Supervised Learning and + Image Quality Assessment Loss + + +
+ Recently, denoising methods based on supervised learning have exhibited +promising performance. However, their reliance on external datasets containing +noisy-clean image pairs restricts their applicability. To address this +limitation, researchers have focused on training denoising networks using +solely a set of noisy inputs. To improve the feasibility of denoising +procedures, in this study, we proposed a single-image self-supervised learning +method in which only the noisy input image is used for network training. Gated +convolution was used for feature extraction and no-reference image quality +assessment was used for guiding the training process. Moreover, the proposed +method sampled instances from the input image dataset using Bernoulli sampling +with a certain dropout rate for training. The corresponding result was produced +by averaging the generated predictions from various instances of the trained +network with dropouts. The experimental results indicated that the proposed +method achieved state-of-the-art denoising performance on both synthetic and +real-world datasets. This highlights the effectiveness and practicality of our +method as a potential solution for various noise removal tasks. + +
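A hedged sketch of the Bernoulli-sampling idea from the abstract: train on randomly masked copies of the single noisy image, then average many dropout predictions at test time. The dropout rate, loss, and architecture details (gated convolutions, IQA guidance) are not shown and are the paper's own.

```python
import torch

def bernoulli_instance(noisy, p_drop=0.3):
    # Randomly drop pixels of the single noisy input to create one training instance.
    mask = (torch.rand_like(noisy) > p_drop).float()
    return noisy * mask, mask            # masked input and the kept-pixel mask

def average_predictions(model, noisy, n_samples=50):
    model.train()                        # keep dropout active at inference
    with torch.no_grad():
        preds = [model(bernoulli_instance(noisy)[0]) for _ in range(n_samples)]
    # The final denoised result is the average over sampled predictions.
    return torch.stack(preds).mean(dim=0)
```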
+
+ comment: Technical report and supplementary materials are combined into one
+ paper. - Technical report: Pages 1~7 - Supplementary materials: Pages 8~18
+
+
+
+
+
+ + ☆ Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged + Object Detection + + +
+ Camouflaged object detection (COD), aiming to segment camouflaged objects
+which exhibit similar patterns with the background, is a challenging task. Most
+existing works are dedicated to establishing specialized modules to identify
+camouflaged objects with complete and fine details, while the boundary cannot
+be well located due to the lack of object-related semantics. In this paper, we
+propose a novel ``pre-train, adapt and detect" paradigm to detect camouflaged
+objects. By introducing a large pre-trained model, abundant knowledge learned
+from massive multi-modal data can be directly transferred to COD. A lightweight
+parallel adapter is inserted to adjust the features suitable for the downstream
+COD task. Extensive experiments on four challenging benchmark datasets
+demonstrate that our method outperforms existing state-of-the-art COD models by
+large margins. Moreover, we design a multi-task learning scheme for tuning the
+adapter to exploit the shareable knowledge across different semantic classes.
+Comprehensive experimental results show that the generalization ability of our
+model can be substantially improved with multi-task adapter initialization on
+source tasks and multi-task adaptation on target tasks.
+
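An illustrative lightweight parallel adapter (the layer sizes, names, and placement are assumptions): a small bottleneck branch added alongside a frozen pre-trained block so that only the adapter is tuned for the downstream task.

```python
import torch.nn as nn

class ParallelAdapter(nn.Module):
    def __init__(self, dim, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)   # project down
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck, dim)     # project back up

    def forward(self, x, frozen_block):
        # Frozen pre-trained path plus a lightweight trainable path in parallel.
        return frozen_block(x) + self.up(self.act(self.down(x)))
```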
+
+
+
+
+ + ☆ Deep learning for classification of noisy QR codes + + +
+ We wish to define the limits of a classical classification model based on
+deep learning when applied to abstract images, which do not represent visually
+identifiable objects. QR codes (Quick Response codes) fall into this category
+of abstract images: with one bit corresponding to one encoded character, QR
+codes were not designed to be decoded manually. To understand the limitations
+of a deep learning-based model for abstract image classification, we train an
+image classification model on QR codes generated from information obtained
+when reading a health pass. We compare the classification model with a
+classical (deterministic) decoding method in the presence of noise. This study
+allows us to conclude that a model based on deep learning can be relevant for
+the understanding of abstract images.
+
+
+ comment: in French language. RFIAP 2022 - Reconnaissance des Formes, Image, + Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France +
+
+
+
+
+ + ☆ Efficient Unified Demosaicing for Bayer and Non-Bayer Patterned Image + Sensors + + +
+ As the physical size of recent CMOS image sensors (CIS) gets smaller, the +latest mobile cameras are adopting unique non-Bayer color filter array (CFA) +patterns (e.g., Quad, Nona, QxQ), which consist of homogeneous color units with +adjacent pixels. These non-Bayer sensors are superior to conventional Bayer CFA +thanks to their changeable pixel-bin sizes for different light conditions but +may introduce visual artifacts during demosaicing due to their inherent pixel +pattern structures and sensor hardware characteristics. Previous demosaicing +methods have primarily focused on Bayer CFA, necessitating distinct +reconstruction methods for non-Bayer patterned CIS with various CFA modes under +different lighting conditions. In this work, we propose an efficient unified +demosaicing method that can be applied to both conventional Bayer RAW and +various non-Bayer CFAs' RAW data in different operation modes. Our Knowledge +Learning-based demosaicing model for Adaptive Patterns, namely KLAP, utilizes +CFA-adaptive filters for only 1% key filters in the network for each CFA, but +still manages to effectively demosaic all the CFAs, yielding comparable +performance to the large-scale models. Furthermore, by employing meta-learning +during inference (KLAP-M), our model is able to eliminate unknown +sensor-generic artifacts in real RAW data, effectively bridging the gap between +synthetic images and real sensor RAW. Our KLAP and KLAP-M methods achieved +state-of-the-art demosaicing performance in both synthetic and real RAW data of +Bayer and non-Bayer CFAs. + +
+
+
+
+
+ + ☆ Lighting up NeRF via Unsupervised Decomposition and Enhancement ICCV 2023 + + +
+ Neural Radiance Field (NeRF) is a promising approach for synthesizing novel +views, given a set of images and the corresponding camera poses of a scene. +However, images photographed from a low-light scene can hardly be used to train +a NeRF model to produce high-quality results, due to their low pixel +intensities, heavy noise, and color distortion. Combining existing low-light +image enhancement methods with NeRF methods also does not work well due to the +view inconsistency caused by the individual 2D enhancement process. In this +paper, we propose a novel approach, called Low-Light NeRF (or LLNeRF), to +enhance the scene representation and synthesize normal-light novel views +directly from sRGB low-light images in an unsupervised manner. The core of our +approach is a decomposition of radiance field learning, which allows us to +enhance the illumination, reduce noise and correct the distorted colors jointly +with the NeRF optimization process. Our method is able to produce novel view +images with proper lighting and vivid colors and details, given a collection of +camera-finished low dynamic range (8-bits/channel) images from a low-light +scene. Experiments demonstrate that our method outperforms existing low-light +enhancement methods and NeRF methods. + +
+
+ comment: ICCV 2023. Project website: https://whyy.site/paper/llnerf +
+
+
+
+
+ + ☆ RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching + Detection + + +
+ The widespread use of face retouching filters on short-video platforms has +raised concerns about the authenticity of digital appearances and the impact of +deceptive advertising. To address these issues, there is a pressing need to +develop advanced face retouching techniques. However, the lack of large-scale +and fine-grained face retouching datasets has been a major obstacle to progress +in this field. In this paper, we introduce RetouchingFFHQ, a large-scale and +fine-grained face retouching dataset that contains over half a million +conditionally-retouched images. RetouchingFFHQ stands out from previous +datasets due to its large scale, high quality, fine-grainedness, and +customization. By including four typical types of face retouching operations +and different retouching levels, we extend the binary face retouching detection +into a fine-grained, multi-retouching type, and multi-retouching level +estimation problem. Additionally, we propose a Multi-granularity Attention +Module (MAM) as a plugin for CNN backbones for enhanced cross-scale +representation learning. Extensive experiments using different baselines as +well as our proposed method on RetouchingFFHQ show decent performance on face +retouching detection. With the proposed new dataset, we believe there is great +potential for future work to tackle the challenging problem of real-world +fine-grained face retouching detection. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Quantized Feature Distillation for Network Quantization AAAI2023 + + +
+ Neural network quantization aims to accelerate and trim full-precision neural +network models by using low bit approximations. Methods adopting the +quantization aware training (QAT) paradigm have recently seen a rapid growth, +but are often conceptually complicated. This paper proposes a novel and highly +effective QAT method, quantized feature distillation (QFD). QFD first trains a +quantized (or binarized) representation as the teacher, then quantize the +network using knowledge distillation (KD). Quantitative results show that QFD +is more flexible and effective (i.e., quantization friendly) than previous +quantization methods. QFD surpasses existing methods by a noticeable margin on +not only image classification but also object detection, albeit being much +simpler. Furthermore, QFD quantizes ViT and Swin-Transformer on MS-COCO +detection and segmentation, which verifies its potential in real world +deployment. To the best of our knowledge, this is the first time that vision +transformers have been quantized in object detection and image segmentation +tasks. + +
+
+ comment: AAAI2023 +
+
+
+
+
+ + ☆ Learning and Evaluating Human Preferences for Conversational Head + Generation + + +
+ A reliable and comprehensive evaluation metric that aligns with manual
+preference assessments is crucial for the development of conversational head
+video synthesis methods. Existing quantitative evaluations often fail to
+capture the full complexity of human preference, as they only consider limited
+evaluation dimensions. Qualitative evaluations and user studies offer a
+solution but are time-consuming and labor-intensive. This limitation hinders
+the advancement of conversational head generation algorithms and systems. In
+this paper, we propose a novel learning-based evaluation metric named
+Preference Score (PS) for fitting human preference according to the
+quantitative evaluations across different dimensions. PS can serve as a
+quantitative evaluation without the need for human annotation. Experimental
+results validate the superiority of Preference Score in aligning with human
+perception, and also demonstrate robustness and generalizability to unseen
+data, making it a valuable tool for advancing conversational head generation.
+We expect this metric could facilitate new advances in conversational head
+generation.
+
+
+
+
+
+ + ☆ Parallelization of a new embedded application for automatic meteor + detection + + +
+ This article presents the methods used to parallelize a new computer vision
+application. The system is able to automatically detect meteors from
+non-stabilized cameras and noisy video sequences. The application is designed
+to be embedded in weather balloons or for airborne observation campaigns. Thus,
+the final target is a low power system-on-chip (< 10 Watts) while the software
+needs to compute a stream of frames in real time (> 25 frames per second). For
+this, the application is first split into a task graph, and then different
+parallelization techniques are applied. Experimental results demonstrate the
+efficiency of the parallelization methods. For instance, on the Raspberry Pi 4
+and on an HD video sequence, the processing chain reaches 42 frames per second
+while consuming only 6 Watts.
+
+
+ comment: in French language, COMPAS 2023 - Conf{\'e}rence francophone + d'informatique en Parall{\'e}lisme, Architecture et Syst{\`e}me, Jul 2023, + Annecy (France), France +
+
+
+
+
+ + ☆ Learning Discriminative Visual-Text Representation for Polyp + Re-Identification + + +
+ Colonoscopic Polyp Re-Identification aims to match a specific polyp in a
+large gallery with different cameras and views, which plays a key role in the
+prevention and treatment of colorectal cancer in computer-aided diagnosis.
+However, traditional methods mainly focus on visual representation learning,
+while neglecting to explore the potential of semantic features during
+training, which may easily lead to poor generalization capability when the
+pretrained model is adapted to new scenarios. To relieve this dilemma, we
+propose a simple but effective training method named VT-ReID, which can
+remarkably enrich the representation of polyp videos with the interchange of
+high-level semantic information. Moreover, we elaborately design a novel
+clustering mechanism to introduce prior knowledge from textual data, which
+leverages contrastive learning to promote better separation from abundant
+unlabeled text data. To the best of our knowledge, this is the first attempt
+to employ visual-text features with a clustering mechanism for colonoscopic
+polyp re-identification. Empirical results show that our method significantly
+outperforms current state-of-the-art methods by a clear margin.
+
+
+
+
+
+ + ☆ Joint Skeletal and Semantic Embedding Loss for Micro-gesture + Classification IJCAI-2023 + + +
+ In this paper, we briefly introduce the solution of our team HFUT-VUT for the
+Micro-gesture Classification track of the MiGA challenge at IJCAI 2023. The
+micro-gesture classification task aims at recognizing the action category of a
+given video based on the skeleton data. For this task, we propose a
+3D-CNNs-based micro-gesture recognition network, which incorporates a skeletal
+and semantic embedding loss to improve action classification performance.
+Finally, we rank 1st in the Micro-gesture Classification Challenge, surpassing
+the second-place team in terms of Top-1 accuracy by 1.10%.
+
+
+ comment: 1st Place in Micro-gesture Classification sub-challenge in MiGA at + IJCAI-2023 +
+
+
+
+
+ + ☆ Quaternion tensor ring decomposition and application for color image + inpainting + + +
+ In recent years, tensor networks have emerged as powerful tools for solving
+large-scale optimization problems. One of the most promising tensor networks is
+the tensor ring (TR) decomposition, which achieves circular dimensional
+permutation invariance in the model through the utilization of the trace
+operation and equitable treatment of the latent cores. On the other hand, more
+recently, quaternions have gained significant attention and have been widely
+utilized in color image processing tasks due to their effectiveness in encoding
+color pixels. Therefore, in this paper, we propose the quaternion tensor ring
+(QTR) decomposition, which inherits the powerful and generalized representation
+abilities of the TR decomposition while leveraging the advantages of
+quaternions for color pixel representation. In addition to providing the
+definition of QTR decomposition and an algorithm for learning the QTR format,
+this paper also proposes a low-rank quaternion tensor completion (LRQTC) model
+and its algorithm for color image inpainting based on the QTR decomposition.
+Finally, extensive experiments on color image inpainting demonstrate that the
+proposed LRQTC method is highly competitive.
+
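For reference, the standard (real-valued) tensor ring decomposition that the abstract builds on expresses each entry of an $N$-th order tensor as the trace of a product of core slices; in the QTR variant the cores are presumably quaternion-valued, as the abstract describes:

\[
\mathcal{X}(i_1, i_2, \ldots, i_N) \;=\; \operatorname{Tr}\!\big(\mathbf{G}^{(1)}(i_1)\,\mathbf{G}^{(2)}(i_2)\cdots\mathbf{G}^{(N)}(i_N)\big),
\qquad \mathbf{G}^{(n)}(i_n) \in \mathbb{R}^{R_n \times R_{n+1}},\; R_{N+1} = R_1 .
\]

The trace operation is what makes the representation invariant to circular shifts of the cores, the property the abstract refers to as circular dimensional permutation invariance.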
+
+
+
+
+ + ☆ Heterogeneous Federated Learning: State-of-the-art and Research + Challenges + + +
+ Federated learning (FL) has drawn increasing attention owing to its potential +use in large-scale industrial applications. Existing federated learning works +mainly focus on model homogeneous settings. However, practical federated +learning typically faces the heterogeneity of data distributions, model +architectures, network environments, and hardware devices among participant +clients. Heterogeneous Federated Learning (HFL) is much more challenging, and +corresponding solutions are diverse and complex. Therefore, a systematic survey +on this topic about the research challenges and state-of-the-art is essential. +In this survey, we firstly summarize the various research challenges in HFL +from five aspects: statistical heterogeneity, model heterogeneity, +communication heterogeneity, device heterogeneity, and additional challenges. +In addition, recent advances in HFL are reviewed and a new taxonomy of existing +HFL methods is proposed with an in-depth analysis of their pros and cons. We +classify existing methods from three different levels according to the HFL +procedure: data-level, model-level, and server-level. Finally, several critical +and promising future research directions in HFL are discussed, which may +facilitate further developments in this field. A periodically updated +collection on HFL is available at https://github.com/marswhu/HFL_Survey. + +
+
+ comment: 42 pages, 11 figures, and 4 tables +
+
+
+
+
+ + ☆ Hybrid Feature Embedding For Automatic Building Outline Extraction + + +
+ Building outlines extracted from high-resolution aerial images can be used in
+various application fields such as change detection and disaster assessment.
+However, traditional CNN models cannot recognize contours very precisely from
+original images. In this paper, we propose a CNN- and Transformer-based model
+together with an active contour model to deal with this problem. We also design
+a triple-branch decoder structure to handle the different features generated by
+the encoder. Experimental results show that our model outperforms other
+baseline models on two datasets, achieving 91.1% mIoU on Vaihingen and 83.8% on
+Bing huts.
+
+
+
+
+
+ + ☆ Physics-Driven Turbulence Image Restoration with Stochastic Refinement ICCV 2023 + + +
+ Image distortion by atmospheric turbulence is a stochastic degradation, which
+is a critical problem in long-range optical imaging systems. A large body of
+research has been conducted over the past decades, including model-based and
+emerging deep-learning solutions with the help of synthetic data. Although fast
+and physics-grounded simulation tools have recently been introduced to help
+deep-learning models adapt to real-world turbulence conditions, the training of
+such models only relies on the synthetic data and ground truth pairs. This
+paper proposes the Physics-integrated Restoration Network (PiRN) to bring the
+physics-based simulator directly into the training process to help the network
+disentangle the stochasticity from the degradation and the underlying image.
+Furthermore, to overcome the ``average effect" introduced by deterministic
+models and the domain gap between the synthetic and real-world degradation, we
+further introduce PiRN with Stochastic Refinement (PiRN-SR) to boost its
+perceptual quality. Overall, our PiRN and PiRN-SR improve the generalization to
+real-world unknown turbulence conditions and provide state-of-the-art
+restoration in both pixel-wise accuracy and perceptual quality. Our codes are
+available at \url{https://github.com/VITA-Group/PiRN}.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and + Multi-View for 3D Object Retrieval + + +
+ To address 3D object retrieval, substantial efforts have been made to +generate highly discriminative descriptors of 3D objects represented by a +single modality, e.g., voxels, point clouds or multi-view images. It is +promising to leverage the complementary information from multi-modality +representations of 3D objects to further improve retrieval performance. +However, multi-modality 3D object retrieval is rarely developed and analyzed on +large-scale datasets. In this paper, we propose self-and-cross attention based +aggregation of point cloud and multi-view images (SCA-PVNet) for 3D object +retrieval. With deep features extracted from point clouds and multi-view +images, we design two types of feature aggregation modules, namely the +In-Modality Aggregation Module (IMAM) and the Cross-Modality Aggregation Module +(CMAM), for effective feature fusion. IMAM leverages a self-attention mechanism +to aggregate multi-view features while CMAM exploits a cross-attention +mechanism to interact point cloud features with multi-view features. The final +descriptor of a 3D object for object retrieval can be obtained via +concatenating the aggregated features from both modules. Extensive experiments +and analysis are conducted on three datasets, ranging from small to large +scale, to show the superiority of the proposed SCA-PVNet over the +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Event Blob Tracking: An Asynchronous Real-Time Algorithm + + +
+ Event-based cameras have become increasingly popular for tracking fast-moving +objects due to their high temporal resolution, low latency, and high dynamic +range. In this paper, we propose a novel algorithm for tracking event blobs +using raw events asynchronously in real time. We introduce the concept of an +event blob as a spatio-temporal likelihood of event occurrence where the +conditional spatial likelihood is blob-like. Many real-world objects generate +event blob data, for example, flickering LEDs such as car headlights or any +small foreground object moving against a static or slowly varying background. +The proposed algorithm uses a nearest neighbour classifier with a dynamic +threshold criteria for data association coupled with a Kalman filter to track +the event blob state. Our algorithm achieves highly accurate tracking and event +blob shape estimation even under challenging lighting conditions and high-speed +motions. The microsecond time resolution achieved means that the filter output +can be used to derive secondary information such as time-to-contact or range +estimation, that will enable applications to real-world problems such as +collision avoidance in autonomous driving. + +
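A generic constant-velocity Kalman filter predict/update step, shown only to illustrate the tracking backbone the abstract describes; the paper couples it with asynchronous per-event nearest-neighbour data association and blob-shape estimation, which are not shown here.

```python
import numpy as np

def kf_predict(x, P, F, Q):
    # Propagate the state estimate and its covariance through the motion model.
    x = F @ x
    P = F @ P @ F.T + Q
    return x, P

def kf_update(x, P, z, H, R):
    y = z - H @ x                          # innovation (measurement residual)
    S = H @ P @ H.T + R                    # innovation covariance
    K = P @ H.T @ np.linalg.inv(S)         # Kalman gain
    x = x + K @ y
    P = (np.eye(P.shape[0]) - K @ H) @ P
    return x, P
```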
+
+ comment: 17 pages, 8 figures, preprint version +
+
+
+
+
+ + ☆ Reference-based Painterly Inpainting via Diffusion: Crossing the Wild + Reference Domain Gap + + +
+ Have you ever imagined how it would look if we placed new objects into +paintings? For example, what would it look like if we placed a basketball into +Claude Monet's ``Water Lilies, Evening Effect''? We propose Reference-based +Painterly Inpainting, a novel task that crosses the wild reference domain gap +and implants novel objects into artworks. Although previous works have examined +reference-based inpainting, they are not designed for large domain +discrepancies between the target and the reference, such as inpainting an +artistic image using a photorealistic reference. This paper proposes a novel +diffusion framework, dubbed RefPaint, to ``inpaint more wildly'' by taking such +references with large domain gaps. Built with an image-conditioned diffusion +model, we introduce a ladder-side branch and a masked fusion mechanism to work +with the inpainting mask. By decomposing the CLIP image embeddings at inference +time, one can manipulate the strength of semantic and style information with +ease. Experiments demonstrate that our proposed RefPaint framework produces +significantly better results than existing methods. Our method enables creative +painterly image inpainting with reference objects that would otherwise be +difficult to achieve. Project page: https://vita-group.github.io/RefPaint/ + +
+
+
+
+
+ + ☆ Ethosight: A Joint-Embedding Based System for Nuanced Perception Using + Contextual Label Affinity Metric and Reasoning Based Iterative Learning + + +
+ Traditional computer vision models often require extensive manual effort for +data acquisition and validation, particularly when detecting subtle behavioral +nuances or events. The difficulty in distinguishing routine behaviors from +potential risks in real-world applications, like differentiating routine +shopping from potential shoplifting, further complicates the process. + We present Ethosight, a novel zero-shot computer vision algorithm. Ethosight +eradicates the need for pre-existing symbolic knowledge, initiating from a +clean slate based on user requirements and semantic knowledge of interest. +Using localized label affinity calculations and a reasoning-guided iterative +learning loop, Ethosight infers scene details and iteratively refines the label +set. Reasoning mechanisms can be derived from large language models like GPT4, +symbolic reasoners like OpenNARS, or hybrid systems. + Ethosight further capitalizes on the capabilities of a pre-trained +multi-modal model, ImageBind, generating accurate semantic knowledge of images +within a few cycles. It successfully captures both explicit and nuanced +elements efficiently. We also introduce the implementation of Korzybski's +"time-binding" concept in machines, which allows for generational learning and +knowledge sharing across deployments. + Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases. +It has exhibited an exceptional ability to discern new areas of interest, +consistently generating high-affinity scores within the top five labels from a +set of a thousand. Tests conducted across diverse environments attest to +Ethosight's robust performance. Detailed results and case studies within the +main body of this paper and an appendix underscore a promising trajectory +towards enhancing the adaptability and resilience of computer vision models in +detecting and extracting subtle and nuanced behaviors. + +
+
+
+
+
+ + ☆ Boosting Federated Learning Convergence with Prototype Regularization + + +
+ As a distributed machine learning technique, federated learning (FL) requires +clients to collaboratively train a shared model with an edge server without +leaking their local data. However, the heterogeneous data distribution among +clients often leads to a decrease in model performance. To tackle this issue, +this paper introduces a prototype-based regularization strategy to address the +heterogeneity in the data distribution. Specifically, the regularization +process involves the server aggregating local prototypes from distributed +clients to generate a global prototype, which is then sent back to the +individual clients to guide their local training. The experimental results on +MNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3% +and 8.9% in average test accuracy, respectively, compared to the most popular +baseline FedAvg. Furthermore, our approach has a fast convergence rate in +heterogeneous settings. + +
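A sketch of the prototype-regularization idea as described in the abstract (variable names and the exact regularizer are assumptions): the server averages per-class prototypes from clients, and each client adds a proximity term pulling its embeddings toward the returned global prototypes.

```python
import torch
import torch.nn.functional as F

def aggregate_prototypes(client_protos):
    # client_protos: list of dicts {class_id: prototype tensor} from each client.
    global_protos = {}
    for protos in client_protos:
        for c, p in protos.items():
            global_protos.setdefault(c, []).append(p)
    return {c: torch.stack(ps).mean(dim=0) for c, ps in global_protos.items()}

def prototype_regularizer(features, labels, global_protos):
    # Pull each sample's embedding toward the global prototype of its class.
    targets = torch.stack([global_protos[int(y)] for y in labels])
    return F.mse_loss(features, targets)
```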
+
+
+
+
+ + ☆ No-frills Temporal Video Grounding: Multi-Scale Neighboring Attention + and Zoom-in Boundary Detection + + +
+ Temporal video grounding (TVG) aims to retrieve the time interval of a +language query from an untrimmed video. A significant challenge in TVG is the +low "Semantic Noise Ratio (SNR)", which results in worse performance with lower +SNR. Prior works have addressed this challenge using sophisticated techniques. +In this paper, we propose a no-frills TVG model that consists of two core +modules, namely multi-scale neighboring attention and zoom-in boundary +detection. The multi-scale neighboring attention restricts each video token to +only aggregate visual contexts from its neighbor, enabling the extraction of +the most distinguishing information with multi-scale feature hierarchies from +high-ratio noises. The zoom-in boundary detection then focuses on local-wise +discrimination of the selected top candidates for fine-grained grounding +adjustment. With an end-to-end training strategy, our model achieves +competitive performance on different TVG benchmarks, while also having the +advantage of faster inference speed and lighter model parameters, thanks to its +lightweight architecture. + +
+
+
+
+
+ + ☆ EMQ: Evolving Training-free Proxies for Automated Mixed Precision + Quantization ICCV2023 + + +
+ Mixed-Precision Quantization~(MQ) can achieve a competitive
+accuracy-complexity trade-off for models. Conventional training-based search
+methods require time-consuming candidate training to search optimized per-layer
+bit-width configurations in MQ. Recently, some training-free approaches have
+presented various MQ proxies and significantly improved search efficiency.
+However, the correlation between these proxies and quantization accuracy is
+poorly understood. To address the gap, we first build MQ-Bench-101, which
+involves different bit configurations and quantization results. Then, we
+observe that the existing training-free proxies exhibit weak correlations on
+MQ-Bench-101. To efficiently seek superior proxies, we develop an automatic
+proxy-search framework for MQ via evolutionary algorithms. In particular, we
+devise an elaborate search space involving the existing proxies and perform an
+evolution search to discover the best correlated MQ proxy. We propose a
+diversity-prompting selection strategy and compatibility screening protocol to
+avoid premature convergence and improve search efficiency. In this way, our
+Evolving proxies for Mixed-precision Quantization~(EMQ) framework allows the
+auto-generation of proxies without heavy tuning and expert knowledge. Extensive
+experiments on ImageNet with various ResNet and MobileNet families demonstrate
+that our EMQ achieves superior performance over state-of-the-art
+mixed-precision methods at a significantly reduced cost. The code will be
+released.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Dynamic Large Language Models on Blockchains + + +
+ Training and deploying large language models requires a large amount of
+computational resources because the language models contain billions of
+parameters and a text can contain thousands of tokens. Another problem is that
+large language models are static: they are fixed after the training process. To
+tackle these issues, in this paper, we propose to train and deploy dynamic
+large language models on blockchains, which have high computation performance
+and are distributed across a network of computers. A blockchain is a secure,
+decentralized, and transparent system that allows for the creation of a
+tamper-proof ledger for transactions without the need for intermediaries. The
+dynamic large language models can continuously learn from user input after the
+training process. Our method provides a new way to develop large language
+models and also sheds light on next-generation artificial intelligence
+systems.
+
+
+
+
+
+ + ☆ Interactive Segmentation for Diverse Gesture Types Without Context + + +
+ Interactive segmentation entails a human marking an image to guide how a +model either creates or edits a segmentation. Our work addresses limitations of +existing methods: they either only support one gesture type for marking an +image (e.g., either clicks or scribbles) or require knowledge of the gesture +type being employed, and require specifying whether marked regions should be +included versus excluded in the final segmentation. We instead propose a +simplified interactive segmentation task where a user only must mark an image, +where the input can be of any gesture type without specifying the gesture type. +We support this new task by introducing the first interactive segmentation +dataset with multiple gesture types as well as a new evaluation metric capable +of holistically evaluating interactive segmentation algorithms. We then analyze +numerous interactive segmentation algorithms, including ones adapted for our +novel task. While we observe promising performance overall, we also highlight +areas for future improvement. To facilitate further extensions of this work, we +publicly share our new dataset at https://github.com/joshmyersdean/dig. + +
+
+
+
+
+ + ☆ FedSoup: Improving Generalization and Personalization in Federated + Learning via Selective Model Interpolation MICCAI2023 + + +
+ Cross-silo federated learning (FL) enables the development of machine +learning models on datasets distributed across data centers such as hospitals +and clinical research laboratories. However, recent research has found that +current FL algorithms face a trade-off between local and global performance +when confronted with distribution shifts. Specifically, personalized FL methods +have a tendency to overfit to local data, leading to a sharp valley in the +local model and inhibiting its ability to generalize to out-of-distribution +data. In this paper, we propose a novel federated model soup method (i.e., +selective interpolation of model parameters) to optimize the trade-off between +local and global performance. Specifically, during the federated training +phase, each client maintains its own global model pool by monitoring the +performance of the interpolated model between the local and global models. This +allows us to alleviate overfitting and seek flat minima, which can +significantly improve the model's generalization performance. We evaluate our +method on retinal and pathological image classification tasks, and our proposed +method achieves significant improvements for out-of-distribution +generalization. Our code is available at https://github.com/ubc-tea/FedSoup. + +
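A minimal sketch of selective weight interpolation ("model soup") between a local and a global model, the core operation the abstract describes; the criterion FedSoup uses to monitor interpolated-model performance and pick the mixing weight is simplified away here.

```python
import copy
import torch

def interpolate_models(local_model, global_model, alpha=0.5):
    # Returns a new model whose parameters are a convex combination of the
    # local and global parameters; alpha would normally be chosen by
    # monitoring validation performance of the interpolated model.
    souped = copy.deepcopy(local_model)
    with torch.no_grad():
        for p_s, p_l, p_g in zip(souped.parameters(),
                                 local_model.parameters(),
                                 global_model.parameters()):
            p_s.copy_(alpha * p_l + (1.0 - alpha) * p_g)
    return souped
```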
+
+ comment: Accepted by MICCAI2023 +
+
+
+
+
+ + ☆ Is Grad-CAM Explainable in Medical Images? + + +
+ Explainable Deep Learning has gained significant attention in the field of +artificial intelligence (AI), particularly in domains such as medical imaging, +where accurate and interpretable machine learning models are crucial for +effective diagnosis and treatment planning. Grad-CAM is a baseline that +highlights the most critical regions of an image used in a deep learning +model's decision-making process, increasing interpretability and trust in the +results. It is applied in many computer vision (CV) tasks such as +classification and explanation. This study explores the principles of +Explainable Deep Learning and its relevance to medical imaging, discusses +various explainability techniques and their limitations, and examines medical +imaging applications of Grad-CAM. The findings highlight the potential of +Explainable Deep Learning and Grad-CAM in improving the accuracy and +interpretability of deep learning models in medical imaging. The code is +available in (will be available). + +
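For reference, the standard Grad-CAM recipe the abstract discusses (not code from this paper): weight the feature maps of a chosen convolutional layer by the spatial average of their gradients with respect to the target class score, then apply a ReLU and normalize. The hook-based helper below is a generic sketch for a PyTorch classifier.

```python
import torch
import torch.nn.functional as F

def grad_cam(model, image, target_layer, class_idx):
    feats, grads = {}, {}
    h1 = target_layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
    h2 = target_layer.register_full_backward_hook(lambda m, gi, go: grads.update(g=go[0]))
    score = model(image.unsqueeze(0))[0, class_idx]   # image: (C, H, W) tensor
    model.zero_grad()
    score.backward()
    h1.remove(); h2.remove()
    weights = grads["g"].mean(dim=(2, 3), keepdim=True)          # GAP of gradients
    cam = F.relu((weights * feats["a"]).sum(dim=1, keepdim=True))  # weighted sum
    cam = F.interpolate(cam, size=image.shape[-2:], mode="bilinear",
                        align_corners=False)
    return cam / (cam.max() + 1e-8)                   # normalized heatmap
```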
+
+
+
+
+ + ☆ Identifying Interpretable Subspaces in Image Representations ICML 2023 + + +
+ We propose Automatic Feature Explanation using Contrasting Concepts (FALCON), +an interpretability framework to explain features of image representations. For +a target feature, FALCON captions its highly activating cropped images using a +large captioning dataset (like LAION-400m) and a pre-trained vision-language +model like CLIP. Each word among the captions is scored and ranked leading to a +small number of shared, human-understandable concepts that closely describe the +target feature. FALCON also applies contrastive interpretation using lowly +activating (counterfactual) images, to eliminate spurious concepts. Although +many existing approaches interpret features independently, we observe in +state-of-the-art self-supervised and supervised models, that less than 20% of +the representation space can be explained by individual features. We show that +features in larger spaces become more interpretable when studied in groups and +can be explained with high-order scoring concepts through FALCON. We discuss +how extracted concepts can be used to explain and debug failures in downstream +tasks. Finally, we present a technique to transfer concepts from one +(explainable) representation space to another unseen representation space by +learning a simple linear transformation. + +
+
+ comment: Published at ICML 2023 +
+
+
+
+
+ + ☆ GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition + in Surgical Videos MICCAI 2023 + + +
+ Automated surgical step recognition is an important task that can
+significantly improve patient safety and decision-making during surgeries.
+Existing state-of-the-art methods for surgical step recognition either rely on
+separate, multi-stage modeling of spatial and temporal information or operate
+on short-range temporal resolution when learned jointly. However, the benefits
+of joint modeling of spatio-temporal features and long-range information are
+not taken into account. In this paper, we propose a vision transformer-based
+approach to jointly learn spatio-temporal features directly from a sequence of
+frame-level patches. Our method incorporates a gated-temporal attention
+mechanism that intelligently combines short-term and long-term spatio-temporal
+feature representations. We extensively evaluate our approach on two cataract
+surgery video datasets, namely Cataract-101 and D99, and demonstrate superior
+performance compared to various state-of-the-art methods. These results
+validate the suitability of our proposed approach for automated surgical step
+recognition. Our code is released at:
+https://github.com/nisargshah1999/GLSFormer
+
+
+ comment: Accepted to MICCAI 2023 (Early Accept) +
+
+
+
+
+ + ☆ SimCol3D -- 3D Reconstruction during Colonoscopy Challenge + + +
+ Colorectal cancer is one of the most common cancers in the world. While +colonoscopy is an effective screening technique, navigating an endoscope +through the colon to detect polyps is challenging. A 3D map of the observed +surfaces could enhance the identification of unscreened colon tissue and serve +as a training platform. However, reconstructing the colon from video footage +remains unsolved due to numerous factors such as self-occlusion, reflective +surfaces, lack of texture, and tissue deformation that limit feature-based +methods. Learning-based approaches hold promise as robust alternatives, but +necessitate extensive datasets. By establishing a benchmark, the 2022 EndoVis +sub-challenge SimCol3D aimed to facilitate data-driven depth and pose +prediction during colonoscopy. The challenge was hosted as part of MICCAI 2022 +in Singapore. Six teams from around the world and representatives from academia +and industry participated in the three sub-challenges: synthetic depth +prediction, synthetic pose prediction, and real pose prediction. This paper +describes the challenge, the submitted methods, and their results. We show that +depth prediction in virtual colonoscopy is robustly solvable, while pose +estimation remains an open research question. + +
+
+
+
+
+ + ☆ Towards Non-Parametric Models for Confidence Aware Image Prediction from + Low Data using Gaussian Processes + + +
+ The ability to envision future states is crucial to informed decision making +while interacting with dynamic environments. With cameras providing a prevalent +and information rich sensing modality, the problem of predicting future states +from image sequences has garnered a lot of attention. Current state of the art +methods typically train large parametric models for their predictions. Though +often able to predict with accuracy, these models rely on the availability of +large training datasets to converge to useful solutions. In this paper we focus +on the problem of predicting future images of an image sequence from very +little training data. To approach this problem, we use non-parametric models to +take a probabilistic approach to image prediction. We generate probability +distributions over sequentially predicted images and propagate uncertainty +through time to generate a confidence metric for our predictions. Gaussian +Processes are used for their data efficiency and ability to readily incorporate +new training data online. We showcase our method by successfully predicting +future frames of a smooth fluid simulation environment. + +
+
+
+
+
+ + ☆ Joint one-sided synthetic unpaired image translation and segmentation + for colorectal cancer prevention + + +
+ Deep learning has shown excellent performance in analysing medical images.
+However, datasets are difficult to obtain due to privacy issues,
+standardization problems, and lack of annotations. We address these problems by
+producing realistic synthetic images using a combination of 3D technologies and
+generative adversarial networks. We propose CUT-seg, a joint training approach
+in which a segmentation model and a generative model are jointly trained to
+produce realistic images while learning to segment polyps. We take advantage of
+recent one-sided translation models because they use significantly less memory,
+allowing us to add a segmentation model to the training loop. CUT-seg performs
+better, is computationally less expensive, and requires fewer real images than
+other memory-intensive image translation approaches that require two-stage
+training. Promising results are achieved on five real polyp segmentation
+datasets using only one real image and zero real annotations. As part of this
+study we release Synth-Colon, an entirely synthetic dataset that includes 20000
+realistic colon images and additional details about depth and 3D geometry:
+https://enric1994.github.io/synth-colon
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2202.08680 +
+
+
+
+
+ + ☆ UP-DP: Unsupervised Prompt Learning for Data Pre-Selection with + Vision-Language Models + + +
+ In this study, we investigate the task of data pre-selection, which aims to
+select instances for labeling from an unlabeled dataset through a single pass,
+thereby optimizing performance for undefined downstream tasks with a limited
+annotation budget. Previous approaches to data pre-selection relied solely on
+visual features extracted from foundation models, such as CLIP and BLIP-2, but
+largely ignored the power of text features. In this work, we argue that, with
+proper design, the joint feature space of both vision and text can yield a
+better representation for data pre-selection. To this end, we introduce UP-DP,
+a simple yet effective unsupervised prompt learning approach that adapts
+vision-language models, like BLIP-2, for data pre-selection. Specifically, with
+the BLIP-2 parameters frozen, we train text prompts to extract the joint
+features with improved representation, ensuring a diverse cluster structure
+that covers the entire dataset. We extensively compare our method with the
+state-of-the-art using seven benchmark datasets in different settings,
+achieving up to a performance gain of 20%. Interestingly, the prompts learned
+from one dataset demonstrate significant generalizability and can be applied
+directly to enhance the feature extraction of BLIP-2 from other datasets. To
+the best of our knowledge, UP-DP is the first work to incorporate unsupervised
+prompt learning in a vision-language model for data pre-selection.
+
+
+
+
+
+ + ☆ Heuristic Hyperparameter Choice for Image Anomaly Detection + + +
+ Anomaly detection (AD) in images is a fundamental computer vision problem, in
+which deep neural networks are used to identify images deviating significantly
+from normality. Deep features extracted from pretrained models have proved to
+be essential for AD based on multivariate Gaussian distribution analysis.
+However, since models are usually pretrained on a large dataset for
+classification tasks such as ImageNet, they might produce many redundant
+features for AD, which increases computational cost and degrades performance.
+We aim to reduce the dimensionality of these features with Negated Principal
+Component Analysis (NPCA). We therefore propose heuristics for choosing the
+hyperparameters of the NPCA algorithm so as to obtain as few feature components
+as possible while ensuring good performance.
+
+
+
+
+
+ + ☆ Towards General Game Representations: Decomposing Games Pixels into + Content and Style + + +
+ On-screen game footage contains rich contextual information that players +process when playing and experiencing a game. Learning pixel representations of +games can benefit artificial intelligence across several downstream tasks +including game-playing agents, procedural content generation, and player +modelling. The generalizability of these methods, however, remains a challenge, +as learned representations should ideally be shared across games with similar +game mechanics. This could allow, for instance, game-playing agents trained on +one game to perform well in similar games with no re-training. This paper +explores how generalizable pre-trained computer vision encoders can be for such +tasks, by decomposing the latent space into content embeddings and style +embeddings. The goal is to minimize the domain gap between games of the same +genre when it comes to game content critical for downstream tasks, and ignore +differences in graphical style. We employ a pre-trained Vision Transformer +encoder and a decomposition technique based on game genres to obtain separate +content and style embeddings. Our findings show that the decomposed embeddings +achieve style invariance across multiple games while still maintaining strong +content extraction capabilities. We argue that the proposed decomposition of +content and style offers better generalization capacities across game +environments independently of the downstream task. + +
+
+
+
+
+ + ☆ Frequency-aware optical coherence tomography image super-resolution via + conditional generative adversarial neural network + + +
+ Optical coherence tomography (OCT) has stimulated a wide range of medical
+image-based diagnosis and treatment in fields such as cardiology and
+ophthalmology. Such applications can be further facilitated by deep
+learning-based super-resolution technology, which improves the capability of
+resolving morphological structures. However, existing deep learning-based
+methods only focus on the spatial distribution and disregard frequency fidelity
+in image reconstruction, leading to a frequency bias. To overcome this
+limitation, we propose a frequency-aware super-resolution framework that
+integrates three critical frequency-based modules (i.e., frequency
+transformation, frequency skip connection, and frequency alignment) and a
+frequency-based loss function into a conditional generative adversarial network
+(cGAN). We conducted a large-scale quantitative study on an existing coronary
+OCT dataset to demonstrate the superiority of our proposed framework over
+existing deep learning frameworks. In addition, we confirmed the
+generalizability of our framework by applying it to fish corneal images and rat
+retinal images, demonstrating its capability to super-resolve morphological
+details in eye imaging.
+
+
+ comment: 13 pages, 7 figures, submitted to Biomedical Optics Express special + issue +
+
+
+
+
+ + ☆ Diffusion Sampling with Momentum for Mitigating Divergence Artifacts + + +
+ Despite the remarkable success of diffusion models in image generation, slow +sampling remains a persistent issue. To accelerate the sampling process, prior +studies have reformulated diffusion sampling as an ODE/SDE and introduced +higher-order numerical methods. However, these methods often produce divergence +artifacts, especially with a low number of sampling steps, which limits the +achievable acceleration. In this paper, we investigate the potential causes of +these artifacts and suggest that the small stability regions of these methods +could be the principal cause. To address this issue, we propose two novel +techniques. The first technique involves the incorporation of Heavy Ball (HB) +momentum, a well-known technique for improving optimization, into existing +diffusion numerical methods to expand their stability regions. We also prove +that the resulting methods have first-order convergence. The second technique, +called Generalized Heavy Ball (GHVB), constructs a new high-order method that +offers a variable trade-off between accuracy and artifact suppression. +Experimental results show that our techniques are highly effective in reducing +artifacts and improving image quality, surpassing state-of-the-art diffusion +solvers on both pixel-based and latent-based diffusion models for low-step +sampling. Our research provides novel insights into the design of numerical +methods for future diffusion work. + +
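A conceptual sketch of adding Heavy Ball (HB) momentum to a generic first-order diffusion ODE step (e.g. an Euler-style update). The paper's actual HB and GHVB formulations, coefficients, and convergence analysis are more involved; everything below is a simplified assumption.

```python
def hb_momentum_step(x, velocity, drift, step_size, beta=0.9):
    # drift: the model's estimated ODE derivative f(x, t) at this step.
    # Damping the update direction with a moving average enlarges the
    # stability region of the underlying numerical method.
    velocity = beta * velocity + (1.0 - beta) * drift
    x_next = x + step_size * velocity
    return x_next, velocity
```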
+
+ comment: Project page: https://github.com/sWizad/momentum-diffusion +
+
+
+
+
+ + ♻ ☆ Frequency Domain Adversarial Training for Robust Volumetric Medical + Segmentation MICCAI 2023 + + +
+ It is imperative to ensure the robustness of deep learning models in critical +applications such as healthcare. While recent advances in deep learning have +improved the performance of volumetric medical image segmentation models, these +models cannot be deployed for real-world applications immediately due to their +vulnerability to adversarial attacks. We present a 3D frequency domain +adversarial attack for volumetric medical image segmentation models and +demonstrate its advantages over conventional input or voxel domain attacks. +Using our proposed attack, we introduce a novel frequency domain adversarial +training approach for optimizing a robust model against voxel and frequency +domain attacks. Moreover, we propose a frequency consistency loss to regulate our +frequency domain adversarial training, which achieves a better tradeoff between +the model's performance on clean and adversarial samples. Code is publicly +available at https://github.com/asif-hanif/vafa.
+
+
+ comment: This paper has been accepted in MICCAI 2023 conference +
+
+
+
+
+ + ♻ ☆ Mitigating Calibration Bias Without Fixed Attribute Grouping for + Improved Fairness in Medical Imaging Analysis + + +
+ Trustworthy deployment of deep learning medical imaging models into +real-world clinical practice requires that they be calibrated. However, models +that are well calibrated overall can still be poorly calibrated for a +sub-population, potentially resulting in a clinician unwittingly making poor +decisions for this group based on the recommendations of the model. Although +methods have been shown to successfully mitigate biases across subgroups in +terms of model accuracy, this work focuses on the open problem of mitigating +calibration biases in the context of medical image analysis. Our method does +not require subgroup attributes during training, permitting the flexibility to +mitigate biases for different choices of sensitive attributes without +re-training. To this end, we propose a novel two-stage method, Cluster-Focal, which +first identifies poorly calibrated samples, clusters them into groups, and then +introduces a group-wise focal loss to mitigate calibration bias. We evaluate our +method on skin lesion classification with the public HAM10000 dataset, and on +predicting future lesional activity for multiple sclerosis (MS) patients. In +addition to considering traditional sensitive attributes (e.g. age, sex) with +demographic subgroups, we also consider biases among groups with different +image-derived attributes, such as lesion load, which are required in medical +image analysis. Our results demonstrate that our method effectively controls +calibration error in the worst-performing subgroups while preserving prediction +performance and outperforming recent baselines.
+
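A rough sketch of the two-stage idea described above, with several assumptions: samples are grouped here by clustering a simple confidence-minus-correctness gap with k-means, and each group gets its own focal-loss focusing parameter. The clustering features, number of groups, and gamma values are placeholders rather than the paper's exact Cluster-Focal recipe.

```python
import torch
import torch.nn.functional as F
from sklearn.cluster import KMeans

def assign_calibration_groups(logits, labels, n_groups=3):
    """Stage 1 (sketch): cluster samples by how miscalibrated they look."""
    probs = torch.softmax(logits, dim=1)
    conf, pred = probs.max(dim=1)
    # Per-sample gap between confidence and correctness as a crude
    # miscalibration signal; the paper's grouping features may differ.
    gap = (conf - (pred == labels).float()).abs().detach().cpu().numpy().reshape(-1, 1)
    return torch.as_tensor(KMeans(n_clusters=n_groups, n_init=10).fit_predict(gap))

def groupwise_focal_loss(logits, labels, groups, gammas=(1.0, 2.0, 3.0)):
    """Stage 2 (sketch): focal loss with a group-specific focusing parameter."""
    ce = F.cross_entropy(logits, labels, reduction="none")
    pt = torch.exp(-ce)                        # probability of the true class
    gamma = torch.tensor(gammas)[groups]       # stronger focus for worse groups
    return ((1.0 - pt) ** gamma * ce).mean()

# Toy usage on random logits.
logits, labels = torch.randn(32, 5), torch.randint(0, 5, (32,))
groups = assign_calibration_groups(logits, labels)
loss = groupwise_focal_loss(logits, labels, groups)
```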
+
+
+
+
+ + ♻ ☆ LA-Net: Landmark-Aware Learning for Reliable Facial Expression + Recognition under Label Noise ICCV 2023 + + +
+ Facial expression recognition (FER) remains a challenging task due to the +ambiguity of expressions. The derived noisy labels significantly harm the +performance in real-world scenarios. To address this issue, we present a new +FER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks +to mitigate the impact of label noise from two perspectives. Firstly, LA-Net +uses landmark information to suppress the uncertainty in expression space and +constructs the label distribution of each sample by neighborhood aggregation, +which in turn improves the quality of training supervision. Secondly, the model +incorporates landmark information into expression representations using the +devised expression-landmark contrastive loss. The enhanced expression feature +extractor can be less susceptible to label noise. Our method can be integrated +with any deep neural network for better training supervision without +introducing extra inference costs. We conduct extensive experiments on both +in-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net +achieves state-of-the-art performance. + +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Can point cloud networks learn statistical shape models of anatomies? MICCAI 2023 + + +
+ Statistical Shape Modeling (SSM) is a valuable tool for investigating and +quantifying anatomical variations within populations of anatomies. However, +traditional correspondence-based SSM generation methods have a prohibitive +inference process and require complete geometric proxies (e.g., high-resolution +binary volumes or surface meshes) as input shapes to construct the SSM. +Unordered 3D point cloud representations of shapes are more easily acquired +from various medical imaging practices (e.g., thresholded images and surface +scanning). Point cloud deep networks have recently achieved remarkable success +in learning permutation-invariant features for different point cloud tasks +(e.g., completion, semantic segmentation, classification). However, their +application to learning SSM from point clouds is to-date unexplored. In this +work, we demonstrate that existing point cloud encoder-decoder-based completion +networks can provide an untapped potential for SSM, capturing population-level +statistical representations of shapes while reducing the inference burden and +relaxing the input requirement. We discuss the limitations of these techniques +to the SSM application and suggest future improvements. Our work paves the way +for further exploration of point cloud deep learning for SSM, a promising +avenue for advancing shape analysis literature and broadening SSM to diverse +use cases. + +
+
+ comment: Accepted to MICCAI 2023. 13 pages, 5 figures, appendix +
+
+
+
+
+ + ♻ ☆ Fully Bayesian VIB-DeepSSM MICCAI 2023 + + +
+ Statistical shape modeling (SSM) enables population-based quantitative +analysis of anatomical shapes, informing clinical diagnosis. Deep learning +approaches predict correspondence-based SSM directly from unsegmented 3D images +but require calibrated uncertainty quantification, motivating Bayesian +formulations. Variational information bottleneck DeepSSM (VIB-DeepSSM) is an +effective, principled framework for predicting probabilistic shapes of anatomy +from images with aleatoric uncertainty quantification. However, VIB is only +half-Bayesian and lacks epistemic uncertainty inference. We derive a fully +Bayesian VIB formulation and demonstrate the efficacy of two scalable +implementation approaches: concrete dropout and batch ensemble. Additionally, +we introduce a novel combination of the two that further enhances uncertainty +calibration via multimodal marginalization. Experiments on synthetic shapes and +left atrium data demonstrate that the fully Bayesian VIB network predicts SSM +from images with improved uncertainty reasoning without sacrificing accuracy. + +
+
+ comment: Accepted to MICCAI 2023. 13 pages, 4 figures, appendix +
+
+
+
+
+ + ♻ ☆ MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model CVPR 2023 + + +
+ Multimodal semantic understanding often has to deal with uncertainty, which +means the obtained messages tend to refer to multiple targets. Such uncertainty +is problematic for our interpretation, including inter- and intra-modal +uncertainty. Little effort has been devoted to modeling this uncertainty, +particularly in pre-training on unlabeled datasets and fine-tuning on +task-specific downstream datasets. In this paper, we project the +representations of all modalities as probabilistic distributions via a +Probability Distribution Encoder (PDE) by utilizing sequence-level +interactions. Compared to the existing deterministic methods, such uncertainty +modeling can convey richer multimodal semantic information and more complex +relationships. Furthermore, we integrate uncertainty modeling with popular +pre-training frameworks and propose suitable pre-training tasks: +Distribution-based Vision-Language Contrastive learning (D-VLC), +Distribution-based Masked Language Modeling (D-MLM), and Distribution-based +Image-Text Matching (D-ITM). The fine-tuned models are applied to challenging +downstream tasks, including image-text retrieval, visual question answering, +visual reasoning, and visual entailment, and achieve state-of-the-art results.
+
+
+ comment: CVPR 2023 Main Track Long Paper +
+
+
+
+
+ + ♻ ☆ Domain Adaptation based Enhanced Detection for Autonomous Driving in + Foggy and Rainy Weather + + +
+ Typically, object detection methods for autonomous driving that rely on +supervised learning assume a consistent feature distribution +between the training and testing data; however, such an assumption may fail under +different weather conditions. Due to the domain gap, a detection model trained +under clear weather may not perform well in foggy and rainy conditions. +Overcoming detection bottlenecks in foggy and rainy weather is a real challenge +for autonomous vehicles deployed in the wild. To bridge the domain gap and +improve the performance of object detection in foggy and rainy weather, this +paper presents a novel framework for domain-adaptive object detection. The +adaptations at both the image-level and object-level are intended to minimize +the differences in image style and object appearance between domains. +Furthermore, in order to improve the model's performance on challenging +examples, we introduce a novel adversarial gradient reversal layer that +conducts adversarial mining on difficult instances in addition to domain +adaptation. Additionally, we suggest generating an auxiliary domain through +data augmentation to enforce a new domain-level metric regularization. +Experimental findings on the public V2V benchmark exhibit a substantial enhancement +in object detection specifically for foggy and rainy driving scenarios.
+
+
+ comment: only change the title of this paper +
+
+
+
+
+ + ♻ ☆ Collaborative Perception in Autonomous Driving: Methods, Datasets and + Challenges + + +
+ Collaborative perception is essential to address occlusion and sensor failure +issues in autonomous driving. In recent years, theoretical and experimental +investigations of novel approaches for collaborative perception have increased +tremendously. So far, however, few reviews have focused on systematic +collaboration modules and large-scale collaborative perception datasets. This +work reviews recent achievements in this field to bridge this gap and motivate +future research. We start with a brief overview of collaboration schemes. After +that, we systematically summarize the collaborative perception methods for +ideal scenarios and real-world issues. The former focuses on collaboration +modules and efficiency, while the latter is devoted to addressing the problems in +actual applications. Furthermore, we present large-scale public datasets and +summarize quantitative results on these benchmarks. Finally, we highlight gaps +and overlooked challenges between current academic research and real-world +applications.
+
+
+ comment: 18 pages, 6 figures. Accepted by IEEE Intelligent Transportation + Systems Magazine. URL: + https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving +
+
+
+
+
+ + ♻ ☆ GSMorph: Gradient Surgery for cine-MRI Cardiac Deformable Registration MICCAI 2023 + + +
+ Deep learning-based deformable registration methods have been widely +investigated in diverse medical applications. Learning-based deformable +registration relies on weighted objective functions trading off registration +accuracy and smoothness of the deformation field. Therefore, such methods inevitably +require tuning a hyperparameter for optimal registration performance. Tuning +the hyperparameters is highly computationally expensive and introduces +undesired dependencies on domain knowledge. In this study, we construct a +registration model based on the gradient surgery mechanism, named GSMorph, to +achieve a hyperparameter-free balance on multiple losses. In GSMorph, we +reformulate the optimization procedure by projecting the gradient of the similarity +loss orthogonally to the plane associated with the smoothness constraint, +rather than additionally introducing a hyperparameter to balance these two +competing terms. Furthermore, our method is model-agnostic and can be merged +into any deep registration network without introducing extra parameters or +slowing down inference. We compared our method with +state-of-the-art (SOTA) deformable registration approaches over two publicly +available cardiac MRI datasets. GSMorph proves superior to five SOTA +learning-based registration models and two conventional registration +techniques, SyN and Demons, on both registration accuracy and smoothness.
+
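A simplified, unofficial sketch of the gradient-surgery step described above: the similarity-loss gradient is projected onto the subspace orthogonal to the smoothness-loss gradient, so no trade-off hyperparameter is needed. The flattening over all parameters, the learning rate, and the rule for combining the two gradients are assumptions and may differ from the exact GSMorph procedure.

```python
import torch

def gradient_surgery_step(model, sim_loss, smooth_loss, lr=1e-4):
    """One descent step using a projected similarity gradient (sketch)."""
    params = [p for p in model.parameters() if p.requires_grad]
    g_sim = torch.autograd.grad(sim_loss, params, retain_graph=True)
    g_smooth = torch.autograd.grad(smooth_loss, params, retain_graph=True)

    flat_sim = torch.cat([g.reshape(-1) for g in g_sim])
    flat_smooth = torch.cat([g.reshape(-1) for g in g_smooth])
    # Remove the component of the similarity gradient that lies along the
    # smoothness gradient, leaving only the orthogonal part.
    coef = flat_sim.dot(flat_smooth) / (flat_smooth.dot(flat_smooth) + 1e-12)
    flat_proj = flat_sim - coef * flat_smooth

    with torch.no_grad():
        offset = 0
        for p in params:
            n = p.numel()
            # Descend along the projected similarity gradient plus the
            # smoothness gradient (one plausible combination rule).
            step = flat_proj[offset:offset + n] + flat_smooth[offset:offset + n]
            p -= lr * step.view_as(p)
            offset += n

# Toy usage with a stand-in "registration network" and two toy losses.
model = torch.nn.Linear(8, 8)
out = model(torch.randn(4, 8))
gradient_surgery_step(model, sim_loss=out.pow(2).mean(), smooth_loss=out.abs().mean())
```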
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ High-order Tensor Pooling with Attention for Action Recognition + + +
+ We aim to capture high-order statistics of feature vectors formed by a +neural network, and propose end-to-end second- and higher-order pooling to form +a tensor descriptor. Tensor descriptors require a robust similarity measure due +to low numbers of aggregated vectors and the burstiness phenomenon, where a +given feature appears more/less frequently than statistically expected. The +Heat Diffusion Process (HDP) on a graph Laplacian is closely related to the +Eigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix, +whose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN +play the same role, i.e., to boost or dampen the magnitude of the eigenspectrum +thus preventing the burstiness. We equip higher-order tensors with EPN which +acts as a spectral detector of higher-order occurrences to prevent burstiness. +We also prove that for a tensor of order r built from d dimensional feature +descriptors, such a detector gives the likelihood that at least one higher-order +occurrence is 'projected' into one of the binom(d,r) subspaces represented by the +tensor; thus forming a tensor power normalization metric endowed with +binom(d,r) such 'detectors'. For experimental contributions, we apply several +second- and higher-order pooling variants to action recognition, provide +previously unreported comparisons of such pooling variants, and show +state-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities.
+
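For the second-order case, the Eigenvalue Power Normalization mentioned above can be illustrated in a few lines: build the covariance/auto-correlation descriptor of the aggregated feature vectors and raise its eigenvalues to a power gamma < 1 to dampen bursty directions. The higher-order tensor version and the Heat Diffusion Process connection are not shown, and the gamma value is an assumption.

```python
import torch

def second_order_epn_pool(features, gamma=0.5, eps=1e-6):
    """Second-order pooling with Eigenvalue Power Normalization (EPN).

    features: (N, d) feature vectors aggregated from a network.
    Returns a (d, d) descriptor whose eigenspectrum is flattened by
    raising eigenvalues to the power gamma, which damps burstiness.
    """
    features = features - features.mean(dim=0, keepdim=True)
    cov = features.t() @ features / features.shape[0]    # (d, d), PSD
    eigvals, eigvecs = torch.linalg.eigh(cov)
    eigvals = torch.clamp(eigvals, min=eps) ** gamma     # spectral power normalisation
    return eigvecs @ torch.diag(eigvals) @ eigvecs.t()

descriptor = second_order_epn_pool(torch.randn(50, 128))
```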
+
+
+
+
+ + ♻ ☆ Unsupervised Deep Graph Matching Based on Cycle Consistency + + +
+ We contribute to the sparsely populated area of unsupervised deep graph +matching with application to keypoint matching in images. Contrary to the +standard \emph{supervised} approach, our method does not require ground truth +correspondences between keypoint pairs. Instead, it is self-supervised by +enforcing consistency of matchings between images of the same object category. +As the matching and the consistency loss are discrete, their derivatives cannot +be straightforwardly used for learning. We address this issue in a principled +way by building our method upon the recent results on black-box differentiation +of combinatorial solvers. This makes our method exceptionally flexible, as it +is compatible with arbitrary network architectures and combinatorial solvers. +Our experimental evaluation suggests that our technique sets a new +state-of-the-art for unsupervised graph matching. + +
+
+ comment: 12 pages, 5 figures, 3 papers +
+
+
+
+
+ + ♻ ☆ Periocular Biometrics: A Modality for Unconstrained Scenarios + + +
+ Periocular refers to the externally visible region of the face that surrounds +the eye socket. This feature-rich area can provide accurate identification in +unconstrained or uncooperative scenarios, where the iris or face modalities may +not offer sufficient biometric cues due to factors such as partial occlusion or +high subject-to-camera distance. The COVID-19 pandemic has further highlighted +its importance, as the ocular region remained the only visible facial area even +in controlled settings due to the widespread use of masks. This paper discusses +the state of the art in periocular biometrics, presenting an overall framework +encompassing its most significant research aspects, which include: (a) ocular +definition, acquisition, and detection; (b) identity recognition, including +combination with other modalities and use of various spectra; and (c) ocular +soft-biometric analysis. Finally, we conclude by addressing current challenges +and proposing future directions. + +
+
+ comment: Published at IEEE Computer journal +
+
+
+
+
+ + ♻ ☆ Scaling Open-Vocabulary Object Detection + + +
+ Open-vocabulary object detection has benefited greatly from pretrained +vision-language models, but is still limited by the amount of available +detection training data. While detection training data can be expanded by using +Web image-text pairs as weak supervision, this has not been done at scales +comparable to image-level pretraining. Here, we scale up detection data with +self-training, which uses an existing detector to generate pseudo-box +annotations on image-text pairs. Major challenges in scaling self-training are +the choice of label space, pseudo-annotation filtering, and training +efficiency. We present the OWLv2 model and OWL-ST self-training recipe, which +address these challenges. OWLv2 surpasses the performance of previous +state-of-the-art open-vocabulary detectors already at comparable training +scales (~10M examples). However, with OWL-ST, we can scale to over 1B examples, +yielding further large improvement: With an L/14 architecture, OWL-ST improves +AP on LVIS rare classes, for which the model has seen no human box annotations, +from 31.2% to 44.6% (43% relative improvement). OWL-ST unlocks Web-scale +training for open-world localization, similar to what has been seen for image +classification and language modelling. + +
+
+
+
+
+ + ♻ ☆ Implicit Multidimensional Projection of Local Subspaces + + +
+ We propose a visualization method to understand the effect of +multidimensional projection on local subspaces, using implicit function +differentiation. Here, we understand the local subspace as the multidimensional +local neighborhood of data points. Existing methods focus on the projection of +multidimensional data points, and the neighborhood information is ignored. Our +method is able to analyze the shape and directional information of the local +subspace to gain more insights into the global structure of the data through +the perception of local structures. Local subspaces are fitted by +multidimensional ellipses that are spanned by basis vectors. An accurate and +efficient vector transformation method is proposed based on analytical +differentiation of multidimensional projections formulated as implicit +functions. The results are visualized as glyphs and analyzed using a full set +of specifically-designed interactions supported in our efficient web-based +visualization tool. The usefulness of our method is demonstrated using various +multi- and high-dimensional benchmark datasets. Our implicit differentiation +vector transformation is evaluated through numerical comparisons; the overall +method is evaluated through exploration examples and use cases. + +
+
+
+
+
+ + ♻ ☆ Implicit Identity Representation Conditioned Memory Compensation Network + for Talking Head video Generation ICCV2023 + + +
+ Talking head video generation aims to animate a human face in a still image +with dynamic poses and expressions using motion information derived from a +target-driving video, while maintaining the person's identity in the source +image. However, dramatic and complex motions in the driving video cause +ambiguous generation, because the still source image cannot provide sufficient +appearance information for occluded regions or delicate expression variations, +which produces severe artifacts and significantly degrades the generation +quality. To tackle this problem, we propose to learn a global facial +representation space, and design a novel implicit identity representation +conditioned memory compensation network, coined as MCNet, for high-fidelity +talking head generation. Specifically, we devise a network module to learn a +unified spatial facial meta-memory bank from all training samples, which can +provide rich facial structure and appearance priors to compensate warped source +facial features for the generation. Furthermore, we propose an effective query +mechanism based on implicit identity representations learned from the discrete +keypoints of the source image. It can greatly facilitate the retrieval of more +correlated information from the memory bank for the compensation. Extensive +experiments demonstrate that MCNet can learn representative and complementary +facial memory, and can clearly outperform previous state-of-the-art talking +head generation methods on VoxCeleb1 and CelebV datasets. Please check our +project at https://github.com/harlanhong/ICCV2023-MCNET.
+
+
+ comment: Accepted by ICCV2023, update the reference and figures +
+
+
+
+
+ + ♻ ☆ Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis MICCAI 2023 + + +
+ Regular group convolutional neural networks (G-CNNs) have been shown to +increase model performance and improve equivariance to different geometrical +symmetries. This work addresses the problem of SE(3), i.e., roto-translation +equivariance, on volumetric data. Volumetric image data is prevalent in many +medical settings. Motivated by the recent work on separable group convolutions, +we devise a SE(3) group convolution kernel separated into a continuous SO(3) +(rotation) kernel and a spatial kernel. We approximate equivariance to the +continuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel +is parameterized via RBF interpolation on similarly uniform grids. We +demonstrate the advantages of our approach in volumetric medical image +analysis. Our SE(3) equivariant models consistently outperform CNNs and regular +discrete G-CNNs on challenging medical classification tasks and show +significantly improved generalization capabilities. Our approach achieves up to +a 16.5% gain in accuracy over regular CNNs. + +
+
+ comment: 10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated + version to camera ready version 1 +
+
+
+
+
+ + ♻ ☆ Detecting Images Generated by Deep Diffusion Models using their Local + Intrinsic Dimensionality + + +
+ Diffusion models have recently been applied successfully to the visual +synthesis of strikingly realistic-looking images. This raises strong concerns +about their potential for malicious purposes. In this paper, we propose using +the lightweight multi Local Intrinsic Dimensionality (multiLID), which was +originally developed in the context of detecting adversarial examples, for +the automatic detection of synthetic images and the identification of the +corresponding generator networks. In contrast to many existing detection +approaches, which often only work for GAN-generated images, the proposed method +provides close to perfect detection results in many realistic use cases. +Extensive experiments on known and newly created datasets demonstrate that the +proposed multiLID approach exhibits superiority in diffusion detection and +model identification. Since the empirical evaluations of recent publications on +the detection of generated images are often mainly focused on the +"LSUN-Bedroom" dataset, we further establish a comprehensive benchmark for the +detection of diffusion-generated images, including samples from several +diffusion models with different image sizes.
+
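The multiLID features build on the classical maximum-likelihood estimator of Local Intrinsic Dimensionality computed from k-nearest-neighbour distances; a minimal version of that estimator is sketched below. How multiLID aggregates such estimates across network layers and feeds them to a classifier is not reproduced, and the batch sizes and k are assumptions.

```python
import torch

def lid_mle(query, reference, k=20):
    """Maximum-likelihood LID estimate from k-NN distances.

    query: (B, d) points whose local intrinsic dimensionality we estimate;
    reference: (N, d) points used as the neighbour pool.
    LID(x) = -1 / mean_i log(r_i(x) / r_k(x)), where r_1..r_k are the
    distances from x to its k nearest neighbours.
    """
    dists = torch.cdist(query, reference)              # (B, N) pairwise distances
    knn, _ = dists.topk(k, dim=1, largest=False)       # k smallest, ascending
    knn = knn.clamp(min=1e-12)
    r_k = knn[:, -1:]                                   # distance to the k-th neighbour
    return -1.0 / torch.log(knn / r_k).mean(dim=1)

lid_scores = lid_mle(torch.randn(8, 64), torch.randn(256, 64))
```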
+
+
+
+
+ + ♻ ☆ UW-ProCCaps: UnderWater Progressive Colourisation with Capsules + + +
+ Underwater images are fundamental for studying and understanding the status +of marine life. We focus on reducing the memory space required for image +storage, since the storage consumed during the collection phase limits how long +that phase can last and creates the need for additional image collection +campaigns. We present a novel machine-learning model that reconstructs the +colours of underwater images from their luminescence channel, thus saving 2/3 +of the available storage space. Our model specialises in underwater colour +reconstruction and consists of an encoder-decoder architecture. The encoder is +composed of a convolutional encoder and a parallel specialised classifier +trained with webly-supervised data. The encoder and the decoder use layers of +capsules to capture the features of the entities in the image. The colour +reconstruction process recalls progressive and generative adversarial +training procedures. The progressive training lays the groundwork for a generative +adversarial routine focused on refining the colours, giving the image bright +and saturated colours that bring it back to life. We validate the model +both qualitatively and quantitatively on four benchmark datasets. This is the +first attempt at colour reconstruction in greyscale underwater images. +Extensive results on four benchmark datasets demonstrate that our solution +outperforms state-of-the-art (SOTA) solutions. We also demonstrate that the +generated colourisation enhances the quality of images compared to SOTA +enhancement models.
+
+
+
+
+
+ + ♻ ☆ Vicinity Vision Transformer + + +
+ Vision transformers have shown great success on numerous computer vision +tasks. However, their central component, softmax attention, prevents vision +transformers from scaling up to high-resolution images, because both its +computational complexity and memory footprint are quadratic. Although linear +attention was introduced in natural language processing (NLP) tasks to mitigate +a similar issue, directly applying existing linear attention to vision +transformers may not lead to satisfactory results. We investigate this problem +and find that computer vision tasks focus more on local information compared +with NLP tasks. Based on this observation, we present a Vicinity Attention that +introduces a locality bias to vision transformers with linear complexity. +Specifically, for each image patch, we adjust its attention weight based on its +2D Manhattan distance to its neighbouring patches. In this case, the +neighbouring patches will receive stronger attention than far-away patches. +Moreover, since our Vicinity Attention requires the token length to be much +larger than the feature dimension to show its efficiency advantages, we further +propose a new Vicinity Vision Transformer (VVT) structure to reduce the feature +dimension without degrading accuracy. We perform extensive experiments +on the CIFAR100, ImageNet1K, and ADE20K datasets to validate the effectiveness +of our method. Our method has a slower growth rate of GFlops than previous +transformer-based and convolution-based networks when the input resolution +increases. In particular, our approach achieves state-of-the-art image +classification accuracy with 50% fewer parameters than previous methods.
+
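A dense, unofficial illustration of the locality bias described above: attention scores are penalised by the 2D Manhattan distance between patch positions, so nearby patches receive stronger attention. The paper's Vicinity Attention realises this bias with linear complexity and a different parameterisation; the dense softmax form, the penalty strength `alpha`, and the shapes below are assumptions.

```python
import torch

def manhattan_bias(h, w):
    """(h*w, h*w) matrix of 2D Manhattan distances between patch positions."""
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    coords = torch.stack([ys.reshape(-1), xs.reshape(-1)], dim=1).float()
    return torch.cdist(coords, coords, p=1)

def vicinity_weighted_attention(q, k, v, h, w, alpha=0.1):
    """Dense attention whose weights decay with 2D Manhattan distance.

    q, k, v: (B, h*w, d) patch tokens. This O((hw)^2) version only
    illustrates the distance-based re-weighting, not the linear-complexity
    mechanism of the actual Vicinity Vision Transformer.
    """
    scores = q @ k.transpose(1, 2) / q.shape[-1] ** 0.5   # (B, hw, hw)
    scores = scores - alpha * manhattan_bias(h, w)        # locality bias
    return torch.softmax(scores, dim=-1) @ v

q, k, v = (torch.randn(2, 49, 32) for _ in range(3))
out = vicinity_weighted_attention(q, k, v, h=7, w=7)
```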
+
+ comment: code: https://github.com/OpenNLPLab/Vicinity-Vision-Transformer +
+
+
+
+
+ + ♻ ☆ Drone navigation and license plate detection for vehicle location in + indoor spaces + + +
+ Millions of vehicles are transported every year, tightly parked in vessels or +boats. To reduce the risks of associated safety issues like fires, knowing the +location of vehicles is essential, since different vehicles may need different +mitigation measures, e.g. electric cars. This work is aimed at creating a +solution based on a nano-drone that navigates across rows of parked vehicles +and detects their license plates. We do so via a wall-following algorithm, and +a CNN trained to detect license plates. All computations are done in real-time +on the drone, which just sends position and detected images that allow the +creation of a 2D map with the position of the plates. Our solution is capable +of reading all plates across eight test cases (with several rows of plates, +different drone speeds, or low light) by aggregation of measurements across +several drone journeys. + +
+
+ comment: Published at VIII International Workshop on Artificial Intelligence + and Pattern Recognition, IWAIPR 2023 +
+
+
+
+
+ + ♻ ☆ HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory + Prediction via Scene Encoding + + +
+ Encoding a driving scene into vector representations has been an essential +task for autonomous driving that can benefit downstream tasks, e.g., trajectory +prediction. The driving scene often involves heterogeneous elements, such as +different types of objects (agents, lanes, traffic signs), and the semantic +relations between objects are rich and diverse. Meanwhile, there also exists +relativity across elements, which means that the spatial relation is a relative +concept and needs to be encoded in an ego-centric manner instead of in a global +coordinate system. Based on these observations, we propose Heterogeneous +Driving Graph Transformer (HDGT), a backbone modelling the driving scene as a +heterogeneous graph with different types of nodes and edges. For heterogeneous +graph construction, we connect different types of nodes according to diverse +semantic relations. For spatial relation encoding, the coordinates of the node +as well as its in-edges are in the local node-centric coordinate system. For +the aggregation module in the graph neural network (GNN), we adopt the +transformer structure in a hierarchical way to fit the heterogeneous nature of +the inputs. Experimental results show that HDGT achieves state-of-the-art +performance for the task of trajectory prediction, on the INTERACTION Prediction +Challenge and the Waymo Open Motion Challenge.
+
+
+ comment: Accepted at IEEE TPAMI in 2023. Code url: + https://github.com/OpenDriveLab/HDGT +
+
+
+
+
+ + ♻ ☆ Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation + for autonomous vehicles + + +
+ Autonomous driving (AD) perception today relies heavily on deep learning-based +architectures requiring large scale annotated datasets with their +associated costs for curation and annotation. The 3D semantic data are useful +for core perception tasks such as obstacle detection and ego-vehicle +localization. We propose a new dataset, Navya 3D Segmentation (Navya3DSeg), +with a diverse label space corresponding to a large scale production grade +operational domain, including rural, urban, industrial sites and universities +from 13 countries. It contains 23 labeled sequences and 25 supplementary +sequences without labels, designed to explore self-supervised and +semi-supervised semantic segmentation benchmarks on point clouds. We also +propose a novel method for sequential dataset split generation based on +iterative multi-label stratification, which is demonstrated to achieve a +1.2% mIoU +improvement over the original split proposed by the SemanticKITTI dataset. A +complete benchmark of the semantic segmentation task was performed with +state-of-the-art methods. Finally, we demonstrate an Active Learning (AL) based dataset +distillation framework. We introduce a novel heuristic-free sampling method +called ego-pose distance based sampling in the context of AL. A detailed +presentation on the dataset is available at +https://www.youtube.com/watch?v=5m6ALIs-s20.
+
+
+ comment: Accepted version to IEEE RA-L. Version with supplementary materials +
+
+
+
+
+ + ♻ ☆ Positive-Augmented Contrastive Learning for Image and Video Captioning + Evaluation CVPR 2023 + + +
+ The CLIP model has been recently proven to be very effective for a variety of +cross-modal tasks, including the evaluation of captions generated from +vision-and-language architectures. In this paper, we propose a new recipe for a +contrastive-based evaluation metric for image captioning, namely +Positive-Augmented Contrastive learning Score (PAC-S), that in a novel way +unifies the learning of a contrastive visual-semantic space with the addition +of generated images and text on curated data. Experiments spanning several +datasets demonstrate that our new metric achieves the highest correlation with +human judgments on both images and videos, outperforming existing +reference-based metrics like CIDEr and SPICE and reference-free metrics like +CLIP-Score. Finally, we test the system-level correlation of the proposed +metric when considering popular image captioning approaches, and assess the +impact of employing different cross-modal features. Our source code and trained +models are publicly available at: https://github.com/aimagelab/pacscore. + +
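For context, reference-free scores in this family keep the `w * max(cos, 0)` form of CLIP-Score; the sketch below computes that quantity with the off-the-shelf CLIP backbone from Hugging Face Transformers. PAC-S instead uses an embedding space fine-tuned with positive-augmented contrastive learning (see the linked repository), so the model choice and scale factor here are illustrative assumptions only.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def clip_style_caption_score(image: Image.Image, caption: str, w: float = 2.5) -> float:
    """Reference-free caption score of the CLIP-Score family: w * max(cos, 0)."""
    inputs = processor(text=[caption], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        img = model.get_image_features(pixel_values=inputs["pixel_values"])
        txt = model.get_text_features(input_ids=inputs["input_ids"],
                                      attention_mask=inputs["attention_mask"])
    cos = torch.nn.functional.cosine_similarity(img, txt).item()
    return w * max(cos, 0.0)

# Toy usage on a blank image; with a real photo and caption this returns
# a scalar that correlates with caption quality.
score = clip_style_caption_score(Image.new("RGB", (224, 224)), "a photo of a cat")
```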
+
+ comment: CVPR 2023 (highlight paper) +
+
+
+
+
+ + ♻ ☆ Reading Radiology Imaging Like The Radiologist + + +
+ Automated radiology report generation aims to generate radiology reports that +contain rich, fine-grained descriptions of radiology imaging. Compared with +the natural images used in image captioning, medical images are very similar +to each other, with only minor differences in the occurrence of diseases. Given +the importance of these minor differences in the radiology report, it is +crucial to encourage the model to focus more on the subtle regions of disease +occurrence. Secondly, the problem of visual and textual data biases is serious. +Not only do normal cases make up the majority of the dataset, but sentences +describing areas with pathological changes also constitute only a small part of +the paragraph. Lastly, generating medical image reports involves the challenge +of long text generation, which requires more expertise and empirical training +in medical knowledge. As a result, the difficulty of generating such reports is +increased. To address these challenges, we propose a disease-oriented retrieval +framework that utilizes similar reports as prior knowledge references. We +design a factual consistency captioning generator to generate more accurate and +factually consistent disease descriptions. Our framework can find the most similar +reports for a given disease from the CXR database by retrieving a +disease-oriented mask consisting of the position and morphological +characteristics. By referencing the disease-oriented similar report and the +visual features, the factual consistency model can generate a more accurate +radiology report.
+
+
+ comment: There are data writing errors in the paper +
+
+
+
+
+ + ♻ ☆ Unsupervised 3D registration through optimization-guided cyclical + self-training MICCAI 2023 + + +
+ State-of-the-art deep learning-based registration methods employ three +different learning strategies: supervised learning, which requires costly +manual annotations, unsupervised learning, which heavily relies on hand-crafted +similarity metrics designed by domain experts, or learning from synthetic data, +which introduces a domain shift. To overcome the limitations of these +strategies, we propose a novel self-supervised learning paradigm for +unsupervised registration, relying on self-training. Our idea is based on two +key insights. Feature-based differentiable optimizers 1) perform reasonable +registration even from random features and 2) stabilize the training of the +preceding feature extraction network on noisy labels. Consequently, we propose +cyclical self-training, where pseudo labels are initialized as the displacement +fields inferred from random features and cyclically updated based on more and +more expressive features from the learning feature extractor, yielding a +self-reinforcement effect. We evaluate the method for abdomen and lung +registration, consistently surpassing metric-based supervision and +outperforming diverse state-of-the-art competitors. Source code is available at +https://github.com/multimodallearning/reg-cyclical-self-train. + +
+
+ comment: accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ Convolutional Neural Networks (CNNs) have made significant strides in medical +image analysis in recent years. However, the local nature of the convolution +operator may pose a limitation for capturing global and long-range interactions +in CNNs. Recently, Transformers have gained popularity in the computer vision +community and also in medical image segmentation due to their ability to process +global features effectively. The scalability issues of the self-attention mechanism +and the lack of CNN-like inductive bias may have limited their adoption. +Therefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages +of both Convolution and Self-attention Mechanisms, have gained importance. In +this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision +transformer (CNN-Transformer) for medical image segmentation. The proposed +Hybrid Decoder, based on MaxViT-block, is designed to harness the power of both +the convolution and self-attention mechanisms at each decoding stage with +nominal computational burden. The inclusion of multi-axis self-attention, +within each decoder stage, significantly enhances the discriminating capacity +between the object and background regions, and thereby helps in improving the +segmentation efficiency. In the Hybrid Decoder block, the fusion process +commences by integrating the upsampled lower level decoder features, obtained +through transpose convolution, with the skip-connection features derived from +the hybrid encoder. Subsequently, the fused features undergo refinement through +the utilization of a multi-axis attention mechanism. The proposed decoder block +is repeated multiple times to progressively segment the nuclei regions. +Experimental results on the MoNuSeg18 and MoNuSAC20 datasets demonstrate the +effectiveness of the proposed technique.
+
+
+
+
+
+ + ♻ ☆ RegFormer: An Efficient Projection-Aware Transformer Network for + Large-Scale Point Cloud Registration ICCV2023 + + +
+ Although point cloud registration has achieved remarkable advances in +object-level and indoor scenes, large-scale registration methods are rarely +explored. Challenges mainly arise from the huge number of points, complex +distribution, and outliers of outdoor LiDAR scans. In addition, most existing +registration works generally adopt a two-stage paradigm: they first find +correspondences by extracting discriminative local features, and then leverage +estimators (e.g., RANSAC) to filter outliers, which are highly dependent on +well-designed descriptors and post-processing choices. To address these +problems, we propose an end-to-end transformer network (RegFormer) for +large-scale point cloud alignment without any further post-processing. +Specifically, a projection-aware hierarchical transformer is proposed to +capture long-range dependencies and filter outliers by extracting point +features globally. Our transformer has linear complexity, which guarantees high +efficiency even for large-scale scenes. Furthermore, to effectively reduce +mismatches, a bijective association transformer is designed for regressing the +initial transformation. Extensive experiments on the KITTI and NuScenes datasets +demonstrate that our RegFormer achieves competitive performance in terms of +both accuracy and efficiency.
+
+
+ comment: Accepted by ICCV2023. Codes will be released at + https://github.com/IRMVLab/RegFormer +
+
+
+
+
+ + ♻ ☆ Positive unlabeled learning with tensor networks + + +
+ Positive unlabeled learning is a binary classification problem with positive +and unlabeled data. It is common in domains where negative labels are costly or +impossible to obtain, e.g., medicine and personalized advertising. Most +approaches to positive unlabeled learning apply to specific data types (e.g., +images, categorical data) and cannot generate new positive and negative +samples. This work introduces a feature-space distance-based tensor network +approach to the positive unlabeled learning problem. The presented method is +not domain-specific and significantly improves the state-of-the-art results on +the MNIST image dataset and 15 categorical/mixed datasets. The trained tensor network +model is also a generative model and enables the generation of new positive and +negative instances.
+
+
+ comment: 12 pages, 6 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Event Camera Data Pre-training ICCV 2023 + + +
+ This paper proposes a pre-trained neural network for handling event camera +data. Our model is a self-supervised learning framework, and uses paired event +camera data and natural RGB images for training. + Our method contains three modules connected in a sequence: i) a family of +event data augmentations, generating meaningful event images for +self-supervised training; ii) a conditional masking strategy to sample +informative event patches from event images, encouraging our model to capture +the spatial layout of a scene and accelerating training; iii) a contrastive +learning approach, enforcing the similarity of embeddings between matching +event images, and between paired event and RGB images. An embedding projection +loss is proposed to avoid the model collapse when enforcing the event image +embedding similarities. A probability distribution alignment loss is proposed +to encourage the event image to be consistent with its paired RGB image in the +feature space. + Transfer learning performance on downstream tasks shows the superiority of +our method over state-of-the-art methods. For example, we achieve top-1 +accuracy at 64.83% on the N-ImageNet dataset. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MotionBERT: A Unified Perspective on Learning Human Motion + Representations ICCV 2023 + + +
+ We present a unified perspective on tackling various human-centric video +tasks by learning human motion representations from large-scale and +heterogeneous data resources. Specifically, we propose a pretraining stage in +which a motion encoder is trained to recover the underlying 3D motion from +noisy partial 2D observations. The motion representations acquired in this way +incorporate geometric, kinematic, and physical knowledge about human motion, +which can be easily transferred to multiple downstream tasks. We implement the +motion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer) +neural network. It could capture long-range spatio-temporal relationships among +the skeletal joints comprehensively and adaptively, exemplified by the lowest +3D pose estimation error so far when trained from scratch. Furthermore, our +proposed framework achieves state-of-the-art performance on all three +downstream tasks by simply finetuning the pretrained motion encoder with a +simple regression head (1-2 layers), which demonstrates the versatility of the +learned motion representations. Code and models are available at +https://motionbert.github.io/ + +
+
+ comment: ICCV 2023 version +
+
+
+
+
+ + ♻ ☆ Estimation of control area in badminton doubles with pose information + from top and back view drone videos + + +
+ The application of visual tracking to the performance analysis of sports +players in dynamic competitions is vital for effective coaching. In doubles +matches, coordinated positioning is crucial for maintaining control of the +court and minimizing opponents' scoring opportunities. The analysis of such +teamwork plays a vital role in understanding the dynamics of the game. However, +previous studies have primarily focused on analyzing and assessing singles +players without considering occlusion in broadcast videos. These studies have +relied on discrete representations, which involve the analysis and +representation of specific actions (e.g., strokes) or events that occur during +the game while overlooking the meaningful spatial distribution. In this work, +we present the first annotated drone dataset from top and back views in +badminton doubles and propose a framework to estimate the control area +probability map, which can be used to evaluate teamwork performance. We present +an efficient framework of deep neural networks that enables the calculation of +full probability surfaces. This framework utilizes the embedding of a Gaussian +mixture map of players' positions and employs graph convolution on their poses. +In the experiment, we verify our approach by comparing various baselines and +discovering the correlations between the score and control area. Additionally, +we propose a practical application for assessing optimal positioning to provide +instructions during a game. Our approach offers both visual and quantitative +evaluations of players' movements, thereby providing valuable insights into +doubles teamwork. The dataset and related project code is available at +https://github.com/Ning-D/Drone_BD_ControlArea + +
+
+ comment: 15 pages, 10 figures, to appear in Multimedia Tools and Applications +
+
+
+
+
+ + ♻ ☆ Class Attention to Regions of Lesion for Imbalanced Medical Image + Recognition + + +
+ Automated medical image classification is the key component in intelligent +diagnosis systems. However, most medical image datasets contain plenty of +samples of common diseases and just a handful of rare ones, leading to major +class imbalances. Currently, it is an open problem in intelligent diagnosis to +effectively learn from imbalanced training data. In this paper, we propose a +simple yet effective framework, named \textbf{C}lass \textbf{A}ttention to +\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by +embedding attention into the training process of \textbf{C}onvolutional +\textbf{N}eural \textbf{N}etworks (CNNs). The proposed attention module helps +CNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn +their characteristics more effectively. In addition, this attention module +works only during the training phase and does not change the architecture of +the original network, so it can be directly combined with any existing CNN +architecture. The CARE framework needs bounding boxes to represent the lesion +regions of rare diseases. To alleviate the need for manual annotation, we +further developed variants of CARE by leveraging the traditional saliency +methods or a pretrained segmentation model for bounding box generation. Results +show that the CARE variants with automated bounding box generation are +comparable to the original CARE framework with \textit{manual} bounding box +annotations. A series of experiments on an imbalanced skin image dataset and a +pneumonia dataset indicates that our method can effectively help the network +focus on the lesion regions of rare diseases and remarkably improves the +classification performance of rare diseases. + +
+
+ comment: Accepted by Neurocomputing on July 2023. 37 pages +
+
+
+
+
+ + ♻ ☆ AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks ICCV 2023 + + +
+ To deliver the artistic expression of the target style, recent studies +exploit the attention mechanism owing to its ability to map the local patches +of the style image to the corresponding patches of the content image. However, +because of the low semantic correspondence between arbitrary content and +artworks, the attention module repeatedly abuses specific local patches from +the style image, resulting in disharmonious and evident repetitive artifacts. +To overcome this limitation and accomplish impeccable artistic style transfer, +we focus on enhancing the attention mechanism and capturing the rhythm of +patterns that organize the style. In this paper, we introduce a novel metric, +namely pattern repeatability, that quantifies the repetition of patterns in the +style image. Based on the pattern repeatability, we propose Aesthetic +Pattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot +of local and global style expressions. In addition, we propose a novel +self-supervisory task to encourage the attention mechanism to learn precise and +meaningful semantic correspondence. Lastly, we introduce the patch-wise style +loss to transfer the elaborate rhythm of local patterns. Through qualitative +and quantitative evaluations, we verify the reliability of the proposed pattern +repeatability that aligns with human perception, and demonstrate the +superiority of the proposed framework. + +
+
+ comment: Accepted by ICCV 2023. Code is available at this + https://github.com/Kibeom-Hong/AesPA-Net +
+
+
+
+
+ + ♻ ☆ MotionGPT: Human Motion as a Foreign Language + + +
+ Although pre-trained large language models continue to advance, building a +unified model for language and other multi-modal +data, such as motion, remains challenging and largely unexplored. Fortunately, +human motion displays a semantic coupling akin to human language, often +perceived as a form of body language. By fusing language data with large-scale +motion models, motion-language pre-training that can enhance the performance of +motion-related tasks becomes feasible. Driven by this insight, we propose +MotionGPT, a unified, versatile, and user-friendly motion-language model to +handle multiple motion-relevant tasks. Specifically, we employ discrete +vector quantization for human motion and convert 3D motion into motion tokens, +similar to the generation process of word tokens. Building upon this "motion +vocabulary", we perform language modeling on both motion and text in a unified +manner, treating human motion as a specific language. Moreover, inspired by +prompt learning, we pre-train MotionGPT with a mixture of motion-language data +and fine-tune it on prompt-based question-and-answer tasks. Extensive +experiments demonstrate that MotionGPT achieves state-of-the-art performance +on multiple motion tasks including text-driven motion generation, motion +captioning, motion prediction, and motion in-between.
+
+
+ comment: Project Page: https://github.com/OpenMotionLab/MotionGPT +
+
+
+
+
+ + ♻ ☆ DETReg: Unsupervised Pretraining with Region Priors for Object Detection + + +
+ Recent self-supervised pretraining methods for object detection largely focus +on pretraining the backbone of the object detector, neglecting key parts of +detection architecture. Instead, we introduce DETReg, a new self-supervised +method that pretrains the entire object detection network, including the object +localization and embedding components. During pretraining, DETReg predicts +object localizations to match the localizations from an unsupervised region +proposal generator and simultaneously aligns the corresponding feature +embeddings with embeddings from a self-supervised image encoder. We implement +DETReg using the DETR family of detectors and show that it improves over +competitive baselines when finetuned on COCO, PASCAL VOC, and Airbus Ship +benchmarks. In low-data regimes DETReg achieves improved performance, e.g., +when training with only 1% of the labels and in the few-shot learning settings. + +
+
+ comment: Project page: https://www.amirbar.net/detreg/ +
+
+
+
+
+ + ♻ ☆ Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via + Geometry-Guided Cross-View Transformer ICCV 2023 + + +
+ Image retrieval-based cross-view localization methods often lead to very +coarse camera pose estimation, due to the limited sampling density of the +database satellite images. In this paper, we propose a method to increase the +accuracy of a ground camera's location and orientation by estimating the +relative rotation and translation between the ground-level image and its +matched/retrieved satellite image. Our approach designs a geometry-guided +cross-view transformer that combines the benefits of conventional geometry and +learnable cross-view transformers to map the ground-view observations to an +overhead view. Given the synthesized overhead view and observed satellite +feature maps, we construct a neural pose optimizer with strong global +information embedding ability to estimate the relative rotation between them. +After aligning their rotations, we develop an uncertainty-guided spatial +correlation to generate a probability map of the vehicle locations, from which +the relative translation can be determined. Experimental results demonstrate +that our method significantly outperforms the state-of-the-art. Notably, the +likelihood of restricting the vehicle lateral pose to be within 1m of its +Ground Truth (GT) value on the cross-view KITTI dataset has been improved from +$35.54\%$ to $76.44\%$, and the likelihood of restricting the vehicle +orientation to be within $1^{\circ}$ of its GT value has been improved from +$19.64\%$ to $99.10\%$. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Semantically Adversarial Scenario Generation with Explicit Knowledge + Guidance + + +
+ Generating adversarial scenarios, which have the potential to fail autonomous +driving systems, provides an effective way to improve robustness. Extending +purely data-driven generative models, recent specialized models satisfy +additional controllable requirements such as embedding a traffic sign in a +driving scene by manipulating patterns implicitly in the neuron level. In this +paper, we introduce a method to incorporate domain knowledge explicitly in the +generation process to achieve the Semantically Adversarial Generation (SAG). To +be consistent with the composition of driving scenes, we first categorize the +knowledge into two types, the property of objects and the relationship among +objects. We then propose a tree-structured variational auto-encoder (T-VAE) to +learn hierarchical scene representation. By imposing semantic rules on the +properties of nodes and edges in the tree structure, explicit knowledge +integration enables controllable generation. We construct a synthetic example +to illustrate the controllability and explainability of our method in a +succinct setting. We further extend to realistic environments for autonomous +vehicles: our method efficiently identifies adversarial driving scenes against +different state-of-the-art 3D point cloud segmentation models and satisfies the +traffic rules specified as the explicit knowledge. + +
+
+ comment: 20 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Preprocessors Matter! Realistic Decision-Based Attacks on Machine + Learning Systems ICML 2023 + + +
+ Decision-based attacks construct adversarial examples against a machine +learning (ML) model by making only hard-label queries. These attacks have +mainly been applied directly to standalone neural networks. However, in +practice, ML models are just one component of a larger learning system. We find +that by adding a single preprocessor in front of a classifier, state-of-the-art +query-based attacks are up to 7$\times$ less effective at attacking a +prediction pipeline than at attacking the model alone. We explain this +discrepancy by the fact that most preprocessors introduce some notion of +invariance to the input space. Hence, attacks that are unaware of this +invariance inevitably waste a large number of queries to re-discover or +overcome it. We, therefore, develop techniques to (i) reverse-engineer the +preprocessor and then (ii) use this extracted information to attack the +end-to-end system. Our preprocessor extraction method requires only a few +hundred queries, and our preprocessor-aware attacks recover the same efficacy +as when attacking the model alone. The code can be found at +https://github.com/google-research/preprocessor-aware-black-box-attack.
+
+
+ comment: ICML 2023. Code can be found at + https://github.com/google-research/preprocessor-aware-black-box-attack +
+
+
+
+
+ + ♻ ☆ Open Challenges for Monocular Single-shot 6D Object Pose Estimation + + +
+ Object pose estimation is a non-trivial task that enables robotic +manipulation, bin picking, augmented reality, and scene understanding, to name +a few use cases. Monocular object pose estimation gained considerable momentum +with the rise of high-performing deep learning-based solutions and is +particularly interesting for the community since sensors are inexpensive and +inference is fast. Prior works establish the comprehensive state of the art for +diverse pose estimation problems. Their broad scopes make it difficult to +identify promising future directions. We narrow down the scope to the problem +of single-shot monocular 6D object pose estimation, which is commonly used in +robotics, and thus are able to identify such trends. By reviewing recent +publications in robotics and computer vision, the state of the art is +established at the union of both fields. Following that, we identify promising +research directions in order to help researchers to formulate relevant research +ideas and effectively advance the state of the art. Findings include that +methods are sophisticated enough to overcome the domain shift and that +occlusion handling is a fundamental challenge. We also highlight problems such +as novel object pose estimation and challenging materials handling as central +challenges to advance robotics. + +
+
+ comment: Revised version in the making +
+
+
+
+
+
+ ♻ ☆ Transfer Learning of Semantic Segmentation Methods for Identifying Buried
+ Archaeological Structures on LiDAR Data
+
+
+ When applying deep learning to remote sensing data in archaeological +research, a notable obstacle is the limited availability of suitable datasets +for training models. The application of transfer learning is frequently +employed to mitigate this drawback. However, there is still a need to explore +its effectiveness when applied across different archaeological datasets. This +paper compares the performance of various transfer learning configurations +using two semantic segmentation deep neural networks on two LiDAR datasets. The +experimental results indicate that transfer learning-based approaches in +archaeology can lead to performance improvements, although a systematic +enhancement has not yet been observed. We provide specific insights about the +validity of such techniques that can serve as a baseline for future works. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2023 (IGARSS 2023) @IEEE copyright +
+
+
+
+
+ + ♻ ☆ ADPS: Asymmetric Distillation Post-Segmentation Method for Image Anomaly + Detection + + +
+ Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the
+teacher-student paradigm to detect and segment anomalous regions by contrasting
+the unique features extracted by both networks. However, existing KDAD methods
+suffer from two main limitations: 1) the student network can effortlessly
+replicate the teacher network's representations, and 2) the features of the
+teacher network serve solely as a "reference standard" and are not fully
+leveraged. Toward this end, we depart from the established paradigm and instead
+propose an innovative approach called Asymmetric Distillation Post-Segmentation
+(ADPS). Our ADPS employs an asymmetric distillation paradigm that takes
+distinct forms of the same image as the input of the teacher-student networks,
+driving the student network to learn discriminating representations for
+anomalous regions.
+ Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a
+coarse anomaly localization mask that transfers the distilled knowledge
+acquired from the asymmetric paradigm to the teacher network. Equipped with
+WMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect
+and segment abnormal regions with fine structures and clear boundaries.
+Experimental results demonstrate that the proposed ADPS outperforms the
+state-of-the-art methods in detecting and segmenting anomalies. Surprisingly,
+ADPS significantly improves the Average Precision (AP) metric by 9% and 20% on
+the MVTec AD and KolektorSDD2 datasets, respectively.
+
+
+ comment: 11 pages, 9 figures
+
+
+
+
+
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ Investigating the Factual Knowledge Boundary of Large Language Models + with Retrieval Augmentation + + +
+ Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require
+a substantial amount of factual knowledge and often rely on external
+information for assistance. Recently, large language models (LLMs) (e.g.,
+ChatGPT) have demonstrated impressive prowess in solving a wide range of tasks
+with world knowledge, including knowledge-intensive tasks. However, it remains
+unclear how well LLMs are able to perceive their factual knowledge boundaries,
+particularly how they behave when incorporating retrieval augmentation. In this
+study, we present an initial analysis of the factual knowledge boundaries of
+LLMs and how retrieval augmentation affects LLMs on open-domain QA.
+Specifically, we focus on three primary research questions and analyze them by
+examining the QA performance, a priori judgement, and a posteriori judgement of
+LLMs. We show evidence that LLMs possess unwavering confidence in their
+capability to respond to questions and in the accuracy of their responses.
+Furthermore, retrieval augmentation proves to be an effective approach in
+enhancing LLMs' awareness of knowledge boundaries, thereby improving their
+judgemental abilities. Additionally, we also find that LLMs have a propensity
+to rely on the provided retrieval results when formulating answers, while the
+quality of these results significantly impacts their reliance. The code to
+reproduce this work is available at
+https://github.com/RUCAIBox/LLM-Knowledge-Boundary.
+
+
+
+
+
+ + ☆ Enhancing Job Recommendation through LLM-based Generative Adversarial + Networks + + +
+ Recommending suitable jobs to users is a critical task in online recruitment
+platforms, as it can enhance users' satisfaction and the platforms'
+profitability. However, existing job recommendation methods encounter
+challenges such as the low quality of users' resumes, which hampers their
+accuracy and practical effectiveness. With the rapid development of large
+language models (LLMs), utilizing the rich external knowledge encapsulated
+within them, as well as their powerful capabilities of text processing and
+reasoning, is a promising way to complete users' resumes for more accurate
+recommendations. However, directly leveraging LLMs to enhance recommendation
+results is not a one-size-fits-all solution, as LLMs may suffer from fabricated
+generation and few-shot problems, which degrade the quality of resume
+completion. In this paper, we propose a novel LLM-based approach for job
+recommendation. To alleviate the limitation of fabricated generation for LLMs,
+we extract accurate and valuable information beyond users' self-description,
+which helps the LLMs better profile users for resume completion. Specifically,
+we not only extract users' explicit properties (e.g., skills, interests) from
+their self-description but also infer users' implicit characteristics from
+their behaviors for more accurate and meaningful resume completion.
+Nevertheless, some users still suffer from few-shot problems, which arise due
+to scarce interaction records, leading to limited guidance for the models in
+generating high-quality resumes. To address this issue, we propose aligning
+unpaired low-quality resumes with high-quality generated resumes using
+Generative Adversarial Networks (GANs), which can refine the resume
+representations for better recommendation results. Extensive experiments on
+three large real-world recruitment datasets demonstrate the effectiveness of
+our proposed method.
+
+
+ comment: 13 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ A Constraint-based Recommender System via RDF Knowledge Graphs + + +
+ Knowledge graphs, represented in RDF, are able to model entities and their +relations by means of ontologies. The use of knowledge graphs for information +modeling has attracted interest in recent years. In recommender systems, items +and users can be mapped and integrated into the knowledge graph, which can +represent more links and relationships between users and items. +Constraint-based recommender systems are based on the idea of explicitly +exploiting deep recommendation knowledge through constraints to identify +relevant recommendations. When combined with knowledge graphs, a +constraint-based recommender system gains several benefits in terms of +constraint sets. In this paper, we investigate and propose the construction of +a constraint-based recommender system via RDF knowledge graphs applied to the +vehicle purchase/sale domain. The results of our experiments show that the +proposed approach is able to efficiently identify recommendations in accordance +with user preferences. + +
+
+
+
+
+ + ☆ A Personalized Recommender System Based-on Knowledge Graph Embeddings + + +
+ Knowledge graphs have proven to be effective for modeling entities and their
+relationships through the use of ontologies. The recent emergence of interest
+in using knowledge graphs as a form of information modeling has led to their
+increased adoption in recommender systems. By incorporating users and items
+into the knowledge graph, these systems can better capture the implicit
+connections between them and provide more accurate recommendations. In this
+paper, we investigate and propose the construction of a personalized
+recommender system via knowledge graph embeddings applied to the vehicle
+purchase/sale domain. The results of our experimentation demonstrate the
+efficacy of the proposed method in providing relevant recommendations that are
+consistent with individual users' preferences.
+
+
+
+
+
+ + ☆ Language-Enhanced Session-Based Recommendation with Decoupled + Contrastive Learning + + +
+ Session-based recommendation techniques aim to capture dynamic user behavior
+by analyzing past interactions. However, existing methods heavily rely on
+historical item ID sequences to extract user preferences, leading to challenges
+such as popularity bias and cold-start problems. In this paper, we propose a
+hybrid multimodal approach for session-based recommendation to address these
+challenges. Our approach combines different modalities, including textual
+content and item IDs, leveraging the complementary nature of these modalities
+using CatBoost. To learn universal item representations, we design a language
+representation-based item retrieval architecture that extracts features from
+the textual content utilizing pre-trained language models. Furthermore, we
+introduce a novel Decoupled Contrastive Learning method to enhance the
+effectiveness of the language representation. This technique decouples the
+sequence representation and item representation space, facilitating
+bidirectional alignment through dual-queue contrastive learning.
+Simultaneously, the momentum queue provides a large number of negative samples,
+further improving the effectiveness of contrastive learning. Our approach
+yielded competitive results, securing a 5th place ranking in KDD CUP 2023 Task
+1. We have released the source code and pre-trained models associated with this
+work.
+
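+ A compact PyTorch sketch of the momentum-queue contrastive step described
+above (illustrative shapes only, not the competition code): a sequence
+representation is contrasted against its paired item representation plus a
+large queue of negatives; the encoders producing these vectors are assumed and
+replaced by random tensors here.
+
+    import torch
+    import torch.nn.functional as F
+
+    torch.manual_seed(0)
+    dim, batch, queue_size, tau = 128, 32, 4096, 0.07
+
+    # Stand-ins for the outputs of the sequence and item encoders.
+    seq_repr  = F.normalize(torch.randn(batch, dim), dim=1)
+    item_repr = F.normalize(torch.randn(batch, dim), dim=1)
+    queue     = F.normalize(torch.randn(queue_size, dim), dim=1)  # momentum queue
+
+    # InfoNCE: positive = matching (sequence, item) pair, negatives = queue entries.
+    pos_logits = (seq_repr * item_repr).sum(dim=1, keepdim=True) / tau
+    neg_logits = seq_repr @ queue.t() / tau
+    logits = torch.cat([pos_logits, neg_logits], dim=1)
+    labels = torch.zeros(batch, dtype=torch.long)   # the positive sits at index 0
+    loss = F.cross_entropy(logits, labels)
+
+    # The symmetric item-to-sequence direction would be added for bidirectional alignment.
+    print(loss.item())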
+
+
+
+
+ + ☆ Improving Semantic Similarity Measure Within a Recommender System + Based-on RDF Graphs + + +
+ In today's era of information explosion, users increasingly rely on
+recommender systems for better advice, suggestions, or inspiration. Measuring
+the semantic relatedness or likeness between terms, words, or text data plays
+an important role in applications dealing with textual data, such as
+recommender systems. Over the past few years, many ontologies have been
+developed and used as a form of structured representation of knowledge bases
+for information systems. Several methods have been developed to measure
+semantic similarity from ontologies. In this paper, we propose and carry out an
+approach for improving semantic similarity calculations within a recommender
+system based on RDF graphs.
+
+
+
+
+
+ + ☆ Detecting deceptive reviews using text classification + + +
+ In recent years, online reviews have played a vital role in promoting
+products and services. Businesses may embed fake reviews in order to attract
+customers to purchase their products. They may even highlight the benefits of
+their own product or criticize a competitor's product. Marketers, advertisers,
+and other online business users have an incentive to create fake positive
+reviews for products they want to promote and fake negative reviews for
+products they wish to disparage. Writing deceptive reviews has therefore become
+commonplace for promoting one's own business or degrading a competitor's
+reputation, and identifying deceptive reviews is an active and ongoing research
+area. This paper proposes a machine learning approach to identify deceptive
+reviews. It investigates the performance of several experiments conducted on
+the Deceptive Opinion Spam Corpus dataset of restaurant reviews. We develop an
+n-gram model with a capped number of features to identify deceptive content,
+with a particular focus on fake reviews. Further, we conduct a benchmark study
+of two different feature extraction techniques combined with five machine
+learning classification algorithms. The experimental results show that the
+passive aggressive classifier outperforms the other algorithms, reaching the
+highest accuracy both for general text classification and for fake review
+detection. We also study data augmentation and implement different deep
+learning techniques.
+
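+ A minimal sketch of the kind of pipeline described above: word n-gram
+features with a capped vocabulary feeding a passive aggressive classifier. The
+inline reviews and labels are placeholders for the Deceptive Opinion Spam
+Corpus, so the numbers mean nothing; only the wiring is illustrated.
+
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.linear_model import PassiveAggressiveClassifier
+    from sklearn.pipeline import make_pipeline
+
+    reviews = [
+        "The room was clean and the staff were genuinely helpful.",
+        "Absolutely the best hotel ever, everything was perfect, book now!!!",
+        "Breakfast was average but the location made up for it.",
+        "Worst place imaginable, total scam, do not ever stay there!!!",
+    ]
+    labels = [0, 1, 0, 1]   # 0 = truthful, 1 = deceptive (illustrative only)
+
+    model = make_pipeline(
+        TfidfVectorizer(ngram_range=(1, 2), max_features=5000),  # uni/bi-grams, capped features
+        PassiveAggressiveClassifier(max_iter=1000, random_state=0),
+    )
+    model.fit(reviews, labels)
+    print(model.predict(["Unbelievable deal, five stars, everyone must book immediately!!!"]))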
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Jina Embeddings: A Novel Set of High-Performance Sentence Embedding + Models EMNLP 2023 + + +
+ Jina Embeddings constitutes a set of high-performance sentence embedding +models adept at translating various textual inputs into numerical +representations, thereby capturing the semantic essence of the text. While +these models are not exclusively designed for text generation, they excel in +applications such as dense retrieval and semantic textual similarity. This +paper details the development of Jina Embeddings, starting with the creation of +a high-quality pairwise and triplet dataset. It underlines the crucial role of +data cleaning in dataset preparation, gives in-depth insights into the model +training process, and concludes with a comprehensive performance evaluation +using the Massive Textual Embedding Benchmark (MTEB). + +
+
+ comment: 9 pages, 2 page appendix, EMNLP 2023 Industrial Track +
+
+
+
+
+ + ☆ RCVaR: an Economic Approach to Estimate Cyberattacks Costs using Data + from Industry Reports + + +
+ Digitization increases business opportunities and the risk of companies being
+victims of devastating cyberattacks. Therefore, managing risk exposure and
+cybersecurity strategies is essential for digitized companies that want to
+survive in competitive markets. However, understanding company-specific risks
+and quantifying their associated costs is not trivial. Current approaches fail
+to provide individualized and quantitative monetary estimations of
+cybersecurity impacts. Due to limited resources and technical expertise, SMEs
+and even large companies are affected and struggle to quantify their
+cyberattack exposure. Therefore, novel approaches must be put in place to
+support the understanding of the financial loss due to cyberattacks. This
+article introduces the Real Cyber Value at Risk (RCVaR), an economic approach
+for estimating cybersecurity costs using real-world information from public
+cybersecurity reports. RCVaR identifies the most significant cyber risk factors
+from various sources and combines their quantitative results to estimate
+specific cyberattack costs for companies. Furthermore, RCVaR extends current
+methods to achieve cost and risk estimations based on historical real-world
+data instead of only probability-based simulations. The evaluation of the
+approach on unseen data shows the accuracy and efficiency of the RCVaR in
+predicting and managing cyber risks. Thus, it shows that the RCVaR is a
+valuable addition to cybersecurity planning and risk management processes.
+
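+ A toy numerical sketch of the value-at-risk idea behind RCVaR (the percentile
+mechanics only; the paper's combination of report-derived risk factors is not
+reproduced here, and the loss distribution below is made up):
+
+    import numpy as np
+
+    rng = np.random.default_rng(1)
+    # Placeholder for per-incident losses aggregated from industry reports.
+    historical_losses = rng.lognormal(mean=11.0, sigma=1.2, size=5000)
+
+    confidence = 0.95
+    var_95 = np.percentile(historical_losses, 100 * confidence)      # Value at Risk
+    cvar_95 = historical_losses[historical_losses >= var_95].mean()  # mean loss beyond VaR
+
+    print(f"95% VaR  = {var_95:,.0f}")
+    print(f"95% CVaR = {cvar_95:,.0f}")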
+
+
+
+
+ + ♻ ☆ Criterion-based Heterogeneous Collaborative Filtering for Multi-behavior + Implicit Recommendation KDD + + +
+ Recent years have witnessed the explosive growth of interaction behaviors in +multimedia information systems, where multi-behavior recommender systems have +received increasing attention by leveraging data from various auxiliary +behaviors such as tip and collect. Among various multi-behavior recommendation +methods, non-sampling methods have shown superiority over negative sampling +methods. However, two observations are usually ignored in existing +state-of-the-art non-sampling methods based on binary regression: (1) users +have different preference strengths for different items, so they cannot be +measured simply by binary implicit data; (2) the dependency across multiple +behaviors varies for different users and items. To tackle the above issue, we +propose a novel non-sampling learning framework named Criterion-guided +Heterogeneous Collaborative Filtering (CHCF). CHCF introduces both upper and +lower thresholds to indicate selection criteria, which will guide user +preference learning. Besides, CHCF integrates criterion learning and user +preference learning into a unified framework, which can be trained jointly for +the interaction prediction of the target behavior. We further theoretically +demonstrate that the optimization of Collaborative Metric Learning can be +approximately achieved by the CHCF learning framework in a non-sampling form +effectively. Extensive experiments on three real-world datasets show the +effectiveness of CHCF in heterogeneous scenarios. + +
+
+ comment: Accepted by ACM Transactions on Knowledge Discovery from Data (TKDD) +
+
+
+
+
+ + ♻ ☆ ABNIRML: Analyzing the Behavior of Neural IR Models ACL + + +
+ Pretrained contextualized language models such as BERT and T5 have +established a new state-of-the-art for ad-hoc search. However, it is not yet +well-understood why these methods are so effective, what makes some variants +more effective than others, and what pitfalls they may have. We present a new +comprehensive framework for Analyzing the Behavior of Neural IR ModeLs +(ABNIRML), which includes new types of diagnostic probes that allow us to test +several characteristics -- such as writing styles, factuality, sensitivity to +paraphrasing and word order -- that are not addressed by previous techniques. +To demonstrate the value of the framework, we conduct an extensive empirical +study that yields insights into the factors that contribute to the neural +model's gains, and identify potential unintended biases the models exhibit. +Some of our results confirm conventional wisdom, like that recent neural +ranking models rely less on exact term overlap with the query, and instead +leverage richer linguistic information, evidenced by their higher sensitivity +to word and sentence order. Other results are more surprising, such as that +some models (e.g., T5 and ColBERT) are biased towards factually correct (rather +than simply relevant) texts. Further, some characteristics vary even for the +same base language model, and other characteristics can appear due to random +variations during model training. + +
+
+ comment: TACL version +
+
+
+
+
+ + ♻ ☆ ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF + Synthesis + + +
+ We use prompt engineering to guide ChatGPT in the automation of text mining +of metal-organic frameworks (MOFs) synthesis conditions from diverse formats +and styles of the scientific literature. This effectively mitigates ChatGPT's +tendency to hallucinate information -- an issue that previously made the use of +Large Language Models (LLMs) in scientific fields challenging. Our approach +involves the development of a workflow implementing three different processes +for text mining, programmed by ChatGPT itself. All of them enable parsing, +searching, filtering, classification, summarization, and data unification with +different tradeoffs between labor, speed, and accuracy. We deploy this system +to extract 26,257 distinct synthesis parameters pertaining to approximately 800 +MOFs sourced from peer-reviewed research articles. This process incorporates +our ChemPrompt Engineering strategy to instruct ChatGPT in text mining, +resulting in impressive precision, recall, and F1 scores of 90-99%. +Furthermore, with the dataset built by text mining, we constructed a +machine-learning model with over 86% accuracy in predicting MOF experimental +crystallization outcomes and preliminarily identifying important factors in MOF +crystallization. We also developed a reliable data-grounded MOF chatbot to +answer questions on chemical reactions and synthesis procedures. Given that the +process of using ChatGPT reliably mines and tabulates diverse MOF synthesis +information in a unified format, while using only narrative language requiring +no coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be +very useful across various other chemistry sub-disciplines. + +
+
+ comment: Published on Journal of the American Chemical Society (2023); 102 + pages (18-page manuscript, 84 pages of supporting information) +
+
+
+
+
+
+
+
+ + Machine Learning 170 + +
+
+
+ + ☆ Data-driven criteria for quantum correlations + + +
+ We build a machine learning model to detect correlations in a three-qubit +system using a neural network trained in an unsupervised manner on randomly +generated states. The network is forced to recognize separable states, and +correlated states are detected as anomalies. Quite surprisingly, we find that +the proposed detector performs much better at distinguishing a weaker form of +quantum correlations, namely, the quantum discord, than entanglement. In fact, +it has a tendency to grossly overestimate the set of entangled states even at +the optimal threshold for entanglement detection, while it underestimates the +set of discordant states to a much lesser extent. In order to illustrate the +nature of states classified as quantum-correlated, we construct a diagram +containing various types of states -- entangled, as well as separable, both +discordant and non-discordant. We find that the near-zero value of the +recognition loss reproduces the shape of the non-discordant separable states +with high accuracy, especially considering the non-trivial shape of this set on +the diagram. The network architecture is designed carefully: it preserves +separability, and its output is equivariant with respect to qubit permutations. +We show that the choice of architecture is important to get the highest +detection accuracy, much better than for a baseline model that just utilizes a +partial trace operation. + +
+
+ comment: 7 pages, 3 figures, 3 tables, and extra 5 pages of supplementary + materials +
+
+
+
+
+ + ☆ PAPR: Proximity Attention Point Rendering + + +
+ Learning accurate and parsimonious point cloud representations of scene +surfaces from scratch remains a challenge in 3D representation learning. +Existing point-based methods often suffer from the vanishing gradient problem +or require a large number of points to accurately model scene geometry and +texture. To address these limitations, we propose Proximity Attention Point +Rendering (PAPR), a novel method that consists of a point-based scene +representation and a differentiable renderer. Our scene representation uses a +point cloud where each point is characterized by its spatial position, +foreground score, and view-independent feature vector. The renderer selects the +relevant points for each ray and produces accurate colours using their +associated features. PAPR effectively learns point cloud positions to represent +the correct scene geometry, even when the initialization drastically differs +from the target geometry. Notably, our method captures fine texture details +while using only a parsimonious set of points. We also demonstrate four +practical applications of our method: geometry editing, object manipulation, +texture transfer, and exposure control. More results and code are available on +our project website at https://zvict.github.io/papr/. + +
+
+
+
+
+ + ☆ Representation Learning in Anomaly Detection: Successes, Limits and a + Grand Challenge CVPR'23 + + +
+ In this perspective paper, we argue that the dominant paradigm in anomaly
+detection cannot scale indefinitely and will eventually hit fundamental limits.
+This is due to a no-free-lunch principle for anomaly detection. These
+limitations can be overcome when there are strong task priors, as is the case
+for many industrial tasks. When such priors do not exist, the anomaly detection
+task is much harder. We pose two such tasks as grand challenges for anomaly
+detection: i) scientific discovery by anomaly detection and ii) a "mini-grand"
+challenge of detecting the most anomalous image in the ImageNet dataset. We
+believe new anomaly detection tools and ideas would need to be developed to
+overcome these challenges.
+
+
+ comment: Keynote talk at the Visual Anomaly and Novelty Detection Workshop, + CVPR'23 +
+
+
+
+
+ + ☆ GLSFormer : Gated - Long, Short Sequence Transformer for Step + Recognition in Surgical Videos MICCAI 2023 + + +
+ Automated surgical step recognition is an important task that can
+significantly improve patient safety and decision-making during surgeries.
+Existing state-of-the-art methods for surgical step recognition either rely on
+separate, multi-stage modeling of spatial and temporal information or operate
+on short-range temporal resolution when learned jointly. However, the benefits
+of jointly modeling spatio-temporal features and long-range information are not
+taken into account. In this paper, we propose a vision transformer-based
+approach to jointly learn spatio-temporal features directly from sequences of
+frame-level patches. Our method incorporates a gated-temporal attention
+mechanism that intelligently combines short-term and long-term spatio-temporal
+feature representations. We extensively evaluate our approach on two cataract
+surgery video datasets, namely Cataract-101 and D99, and demonstrate superior
+performance compared to various state-of-the-art methods. These results
+validate the suitability of our proposed approach for automated surgical step
+recognition. Our code is released at:
+https://github.com/nisargshah1999/GLSFormer
+
+
+ comment: Accepted to MICCAI 2023 (Early Accept) +
+
+
+
+
+ + ☆ Brain2Music: Reconstructing Music from Human Brain Activity + + +
+ The process of reconstructing experiences from human brain activity offers a +unique lens into how the brain interprets and represents the world. In this +paper, we introduce a method for reconstructing music from brain activity, +captured using functional magnetic resonance imaging (fMRI). Our approach uses +either music retrieval or the MusicLM music generation model conditioned on +embeddings derived from fMRI data. The generated music resembles the musical +stimuli that human subjects experienced, with respect to semantic properties +like genre, instrumentation, and mood. We investigate the relationship between +different components of MusicLM and brain activity through a voxel-wise +encoding modeling analysis. Furthermore, we discuss which brain regions +represent information derived from purely textual descriptions of music +stimuli. We provide supplementary material including examples of the +reconstructed music at https://google-research.github.io/seanet/brain2music + +
+
+ comment: Preprint; 21 pages; supplementary material: + https://google-research.github.io/seanet/brain2music +
+
+
+
+
+ + ☆ AlignDet: Aligning Pre-training and Fine-tuning in Object Detection ICCV 2023 + + +
+ The paradigm of large-scale pre-training followed by downstream fine-tuning +has been widely employed in various object detection algorithms. In this paper, +we reveal discrepancies in data, model, and task between the pre-training and +fine-tuning procedure in existing practices, which implicitly limit the +detector's performance, generalization ability, and convergence speed. To this +end, we propose AlignDet, a unified pre-training framework that can be adapted +to various existing detectors to alleviate the discrepancies. AlignDet +decouples the pre-training process into two stages, i.e., image-domain and +box-domain pre-training. The image-domain pre-training optimizes the detection +backbone to capture holistic visual abstraction, and box-domain pre-training +learns instance-level semantics and task-aware concepts to initialize the parts +out of the backbone. By incorporating the self-supervised pre-trained +backbones, we can pre-train all modules for various detectors in an +unsupervised paradigm. As depicted in Figure 1, extensive experiments +demonstrate that AlignDet can achieve significant improvements across diverse +protocols, such as detection algorithm, model backbone, data setting, and +training schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by +2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs. + +
+
+ comment: Accepted by ICCV 2023. Code and Models are publicly available. + Project Page: https://liming-ai.github.io/AlignDet +
+
+
+
+
+ + ☆ Effectiveness and predictability of in-network storage cache for + scientific workflows + + +
+ Large scientific collaborations often have multiple scientists accessing the
+same set of files while doing different analyses, which creates repeated
+accesses to the large amounts of shared data located far away. These data
+accesses have long latency due to distance and occupy the limited bandwidth
+available over the wide-area network. To reduce the wide-area network traffic
+and the data access latency, regional data storage caches have been installed
+as a new networking service. To study the effectiveness of such a cache system
+in scientific applications, we examine the Southern California Petabyte Scale
+Cache for a high-energy physics experiment. By examining about 3 TB of
+operational logs, we show that this cache removed 67.6% of file requests from
+the wide-area network and reduced the traffic volume on the wide-area network
+by 12.3 TB (or 35.4%) on an average day. The reduction in the traffic volume
+(35.4%) is less than the reduction in file counts (67.6%) because the larger
+files are less likely to be reused. Due to this difference in data access
+patterns, the cache system has implemented a policy to avoid evicting smaller
+files when processing larger files. We also build a machine learning model to
+study the predictability of the cache behavior. Tests show that this model is
+able to accurately predict the cache accesses, cache misses, and network
+throughput, making the model useful for future studies on resource provisioning
+and planning.
+
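+ A small pandas sketch (hypothetical log schema) of how the two headline
+figures above can be computed from access logs: the hit rate by request count
+and the traffic reduction by bytes, which diverge when large files are reused
+less often.
+
+    import pandas as pd
+
+    # One row per file request seen by the regional cache (made-up values).
+    log = pd.DataFrame({
+        "bytes": [2e9, 2e9, 5e8, 5e8, 5e8, 8e10],
+        "hit":   [True, False, True, True, False, False],  # served from cache?
+    })
+
+    hit_rate_by_count = log["hit"].mean()                    # fraction of requests absorbed
+    traffic_saved     = log.loc[log["hit"], "bytes"].sum()   # bytes kept off the WAN
+    traffic_fraction  = traffic_saved / log["bytes"].sum()
+
+    print(f"requests served from cache: {hit_rate_by_count:.1%}")
+    print(f"WAN traffic avoided:        {traffic_fraction:.1%}")
+    # As in the study, the byte fraction is lower than the request fraction
+    # whenever the largest files are the least likely to be re-read.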
+
+
+
+
+ + ☆ Breadcrumbs to the Goal: Goal-Conditioned Exploration from + Human-in-the-Loop Feedback + + +
+ Exploration and reward specification are fundamental and intertwined +challenges for reinforcement learning. Solving sequential decision-making tasks +requiring expansive exploration requires either careful design of reward +functions or the use of novelty-seeking exploration bonuses. Human supervisors +can provide effective guidance in the loop to direct the exploration process, +but prior methods to leverage this guidance require constant synchronous +high-quality human feedback, which is expensive and impractical to obtain. In +this work, we present a technique called Human Guided Exploration (HuGE), which +uses low-quality feedback from non-expert users that may be sporadic, +asynchronous, and noisy. HuGE guides exploration for reinforcement learning not +only in simulation but also in the real world, all without meticulous reward +specification. The key concept involves bifurcating human feedback and policy +learning: human feedback steers exploration, while self-supervised learning +from the exploration data yields unbiased policies. This procedure can leverage +noisy, asynchronous human feedback to learn policies with no hand-crafted +reward design or exploration bonuses. HuGE is able to learn a variety of +challenging multi-stage robotic navigation and manipulation tasks in simulation +using crowdsourced feedback from non-expert users. Moreover, this paradigm can +be scaled to learning directly on real-world robots, using occasional, +asynchronous feedback from human supervisors. + +
+
+
+
+
+ + ☆ A Definition of Continual Reinforcement Learning + + +
+ In this paper we develop a foundation for continual reinforcement learning. + +
+
+
+
+
+ + ☆ On the Convergence of Bounded Agents + + +
+ When has an agent converged? Standard models of the reinforcement learning +problem give rise to a straightforward definition of convergence: An agent +converges when its behavior or performance in each environment state stops +changing. However, as we shift the focus of our learning problem from the +environment's state to the agent's state, the concept of an agent's convergence +becomes significantly less clear. In this paper, we propose two complementary +accounts of agent convergence in a framing of the reinforcement learning +problem that centers around bounded agents. The first view says that a bounded +agent has converged when the minimal number of states needed to describe the +agent's future behavior cannot decrease. The second view says that a bounded +agent has converged just when the agent's performance only changes if the +agent's internal state changes. We establish basic properties of these two +definitions, show that they accommodate typical views of convergence in +standard settings, and prove several facts about their nature and relationship. +We take these perspectives, definitions, and analysis to bring clarity to a +central idea of the field. + +
+
+
+
+
+ + ☆ Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot + Classification + + +
+ Recent work has shown that language models' (LMs) prompt-based learning +capabilities make them well suited for automating data labeling in domains +where manual annotation is expensive. The challenge is that while writing an +initial prompt is cheap, improving a prompt is costly -- practitioners often +require significant labeled data in order to evaluate the impact of prompt +modifications. Our work asks whether it is possible to improve prompt-based +learning without additional labeled data. We approach this problem by +attempting to modify the predictions of a prompt, rather than the prompt +itself. Our intuition is that accurate predictions should also be consistent: +samples which are similar under some feature representation should receive the +same prompt prediction. We propose Embroid, a method which computes multiple +representations of a dataset under different embedding functions, and uses the +consistency between the LM predictions for neighboring samples to identify +mispredictions. Embroid then uses these neighborhoods to create additional +predictions for each sample, and combines these predictions with a simple +latent variable graphical model in order to generate a final corrected +prediction. In addition to providing a theoretical analysis of Embroid, we +conduct a rigorous empirical evaluation across six different LMs and up to 95 +different tasks. We find that (1) Embroid substantially improves performance +over original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also +realizes improvements for more sophisticated prompting strategies (e.g., +chain-of-thought), and (3) can be specialized to domains like law through the +embedding functions. + +
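+ A stripped-down sketch of the neighborhood-consistency idea (one embedding
+space and a simple vote, instead of the paper's multiple embedding functions
+and latent variable graphical model): a prediction that strongly disagrees with
+its nearest neighbors under an embedding is treated as a likely misprediction.
+All arrays are random stand-ins.
+
+    import numpy as np
+    from sklearn.neighbors import NearestNeighbors
+
+    rng = np.random.default_rng(0)
+    embeddings = rng.normal(size=(500, 64))      # one embedding of the unlabeled samples
+    lm_preds   = rng.integers(0, 2, size=500)    # the LM's prompt-based predictions
+
+    k = 10
+    nn = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)
+    _, idx = nn.kneighbors(embeddings)           # idx[:, 0] is the point itself
+
+    neighbor_vote = lm_preds[idx[:, 1:]].mean(axis=1)   # fraction of neighbors predicted 1
+    smoothed = (neighbor_vote > 0.5).astype(int)
+
+    # Overwrite the LM prediction only when the neighborhood is strongly against it.
+    corrected = np.where(np.abs(neighbor_vote - lm_preds) > 0.7, smoothed, lm_preds)
+    print(int((corrected != lm_preds).sum()), "predictions flipped")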
+
+ comment: 38 pages, 22 figures, 8 tables +
+
+
+
+
+ + ☆ Cluster-aware Semi-supervised Learning: Relational Knowledge + Distillation Provably Learns Clustering + + +
+ Despite the empirical success and practical significance of (relational) +knowledge distillation that matches (the relations of) features between teacher +and student models, the corresponding theoretical interpretations remain +limited for various knowledge distillation paradigms. In this work, we take an +initial step toward a theoretical understanding of relational knowledge +distillation (RKD), with a focus on semi-supervised classification problems. We +start by casting RKD as spectral clustering on a population-induced graph +unveiled by a teacher model. Via a notion of clustering error that quantifies +the discrepancy between the predicted and ground truth clusterings, we +illustrate that RKD over the population provably leads to low clustering error. +Moreover, we provide a sample complexity bound for RKD with limited unlabeled +samples. For semi-supervised learning, we further demonstrate the label +efficiency of RKD through a general framework of cluster-aware semi-supervised +learning that assumes low clustering errors. Finally, by unifying data +augmentation consistency regularization into this cluster-aware framework, we +show that despite the common effect of learning accurate clusterings, RKD +facilitates a "global" perspective through spectral clustering, whereas +consistency regularization focuses on a "local" perspective via expansion. + +
+
+
+
+
+ + ☆ Amortized Variational Inference: When and Why? + + +
+ Amortized variational inference (A-VI) is a method for approximating the +intractable posterior distributions that arise in probabilistic models. The +defining feature of A-VI is that it learns a global inference function that +maps each observation to its local latent variable's approximate posterior. +This stands in contrast to the more classical factorized (or mean-field) +variational inference (F-VI), which directly learns the parameters of the +approximating distribution for each latent variable. In deep generative models, +A-VI is used as a computational trick to speed up inference for local latent +variables. In this paper, we study A-VI as a general alternative to F-VI for +approximate posterior inference. A-VI cannot produce an approximation with a +lower Kullback-Leibler divergence than F-VI's optimal solution, because the +amortized family is a subset of the factorized family. Thus a central +theoretical problem is to characterize when A-VI still attains F-VI's optimal +solution. We derive conditions on both the model and the inference function +under which A-VI can theoretically achieve F-VI's optimum. We show that for a +broad class of hierarchical models, including deep generative models, it is +possible to close the gap between A-VI and F-VI. Further, for an even broader +class of models, we establish when and how to expand the domain of the +inference function to make amortization a feasible strategy. Finally, we prove +that for certain models -- including hidden Markov models and Gaussian +processes -- A-VI cannot match F-VI's solution, no matter how expressive the +inference function is. We also study A-VI empirically [...] + +
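+ A minimal sketch of the distinction drawn above, for a model with one local
+latent variable per observation (shapes and networks are arbitrary choices):
+F-VI keeps free variational parameters for every data point, while A-VI learns
+a single inference network shared across data points.
+
+    import torch
+    import torch.nn as nn
+
+    n_data, x_dim, z_dim = 1000, 5, 2
+    x = torch.randn(n_data, x_dim)
+
+    # F-VI (factorized / mean-field): one set of variational parameters per observation.
+    fvi_mu      = nn.Parameter(torch.zeros(n_data, z_dim))
+    fvi_log_std = nn.Parameter(torch.zeros(n_data, z_dim))
+
+    # A-VI (amortized): one inference network maps each observation to its posterior parameters.
+    encoder = nn.Sequential(nn.Linear(x_dim, 32), nn.Tanh(), nn.Linear(32, 2 * z_dim))
+    avi_mu, avi_log_std = encoder(x).chunk(2, dim=-1)
+
+    # The amortized family is a subset of the factorized one: A-VI can at best
+    # match F-VI's optimum, and only if the inference function is expressive enough.
+    print(fvi_mu.shape, avi_mu.shape)   # both (n_data, z_dim)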
+
+
+
+
+ + ☆ Multi-objective point cloud autoencoders for explainable myocardial + infarction prediction + + +
+ Myocardial infarction (MI) is one of the most common causes of death in the +world. Image-based biomarkers commonly used in the clinic, such as ejection +fraction, fail to capture more complex patterns in the heart's 3D anatomy and +thus limit diagnostic accuracy. In this work, we present the multi-objective +point cloud autoencoder as a novel geometric deep learning approach for +explainable infarction prediction, based on multi-class 3D point cloud +representations of cardiac anatomy and function. Its architecture consists of +multiple task-specific branches connected by a low-dimensional latent space to +allow for effective multi-objective learning of both reconstruction and MI +prediction, while capturing pathology-specific 3D shape information in an +interpretable latent space. Furthermore, its hierarchical branch design with +point cloud-based deep learning operations enables efficient multi-scale +feature learning directly on high-resolution anatomy point clouds. In our +experiments on a large UK Biobank dataset, the multi-objective point cloud +autoencoder is able to accurately reconstruct multi-temporal 3D shapes with +Chamfer distances between predicted and input anatomies below the underlying +images' pixel resolution. Our method outperforms multiple machine learning and +deep learning benchmarks for the task of incident MI prediction by 19% in terms +of Area Under the Receiver Operating Characteristic curve. In addition, its +task-specific compact latent space exhibits easily separable control and MI +clusters with clinically plausible associations between subject encodings and +corresponding 3D shapes, thus demonstrating the explainability of the +prediction. + +
+
+
+
+
+ + ☆ Flow Map Learning for Unknown Dynamical Systems: Overview, + Implementation, and Benchmarks + + +
+ Flow map learning (FML), in conjunction with deep neural networks (DNNs), has
+shown promise for data-driven modeling of unknown dynamical systems. A
+remarkable feature of FML is that it is capable of producing accurate
+predictive models for partially observed systems, even when their exact
+mathematical models do not exist. In this paper, we present an overview of the
+FML framework, along with the important computational details for its
+successful implementation. We also present a set of well-defined benchmark
+problems for learning unknown dynamical systems. All the numerical details of
+these problems are presented, along with their FML results, to ensure that the
+problems are accessible for cross-examination and the results are reproducible.
+
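+ A toy sketch of the flow-map idea (not the paper's benchmark code): pairs of
+consecutive states from an observed trajectory are used to fit a map
+x_{k+1} = Phi(x_k), which is then iterated to predict forward. The "unknown"
+system here is a damped linear oscillator chosen purely for illustration.
+
+    import numpy as np
+    from sklearn.neural_network import MLPRegressor
+
+    # Generate trajectory data from the "unknown" system with forward Euler.
+    dt, steps = 0.05, 2000
+    A = np.array([[0.0, 1.0], [-1.0, -0.1]])
+    x = np.zeros((steps, 2))
+    x[0] = [1.0, 0.0]
+    for k in range(steps - 1):
+        x[k + 1] = x[k] + dt * (A @ x[k])
+
+    # Flow map learning: regress the next state on the current state.
+    model = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=2000, random_state=0)
+    model.fit(x[:-1], x[1:])
+
+    # Predict by iterating the learned flow map from the last observed state.
+    pred = [x[-1]]
+    for _ in range(100):
+        pred.append(model.predict(pred[-1][None])[0])
+    print(np.array(pred)[:3])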
+
+
+
+
+ + ☆ Neuron Sensitivity Guided Test Case Selection for Deep Learning Testing + + +
+ Deep Neural Networks (DNNs) have been widely deployed in software to address
+various tasks (e.g., autonomous driving, medical diagnosis). However, they
+could also produce incorrect behaviors that result in financial losses and even
+threaten human safety. To reveal the incorrect behaviors in DNNs and repair
+them, DNN developers often collect rich unlabeled datasets from the natural
+world and label them to test the DNN models. However, properly labeling a large
+number of unlabeled datasets is a highly expensive and time-consuming task.
+ To address the above-mentioned problem, we propose NSS, Neuron Sensitivity
+guided test case Selection, which can reduce the labeling time by selecting
+valuable test cases from unlabeled datasets. NSS leverages the internal neuron
+information induced by test cases to select valuable test cases, which have a
+high likelihood of causing the model to behave incorrectly. We evaluate NSS
+with four widely used datasets and four well-designed DNN models compared to
+SOTA baseline methods. The results show that NSS performs well in assessing the
+test cases' probability of fault triggering and model improvement capabilities.
+Specifically, compared with baseline approaches, NSS obtains a higher fault
+detection rate (e.g., when selecting 5% of test cases from the unlabeled
+dataset in the MNIST & LeNet1 experiment, NSS obtains an 81.8% fault detection
+rate, 20% higher than the baselines).
+
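+ One plausible reading of neuron-sensitivity scoring (an assumption for
+illustration, not the paper's exact definition): rank unlabeled inputs by how
+much the internal activations move under small random perturbations, and spend
+the labeling budget on the most sensitive ones. The model and data below are
+random stand-ins.
+
+    import torch
+    import torch.nn as nn
+
+    torch.manual_seed(0)
+    features = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU())
+    unlabeled = torch.randn(1000, 20)           # pool of unlabeled test candidates
+
+    def sensitivity(x, eps=1e-2, trials=8):
+        # Mean shift of the internal activations under small random input perturbations.
+        with torch.no_grad():
+            base = features(x)
+            shifts = [(features(x + eps * torch.randn_like(x)) - base).norm(dim=1)
+                      for _ in range(trials)]
+        return torch.stack(shifts).mean(dim=0)
+
+    budget = 50
+    scores = sensitivity(unlabeled)
+    selected = scores.topk(budget).indices      # candidates to send for labeling first
+    print(selected[:10])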
+
+
+
+
+ + ☆ Sharpness Minimization Algorithms Do Not Only Minimize Sharpness To + Achieve Better Generalization + + +
+ Despite extensive studies, the underlying reason as to why overparameterized +neural networks can generalize remains elusive. Existing theory shows that +common stochastic optimizers prefer flatter minimizers of the training loss, +and thus a natural potential explanation is that flatness implies +generalization. This work critically examines this explanation. Through +theoretical and empirical investigation, we identify the following three +scenarios for two-layer ReLU networks: (1) flatness provably implies +generalization; (2) there exist non-generalizing flattest models and sharpness +minimization algorithms fail to generalize, and (3) perhaps most surprisingly, +there exist non-generalizing flattest models, but sharpness minimization +algorithms still generalize. Our results suggest that the relationship between +sharpness and generalization subtly depends on the data distributions and the +model architectures and sharpness minimization algorithms do not only minimize +sharpness to achieve better generalization. This calls for the search for other +explanations for the generalization of over-parameterized neural networks. + +
+
+ comment: 34 pages, 11 figures
+
+
+
+
+ + ☆ Private Federated Learning with Autotuned Compression ICML 2023 + + +
+ We propose new techniques for reducing communication in private federated
+learning without the need for setting or tuning compression rates. Our
+on-the-fly methods automatically adjust the compression rate based on the error
+induced during training, while maintaining provable privacy guarantees through
+the use of secure aggregation and differential privacy. Our techniques are
+provably instance-optimal for mean estimation, meaning that they can adapt to
+the "hardness of the problem" with minimal interactivity. We demonstrate the
+effectiveness of our approach on real-world datasets by achieving favorable
+compression rates without the need for tuning.
+
+
+ comment: Accepted to ICML 2023 +
+
+
+
+
+ + ☆ DREAM: Domain-free Reverse Engineering Attributes of Black-box Model + + +
+ Deep learning models are usually black boxes when deployed on machine
+learning platforms. Prior works have shown that the attributes (e.g., the
+number of convolutional layers) of a target black-box neural network can be
+exposed through a sequence of queries. There is a crucial limitation: these
+works assume the dataset used for training the target model to be known
+beforehand and leverage this dataset for model attribute attack. However, it is
+difficult to access the training dataset of the target black-box model in
+reality. Therefore, whether the attributes of a target black-box model could
+still be revealed in this case is doubtful. In this paper, we investigate a new
+problem of Domain-agnostic Reverse Engineering the Attributes of a black-box
+target Model, called DREAM, without requiring the availability of the target
+model's training dataset, and put forward a general and principled framework by
+casting this problem as an out of distribution (OOD) generalization problem. In
+this way, we can learn a domain-agnostic model to inversely infer the
+attributes of a target black-box model with unknown training data. This makes
+our method one that can gracefully apply to an arbitrary domain for model
+attribute reverse engineering with strong generalization ability. Extensive
+experimental studies are conducted and the results validate the superiority of
+our proposed method over the baselines.
+
+
+
+
+
+ + ☆ Progressive distillation diffusion for raw music generation + + +
+ This paper aims to apply a new deep learning approach to the task of
+generating raw audio files. It is based on diffusion models, a recent type of
+deep generative model. This type of method has recently shown outstanding
+results in image generation and has received a lot of attention from the
+computer vision community. On the other hand, far less attention has been given
+to other types of applications, such as music generation in the waveform
+domain.
+ In this paper, a model for unconditional generation applied to music is
+implemented: progressive distillation diffusion with a 1D U-Net. Then, a
+comparison of different diffusion parameters and their effect on the final
+result is presented. One big advantage of the methods implemented in this work
+is that the model is able to handle progressive audio processing and
+generation, using a transformation from 1-channel 128 x 384 to 3-channel
+128 x 128 mel-spectrograms and looped generation. The empirical comparisons are
+conducted across different self-collected datasets.
+
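+ A small sketch of the channel transformation mentioned above, assuming it is
+a plain split of the time axis (the authors' exact mapping may differ): a
+1-channel 128 x 384 mel-spectrogram becomes a 3-channel 128 x 128 array and
+back, losslessly.
+
+    import numpy as np
+
+    mel = np.random.rand(1, 128, 384)            # 1-channel mel-spectrogram
+
+    # Split the 384 time frames into 3 contiguous chunks of 128, stacked as channels.
+    three_channel = mel.reshape(128, 3, 128).transpose(1, 0, 2)   # -> (3, 128, 128)
+
+    # Inverse: put the chunks back side by side to recover the original layout.
+    restored = three_channel.transpose(1, 0, 2).reshape(1, 128, 384)
+    assert np.allclose(mel, restored)
+    print(three_channel.shape, restored.shape)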
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Investigating minimizing the training set fill distance in machine + learning regression + + +
+ Many machine learning regression methods leverage large datasets for training
+predictive models. However, using large datasets may not be feasible due to
+computational limitations or high labelling costs. Therefore, sampling small
+training sets from large pools of unlabelled data points is essential to
+maximize model performance while maintaining computational efficiency. In this
+work, we study a sampling approach aimed at minimizing the fill distance of the
+selected set. We derive an upper bound for the maximum expected prediction
+error that linearly depends on the training set fill distance, conditional on
+the knowledge of the data features. For empirical validation, we perform
+experiments using two regression models on two datasets. We empirically show
+that selecting a training set by aiming to minimize the fill distance, thereby
+minimizing the bound, significantly reduces the maximum prediction error of
+various regression models, outperforming existing sampling approaches by a
+large margin.
+
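+ A short sketch of greedy farthest-point selection, one standard heuristic for
+approximately minimizing the fill distance of the chosen training set (the
+paper's exact selection rule may differ; the data below are random).
+
+    import numpy as np
+
+    def farthest_point_sampling(X, k, seed=0):
+        # Greedily pick k points so no pool point is ever far from the selection.
+        rng = np.random.default_rng(seed)
+        selected = [int(rng.integers(len(X)))]
+        dist = np.linalg.norm(X - X[selected[0]], axis=1)
+        for _ in range(k - 1):
+            nxt = int(dist.argmax())             # pool point farthest from the current set
+            selected.append(nxt)
+            dist = np.minimum(dist, np.linalg.norm(X - X[nxt], axis=1))
+        return np.array(selected), dist.max()    # fill distance of the selected set
+
+    X = np.random.default_rng(1).normal(size=(5000, 10))
+    idx, fill = farthest_point_sampling(X, k=100)
+
+    rand = np.random.default_rng(2).choice(len(X), size=100, replace=False)
+    rand_fill = np.linalg.norm(X[:, None, :] - X[rand][None, :, :], axis=2).min(axis=1).max()
+    print(f"fill distance: greedy {fill:.3f} vs random {rand_fill:.3f}")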
+
+
+
+
+ + ☆ MASR: Metadata Aware Speech Representation + + +
+ In recent years, speech representation learning has been framed primarily as
+a self-supervised learning (SSL) task, using the raw audio signal alone, while
+ignoring the side-information that is often available for a given speech
+recording. In this paper, we propose MASR, a Metadata Aware Speech
+Representation learning framework, which addresses the aforementioned
+limitations. MASR enables the inclusion of multiple external knowledge sources
+to enhance the utilization of meta-data information. The external knowledge
+sources are incorporated in the form of sample-level pair-wise similarity
+matrices that are useful in a hard-mining loss. A key advantage of the MASR
+framework is that it can be combined with any choice of SSL method. Using MASR
+representations, we perform evaluations on several downstream tasks such as
+language identification, speech recognition and other non-semantic tasks such
+as speaker and emotion recognition. In these experiments, we illustrate
+significant performance improvements for the MASR over other established
+benchmarks. We perform a detailed analysis on the language identification task
+to provide insights on how the proposed loss function enables the
+representations to separate closely related languages.
+
+
+
+
+
+ + ☆ PATROL: Privacy-Oriented Pruning for Collaborative Inference Against + Model Inversion Attacks + + +
+ Collaborative inference has been a promising solution to enable +resource-constrained edge devices to perform inference using state-of-the-art +deep neural networks (DNNs). In collaborative inference, the edge device first +feeds the input to a partial DNN locally and then uploads the intermediate +result to the cloud to complete the inference. However, recent research +indicates model inversion attacks (MIAs) can reconstruct input data from +intermediate results, posing serious privacy concerns for collaborative +inference. Existing perturbation and cryptography techniques are inefficient +and unreliable in defending against MIAs while performing accurate inference. +This paper provides a viable solution, named PATROL, which develops +privacy-oriented pruning to balance privacy, efficiency, and utility of +collaborative inference. PATROL takes advantage of the fact that later layers +in a DNN can extract more task-specific features. Given limited local resources +for collaborative inference, PATROL intends to deploy more layers at the edge +based on pruning techniques to enforce task-specific features for inference and +reduce task-irrelevant but sensitive features for privacy preservation. To +achieve privacy-oriented pruning, PATROL introduces two key components: +Lipschitz regularization and adversarial reconstruction training, which +increase the reconstruction errors by reducing the stability of MIAs and +enhance the target inference model by adversarial training, respectively. + +
+
+
+
+
+ + ☆ Globally Normalising the Transducer for Streaming Speech Recognition + + +
+ The Transducer (e.g. RNN-Transducer or Conformer-Transducer) generates an +output label sequence as it traverses the input sequence. It is straightforward +to use in streaming mode, where it generates partial hypotheses before the +complete input has been seen. This makes it popular in speech recognition. +However, in streaming mode the Transducer has a mathematical flaw which, simply +put, restricts the model's ability to change its mind. The fix is to replace +local normalisation (e.g. a softmax) with global normalisation, but then the +loss function becomes impossible to evaluate exactly. A recent paper proposes +to solve this by approximating the model, severely degrading performance. +Instead, this paper proposes to approximate the loss function, allowing global +normalisation to apply to a state-of-the-art streaming model. Global +normalisation reduces its word error rate by 9-11% relative, closing almost +half the gap between streaming and lookahead mode. + +
+
+ comment: 9 pages plus references and appendices +
+
+
+
+
+ + ☆ PASTA: Pretrained Action-State Transformer Agents + + +
+ Self-supervised learning has brought about a revolutionary paradigm shift in +various computing domains, including NLP, vision, and biology. Recent +approaches involve pre-training transformer models on vast amounts of unlabeled +data, serving as a starting point for efficiently solving downstream tasks. In +the realm of reinforcement learning, researchers have recently adapted these +approaches by developing models pre-trained on expert trajectories, enabling +them to address a wide range of tasks, from robotics to recommendation systems. +However, existing methods mostly rely on intricate pre-training objectives +tailored to specific downstream applications. This paper presents a +comprehensive investigation of models we refer to as Pretrained Action-State +Transformer Agents (PASTA). Our study uses a unified methodology and covers an +extensive set of general downstream tasks including behavioral cloning, offline +RL, sensor failure robustness, and dynamics change adaptation. Our goal is to +systematically compare various design choices and provide valuable insights to +practitioners for building robust models. Key highlights of our study include +tokenization at the action and state component level, using fundamental +pre-training objectives like next token prediction, training models across +diverse domains simultaneously, and using parameter efficient fine-tuning +(PEFT). The developed models in our study contain fewer than 10 million +parameters and the application of PEFT enables fine-tuning of fewer than 10,000 +parameters during downstream adaptation, allowing a broad community to use +these models and reproduce our experiments. We hope that this study will +encourage further research into the use of transformers with first-principles +design choices to represent RL trajectories and contribute to robust policy +learning. + +
+
+
+
+
+ + ☆ Inorganic synthesis-structure maps in zeolites with machine learning and + crystallographic distances + + +
+ Zeolites are inorganic materials known for their diversity of applications, +synthesis conditions, and resulting polymorphs. Although their synthesis is +controlled both by inorganic and organic synthesis conditions, computational +studies of zeolite synthesis have focused mostly on organic template design. In +this work, we use a strong distance metric between crystal structures and +machine learning (ML) to create inorganic synthesis maps in zeolites. Starting +with 253 known zeolites, we show how the continuous distances between +frameworks reproduce inorganic synthesis conditions from the literature without +using labels such as building units. An unsupervised learning analysis shows +that neighboring zeolites according to our metric often share similar inorganic +synthesis conditions, even in template-based routes. In combination with ML +classifiers, we find synthesis-structure relationships for 14 common inorganic +conditions in zeolites, namely Al, B, Be, Ca, Co, F, Ga, Ge, K, Mg, Na, P, Si, +and Zn. By explaining the model predictions, we demonstrate how +(dis)similarities towards known structures can be used as features for the +synthesis space. Finally, we show how these methods can be used to predict +inorganic synthesis conditions for unrealized frameworks in hypothetical +databases and interpret the outcomes by extracting local structural patterns +from zeolites. In combination with template design, this work can accelerate +the exploration of the space of synthesis conditions for zeolites. + +
+
+
+
+
+ + ☆ Modeling 3D cardiac contraction and relaxation with point cloud + deformation networks + + +
+ Global single-valued biomarkers of cardiac function typically used in +clinical practice, such as ejection fraction, provide limited insight on the +true 3D cardiac deformation process and hence, limit the understanding of both +healthy and pathological cardiac mechanics. In this work, we propose the Point +Cloud Deformation Network (PCD-Net) as a novel geometric deep learning approach +to model 3D cardiac contraction and relaxation between the extreme ends of the +cardiac cycle. It employs the recent advances in point cloud-based deep +learning into an encoder-decoder structure, in order to enable efficient +multi-scale feature learning directly on multi-class 3D point cloud +representations of the cardiac anatomy. We evaluate our approach on a large +dataset of over 10,000 cases from the UK Biobank study and find average Chamfer +distances between the predicted and ground truth anatomies below the pixel +resolution of the underlying image acquisition. Furthermore, we observe similar +clinical metrics between predicted and ground truth populations and show that +the PCD-Net can successfully capture subpopulation-specific differences between +normal subjects and myocardial infarction (MI) patients. We then demonstrate +that the learned 3D deformation patterns outperform multiple clinical +benchmarks by 13% and 7% in terms of area under the receiver operating +characteristic curve for the tasks of prevalent MI detection and incident MI +prediction and by 7% in terms of Harrell's concordance index for MI survival +analysis. + +
+
+
+
+
+ + ☆ Confidence intervals for performance estimates in 3D medical image + segmentation + + +
+ Medical segmentation models are evaluated empirically. As such an evaluation +is based on a limited set of example images, it is unavoidably noisy. Beyond a +mean performance measure, reporting confidence intervals is thus crucial. +However, this is rarely done in medical image segmentation. The width of the +confidence interval depends on the test set size and on the spread of the +performance measure (its standard deviation across the test set). For +classification, many test images are needed to avoid wide confidence intervals. +Segmentation, however, has not been studied in this regard, and it differs in the amount of +information brought by a given test image. In this paper, we study the typical +confidence intervals in medical image segmentation. We carry out experiments on 3D +image segmentation using the standard nnU-net framework, two datasets from the +Medical Decathlon challenge and two performance measures: the Dice accuracy and +the Hausdorff distance. We show that the parametric confidence intervals are +reasonable approximations of the bootstrap estimates for varying test set sizes +and spread of the performance metric. Importantly, we show that the test size +needed to achieve a given precision is often much lower than for classification +tasks. Typically, a 1% wide confidence interval requires about 100-200 test +samples when the spread is low (standard deviation around 3%). More difficult +segmentation tasks may lead to higher spreads and require over 1000 samples. + +
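+ The parametric-versus-bootstrap comparison can be sketched in a few lines; the Dice scores below are simulated with a roughly 3% spread and are not taken from the paper:

```python
import numpy as np

rng = np.random.default_rng(0)
dice = rng.normal(loc=0.85, scale=0.03, size=150)   # 150 simulated test cases

# Parametric 95% CI for the mean (normal approximation)
mean, sem = dice.mean(), dice.std(ddof=1) / np.sqrt(len(dice))
parametric_ci = (mean - 1.96 * sem, mean + 1.96 * sem)

# Bootstrap percentile 95% CI
boot_means = [rng.choice(dice, size=len(dice), replace=True).mean() for _ in range(10_000)]
bootstrap_ci = tuple(np.percentile(boot_means, [2.5, 97.5]))

print("parametric CI:", parametric_ci)
print("bootstrap CI: ", bootstrap_ci)
```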
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Sequential Multi-Dimensional Self-Supervised Learning for Clinical Time + Series ICML 2023 + + +
+ Self-supervised learning (SSL) for clinical time series data has received +significant attention in recent literature, since these data are highly rich +and provide important information about a patient's physiological state. +However, most existing SSL methods for clinical time series are limited in that +they are designed for unimodal time series, such as a sequence of structured +features (e.g., lab values and vital signs) or an individual high-dimensional +physiological signal (e.g., an electrocardiogram). These existing methods +cannot be readily extended to model time series that exhibit multimodality, +with structured features and high-dimensional data being recorded at each +timestep in the sequence. In this work, we address this gap and propose a new +SSL method -- Sequential Multi-Dimensional SSL -- where an SSL loss is applied +both at the level of the entire sequence and at the level of the individual +high-dimensional data points in the sequence in order to better capture +information at both scales. Our strategy is agnostic to the specific form of +loss function used at each level -- it can be contrastive, as in SimCLR, or +non-contrastive, as in VICReg. We evaluate our method on two real-world +clinical datasets, where the time series contains sequences of (1) +high-frequency electrocardiograms and (2) structured data from lab values and +vital signs. Our experimental results indicate that pre-training with our +method and then fine-tuning on downstream tasks improves performance over +baselines on both datasets, and in several settings, can lead to improvements +across different self-supervised loss functions. + +
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ Language-based Action Concept Spaces Improve Video Self-Supervised + Learning + + +
+ Recent contrastive language-image pre-training has led to learning highly +transferable and robust image representations. However, adapting these models +to video domains with minimal supervision remains an open problem. We explore a +simple step in that direction, using language-tied self-supervised learning to +adapt an image CLIP model to the video domain. A backbone modified for temporal +modeling is trained under self-distillation settings with training objectives +operating in an action concept space. This space is constructed from feature vectors of various action +concepts, extracted from a language encoder using relevant textual prompts. We introduce two training objectives, concept distillation +and concept alignment, that retain the generality of the original representations while +enforcing relations between actions and their attributes. Our approach improves +zero-shot and linear probing performance on three action recognition +benchmarks. + +
+
+
+
+
+ + ☆ The Role of Entropy and Reconstruction in Multi-View Self-Supervised + Learning ICML 2023 + + +
+ The mechanisms behind the success of multi-view self-supervised learning +(MVSSL) are not yet fully understood. Contrastive MVSSL methods have been +studied through the lens of InfoNCE, a lower bound of the Mutual Information +(MI). However, the relation between other MVSSL methods and MI remains unclear. +We consider a different lower bound on the MI consisting of an entropy and a +reconstruction term (ER), and analyze the main MVSSL families through its lens. +Through this ER bound, we show that clustering-based methods such as +DeepCluster and SwAV maximize the MI. We also re-interpret the mechanisms of +distillation-based approaches such as BYOL and DINO, showing that they +explicitly maximize the reconstruction term and implicitly encourage a stable +entropy, and we confirm this empirically. We show that replacing the objectives +of common MVSSL methods with this ER bound achieves competitive performance, +while making them stable when training with smaller batch sizes or smaller +exponential moving average (EMA) coefficients. + Github repo: https://github.com/apple/ml-entropy-reconstruction. + +
+
+ comment: 18 pages: 9 of main text, 2 of references, and 7 of supplementary + material. Appears in the proceedings of ICML 2023 +
+
+
+
+
+ + ☆ Variational Point Encoding Deformation for Dental Modeling + + +
+ Digital dentistry has made significant advancements in recent years, yet +numerous challenges remain to be addressed. In this study, we release a new +extensive dataset of tooth meshes to encourage further research. Additionally, +we propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable +probabilistic learning of point cloud representations. A key challenge in +existing latent variable models for point clouds is the lack of a 1-to-1 +mapping between input points and output points. Instead, they must rely on +optimizing Chamfer distances, a metric that does not have a normalized +distributional counterpart, preventing its usage in probabilistic models. We +demonstrate that explicit minimization of Chamfer distances can be replaced by +a suitable encoder, which allows us to increase computational efficiency while +simplifying the probabilistic extension. Our experimental findings present +empirical evidence demonstrating the superior performance of VF-Net over +existing models in terms of dental scan reconstruction and extrapolation. +Additionally, our investigation highlights the robustness of VF-Net's latent +representations. These results underscore the promising prospects of VF-Net as +an effective and reliable method for point cloud reconstruction and analysis. + +
+
+
+
+
+ + ☆ Learning and Generalizing Polynomials in Simulation Metamodeling + + +
+ The ability to learn polynomials and generalize out-of-distribution is +essential for simulation metamodels in many disciplines of engineering, where +the time step updates are described by polynomials. While feed-forward neural +networks can fit any function, they cannot generalize out-of-distribution for +higher-order polynomials. Therefore, this paper collects and proposes +multiplicative neural network (MNN) architectures that are used as recursive +building blocks for approximating higher-order polynomials. Our experiments +show that MNNs are better than baseline models at generalizing, and their +validation performance is indicative of their performance in out-of-distribution +tests. In addition to MNN architectures, a simulation metamodeling approach is +proposed for simulations with polynomial time step updates. For these +simulations, simulating a time interval can be performed in fewer steps by +increasing the step size, which entails approximating higher-order polynomials. +While our approach is compatible with any simulation with polynomial time step +updates, a demonstration is shown for an epidemiology simulation model, which +also shows the inductive bias in MNNs for learning and generalizing +higher-order polynomials. + +
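+ A minimal numpy sketch of a multiplicative building block, under the assumption that the block is simply the element-wise product of two linear maps of its input (weights here are random placeholders, not a trained metamodel): the product doubles the polynomial degree, and stacking such blocks recursively reaches higher orders.

```python
import numpy as np

rng = np.random.default_rng(1)

def multiplicative_block(x, w1, b1, w2, b2):
    # Element-wise product of two affine maps -> polynomial degree doubles.
    return (x @ w1 + b1) * (x @ w2 + b2)

x = rng.normal(size=(8, 4))                              # batch of 8 inputs, 4 features
w1, w2 = rng.normal(size=(4, 16)), rng.normal(size=(4, 16))
b1, b2 = rng.normal(size=16), rng.normal(size=16)

h = multiplicative_block(x, w1, b1, w2, b2)              # degree-2 features of x
w3, w4 = rng.normal(size=(16, 16)), rng.normal(size=(16, 16))
h2 = multiplicative_block(h, w3, rng.normal(size=16), w4, rng.normal(size=16))  # degree-4
print(h.shape, h2.shape)
```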
+
+
+
+
+ + ☆ Syntactic vs Semantic Linear Abstraction and Refinement of Neural + Networks + + +
+ Abstraction is a key verification technique to improve scalability. However, +its use for neural networks is so far extremely limited. Previous approaches +for abstracting classification networks replace several neurons with one of +them that is similar enough. We can classify the similarity as defined either +syntactically (using quantities on the connections between neurons) or +semantically (on the activation values of neurons for various inputs). +Unfortunately, the previous approaches only achieve moderate reductions, when +implemented at all. In this work, we provide a more flexible framework where a +neuron can be replaced with a linear combination of other neurons, improving +the reduction. We apply this approach both on syntactic and semantic +abstractions, and implement and evaluate them experimentally. Further, we +introduce a refinement method for our abstractions, allowing for finding a +better balance between reduction and precision. + +
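+ A hedged sketch of the semantic replacement idea: approximate one neuron's activations (over a set of inputs) as a linear combination of other neurons' activations via least squares; the neuron can then be removed and its contribution redistributed. The activation matrix below is a random placeholder, and the paper's actual abstraction and refinement procedure is richer than this.

```python
import numpy as np

rng = np.random.default_rng(0)
acts = rng.normal(size=(500, 10))          # activations of 10 neurons on 500 inputs
target_neuron = 3
others = np.delete(acts, target_neuron, axis=1)

# Least-squares linear combination of the remaining neurons
coef, *_ = np.linalg.lstsq(others, acts[:, target_neuron], rcond=None)
approx = others @ coef
print("mean abs replacement error:", np.abs(approx - acts[:, target_neuron]).mean())
```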
+
+ comment: Accepted at ATVA 2023 +
+
+
+
+
+ + ☆ Player-optimal Stable Regret for Bandit Learning in Matching Markets + + +
+ The problem of matching markets has been studied for a long time in the +literature due to its wide range of applications. Finding a stable matching is +a common equilibrium objective in this problem. Since market participants are +usually uncertain of their preferences, a rich line of recent works studies the +online setting where one-side participants (players) learn their unknown +preferences from iterative interactions with the other side (arms). Most +previous works in this line are only able to derive theoretical guarantees for +player-pessimal stable regret, which is defined compared with the players' +least-preferred stable matching. However, under the pessimal stable matching, +players only obtain the least reward among all stable matchings. To maximize +players' profits, player-optimal stable matching would be the most desirable. +Though \citet{basu21beyond} successfully brings an upper bound for +player-optimal stable regret, their result can be exponentially large if +players' preference gap is small. Whether a polynomial guarantee for this +regret exists is a significant but still open problem. In this work, we provide +a new algorithm named explore-then-Gale-Shapley (ETGS) and show that the +optimal stable regret of each player can be upper bounded by $O(K\log +T/\Delta^2)$ where $K$ is the number of arms, $T$ is the horizon and $\Delta$ +is the players' minimum preference gap among the first $N+1$-ranked arms. This +result significantly improves upon previous works, which either have a weaker +player-pessimal stable matching objective or apply only to markets with special +assumptions. When the preferences of participants satisfy some special +conditions, our regret upper bound also matches the previously derived lower +bound. + +
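+ An illustrative sketch of the explore-then-Gale-Shapley structure, under placeholder assumptions (uniform exploration length, Gaussian rewards, random arm preferences); the paper's actual exploration schedule and analysis are not reproduced here:

```python
import numpy as np

rng = np.random.default_rng(0)
N, K, T_explore = 3, 4, 200                   # players, arms, exploration pulls per (player, arm)
true_means = rng.uniform(size=(N, K))          # players' unknown preferences
arm_prefs = [list(rng.permutation(N)) for _ in range(K)]   # arm k's ranking of players (known)

# Exploration phase: each player estimates its mean reward for each arm.
est = np.zeros((N, K))
for i in range(N):
    for k in range(K):
        est[i, k] = rng.normal(true_means[i, k], 0.1, size=T_explore).mean()

# Player-proposing Gale-Shapley on the estimated preference orders.
proposals = [list(np.argsort(-est[i])) for i in range(N)]   # best estimated arm first
match_of_arm = {k: None for k in range(K)}
free = list(range(N))
while free:
    i = free.pop(0)
    k = proposals[i].pop(0)                   # i's most-preferred arm not yet proposed to
    j = match_of_arm[k]
    if j is None:
        match_of_arm[k] = i                   # arm was free
    elif arm_prefs[k].index(i) < arm_prefs[k].index(j):
        match_of_arm[k] = i                   # arm prefers the new proposer
        free.append(j)
    else:
        free.append(i)                        # proposal rejected
print(match_of_arm)
```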
+
+ comment: SODA 2023 +
+
+
+
+
+ + ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ The popularity of point cloud deep models for safety-critical purposes has +increased, but the reliability and security of these models can be compromised +by intentional or naturally occurring point cloud noise. To combat this issue, +we present a novel point cloud outlier removal method called PointCVaR, which +empowers standard-trained models to eliminate additional outliers and restore +the data. Our approach begins by conducting attribution analysis to determine +the influence of each point on the model output, which we refer to as point +risk. We then optimize the process of filtering high-risk points using +Conditional Value at Risk (CVaR) as the objective. The rationale for this +approach is based on the observation that noise points in point clouds tend to +cluster in the tail of the risk distribution, with a low frequency but a high +level of risk, resulting in significant interference with classification +results. Despite requiring no additional training effort, our method produces +exceptional results in various removal-and-classification experiments for noisy +point clouds, which are corrupted by random noise, adversarial noise, and +backdoor trigger noise. Impressively, it achieves 87% accuracy in defense +against the backdoor attack by removing triggers. Overall, the proposed +PointCVaR effectively eliminates noise points and enhances point cloud +classification, making it a promising plug-in module for various models in +different scenarios. + +
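+ A hedged sketch of the filtering step: score each point's "risk" (the score below is a random stand-in, not the paper's attribution analysis) and drop the points falling in the upper tail captured by CVaR at level alpha.

```python
import numpy as np

rng = np.random.default_rng(0)
points = rng.normal(size=(1024, 3))
risk = rng.gamma(shape=1.0, scale=1.0, size=1024)    # placeholder per-point risk scores

alpha = 0.05                                          # tail fraction to remove (arbitrary choice)
var_threshold = np.quantile(risk, 1 - alpha)          # Value-at-Risk at level alpha
tail = risk >= var_threshold
cvar = risk[tail].mean()                              # CVaR: mean risk within the tail
cleaned = points[~tail]                               # keep only low-risk points
print(f"CVaR={cvar:.3f}, kept {len(cleaned)} of {len(points)} points")
```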
+
+
+
+
+ + ☆ Nonlinear Meta-Learning Can Guarantee Faster Rates + + +
+ Many recent theoretical works on \emph{meta-learning} aim to achieve +guarantees in leveraging similar representational structures from related tasks +towards simplifying a target task. Importantly, the main aim in theoretical works on +the subject is to understand the extent to which convergence rates -- in +learning a common representation -- \emph{may scale with the number $N$ of +tasks} (as well as the number of samples per task). First steps in this setting +demonstrate this property when both the shared representation amongst tasks, +and task-specific regression functions, are linear. This linear setting readily +reveals the benefits of aggregating tasks, e.g., via averaging arguments. In +practice, however, the representation is often highly nonlinear, introducing +nontrivial biases in each task that cannot easily be averaged out as in the +linear case. In the present work, we derive theoretical guarantees for +meta-learning with nonlinear representations. In particular, assuming the +shared nonlinearity maps to an infinite-dimensional RKHS, we show that +additional biases can be mitigated with careful regularization that leverages +the smoothness of task-specific regression functions. + +
+
+
+
+
+ + ☆ Performance Issue Identification in Cloud Systems with + Relational-Temporal Anomaly Detection + + +
+ Performance issues permeate large-scale cloud service systems, which can lead +to huge revenue losses. To ensure reliable performance, it is essential to +accurately identify and localize these issues using service monitoring metrics. +Given the complexity and scale of modern cloud systems, this task can be +challenging and may require extensive expertise and resources beyond the +capacity of individual humans. Some existing methods tackle this problem by +analyzing each metric independently to detect anomalies. However, this could +incur overwhelming alert storms that are difficult for engineers to diagnose +manually. To pursue better performance, not only the temporal patterns of +metrics but also the correlation between metrics (i.e., relational patterns) +should be considered, which can be formulated as a multivariate metrics anomaly +detection problem. However, most of the studies fall short of extracting these +two types of features explicitly. Moreover, there exist some unlabeled +anomalies mixed into the training data, which may hinder the detection +performance. To address these limitations, we propose the Relational-Temporal +Anomaly Detection Model (RTAnomaly) that combines the relational and temporal +information of metrics. RTAnomaly employs a graph attention layer to learn the +dependencies among metrics, which further helps to effectively pinpoint the anomalous +metrics that may cause the anomaly. In addition, we exploit the +concept of positive unlabeled learning to address the issue of potential +anomalies in the training data. To evaluate our method, we conduct experiments +on a public dataset and two industrial datasets. RTAnomaly outperforms all the +baseline models by achieving an average F1 score of 0.929 and Hit@3 of 0.920, +demonstrating its superiority. + +
+
+
+
+
+ + ☆ FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with + Human Feedback + + +
+ Captions are crucial for understanding scientific visualizations and +documents. Existing captioning methods for scientific figures rely on +figure-caption pairs extracted from documents for training, many of which fall +short with respect to metrics like helpfulness, explainability, and +visual-descriptiveness [15], leading to generated captions being misaligned with +reader preferences. To enable the generation of high-quality figure captions, +we introduce FigCaps-HF, a new framework for figure-caption generation that can +incorporate domain expert feedback in generating captions optimized for reader +preferences. Our framework comprises 1) an automatic method for evaluating the +quality of figure-caption pairs, and 2) a novel reinforcement learning with human +feedback (RLHF) method to optimize a generative figure-to-caption model for +reader preferences. We demonstrate the effectiveness of our simple learning +framework by improving performance over standard fine-tuning across different +types of models. In particular, when using BLIP as the base model, our RLHF +framework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and +Meteor, respectively. Finally, we release a large-scale benchmark dataset with +human feedback on figure-caption pairs to enable further evaluation and +development of RLHF techniques for this problem. + +
+
+ comment: 19 pages, 4 figures. Benchmark Documentation: + https://figcapshf.github.io/ +
+
+
+
+
+ + ☆ Addressing caveats of neural persistence with deep graph persistence + + +
+ Neural Persistence is a prominent measure for quantifying neural network +complexity, proposed in the emerging field of topological data analysis in deep +learning. In this work, however, we find both theoretically and empirically +that the variance of network weights and spatial concentration of large weights +are the main factors that impact neural persistence. Whilst this captures +useful information for linear classifiers, we find that no relevant spatial +structure is present in later layers of deep neural networks, making neural +persistence roughly equivalent to the variance of weights. Additionally, the +proposed averaging procedure across layers for deep neural networks does not +consider interaction between layers. Based on our analysis, we propose an +extension of the filtration underlying neural persistence to the whole neural +network instead of single layers, which is equivalent to calculating neural +persistence on one particular matrix. This yields our deep graph persistence +measure, which implicitly incorporates persistent paths through the network and +alleviates variance-related issues through standardisation. Code is available +at https://github.com/ExplainableML/Deep-Graph-Persistence . + +
+
+
+
+
+ + ☆ Divide & Bind Your Attention for Improved Generative Semantic Nursing + + +
+ Emerging large-scale text-to-image generative models, e.g., Stable Diffusion +(SD), have exhibited overwhelming results with high fidelity. Despite the +magnificent progress, current state-of-the-art models still struggle to +generate images fully adhering to the input prompt. Prior work, Attend & +Excite, has introduced the concept of Generative Semantic Nursing (GSN), aiming +to optimize cross-attention during inference time to better incorporate the +semantics. It demonstrates promising results in generating simple prompts, +e.g., ``a cat and a dog''. However, its efficacy declines when dealing with +more complex prompts, and it does not explicitly address the problem of +improper attribute binding. To address the challenges posed by complex prompts +or scenarios involving multiple entities and to achieve improved attribute +binding, we propose Divide & Bind. We introduce two novel loss objectives for +GSN: a novel attendance loss and a binding loss. Our approach stands out in its +ability to faithfully synthesize desired objects with improved attribute +alignment from complex prompts and exhibits superior performance across +multiple evaluation benchmarks. More videos and updates can be found on the +project page \url{https://sites.google.com/view/divide-and-bind}. + +
+
+ comment: Project page: \url{https://sites.google.com/view/divide-and-bind} +
+
+
+
+
+ + ☆ Self-paced Weight Consolidation for Continual Learning + + +
+ Continual learning algorithms that keep the parameters of new tasks close to +those of previous tasks are popular in preventing catastrophic forgetting in +sequential task learning settings. However, 1) the performance of the new +continual learner will be degraded without distinguishing the contributions of +previously learned tasks; 2) the computational cost will be greatly increased +with the number of tasks, since most existing algorithms need to regularize all +previous tasks when learning new tasks. To address the above challenges, we +propose a self-paced Weight Consolidation (spWC) framework to attain robust +continual learning via evaluating the discriminative contributions of previous +tasks. To be specific, we develop a self-paced regularization to reflect the +priorities of past tasks via measuring difficulty based on a key performance +indicator (i.e., accuracy). When encountering a new task, all previous tasks +are sorted from "difficult" to "easy" based on the priorities. Then the +parameters of the new continual learner will be learned via selectively +maintaining the knowledge amongst more difficult past tasks, which could well +overcome catastrophic forgetting with less computational cost. We adopt an +alternative convex search to iteratively update the model parameters and +priority weights in the bi-convex formulation. The proposed spWC framework is +plug-and-play and applicable to most continual learning algorithms (e.g., +EWC, MAS and RCIL) in different directions (e.g., classification and +segmentation). Experimental results on several public benchmark datasets +demonstrate that our proposed framework can effectively improve performance +when compared with other popular continual learning algorithms. + +
+
+
+
+
+ + ☆ Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals + for GPM: A U-Net Convolutional LSTM Architecture + + +
+ This paper presents a deep learning architecture for nowcasting of +precipitation almost globally every 30 min with a 4-hour lead time. The +architecture fuses a U-Net and a convolutional long short-term memory (LSTM) +neural network and is trained using data from the Integrated Multi-satellitE +Retrievals for GPM (IMERG) and a few key precipitation drivers from the Global +Forecast System (GFS). The impacts of different training loss functions, +including the mean-squared error (regression) and the focal loss +(classification), on the quality of precipitation nowcasts are studied. The +results indicate that the regression network performs well in capturing light +precipitation (below 1.6 mm/hr), but the classification network can outperform +the regression network for nowcasting of precipitation extremes (>8 mm/hr), in +terms of the critical success index (CSI). Using the Wasserstein distance, it +is shown that the precipitation predicted by the classification network has a +class probability distribution closer to IMERG than that of the regression network. +It is uncovered that the inclusion of the physical variables can improve +precipitation nowcasting, especially at longer lead times in both networks. +Taking IMERG as a relative reference, a multi-scale analysis in terms of +the fractions skill score (FSS) shows that the nowcasting machine remains skillful +(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For +precipitation rates greater than 4 mm/hr, only the classification network +remains FSS-skillful on scales greater than 50 km within a 2-hour lead time. + +
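+ For readers unfamiliar with the fractions skill score, a rough sketch of the standard FSS computation used for this kind of multi-scale verification; the fields, threshold, and window below are synthetic placeholders, not the paper's setup.

```python
import numpy as np
from scipy.ndimage import uniform_filter

def fss(forecast, observed, threshold, window):
    # Fractional exceedance coverage within a neighborhood of size `window`.
    f = uniform_filter((forecast > threshold).astype(float), size=window)
    o = uniform_filter((observed > threshold).astype(float), size=window)
    mse = np.mean((f - o) ** 2)
    ref = np.mean(f ** 2) + np.mean(o ** 2)
    return 1.0 - mse / ref if ref > 0 else np.nan

rng = np.random.default_rng(0)
obs = rng.gamma(0.5, 2.0, size=(128, 128))          # synthetic precipitation field
fcst = obs + rng.normal(0, 1.0, size=(128, 128))    # noisy "forecast"
print(f"FSS at a 5-pixel scale: {fss(fcst, obs, threshold=4.0, window=5):.3f}")
```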
+
+
+
+
+ + ☆ Label Calibration for Semantic Segmentation Under Domain Shift ICLR 2023 + + +
+ Performance of a pre-trained semantic segmentation model is likely to +substantially decrease on data from a new domain. We show a pre-trained model +can be adapted to unlabelled target domain data by calculating soft-label +prototypes under the domain shift and making predictions according to the +prototype closest to the vector with predicted class probabilities. The +proposed adaptation procedure is fast, comes almost for free in terms of +computational resources and leads to considerable performance improvements. We +demonstrate the benefits of such label calibration on the highly-practical +synthetic-to-real semantic segmentation problem. + +
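+ A rough, assumption-laden sketch of the prototype idea: prototypes are taken here as mean predicted-probability vectors per pseudo-class on the unlabelled target data, and each sample is re-labelled by its nearest prototype. The exact prototype construction in the paper may differ, and the probabilities below are synthetic.

```python
import numpy as np

rng = np.random.default_rng(0)
probs = rng.dirichlet(alpha=np.ones(5), size=500)    # softmax outputs of a pre-trained model
pseudo = probs.argmax(axis=1)                         # initial pseudo-labels

# Soft-label prototype per pseudo-class, then nearest-prototype assignment.
prototypes = np.stack([probs[pseudo == c].mean(axis=0) for c in range(5)])
dists = np.linalg.norm(probs[:, None, :] - prototypes[None, :, :], axis=2)
calibrated = dists.argmin(axis=1)
print((calibrated != pseudo).mean(), "fraction of samples re-assigned")
```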
+
+ comment: ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for + Trustworthy ML +
+
+
+
+
+ + ☆ On Combining Expert Demonstrations in Imitation Learning via Optimal + Transport + + +
+ Imitation learning (IL) seeks to teach agents specific tasks through expert +demonstrations. One of the key approaches to IL is to define a distance between +agent and expert and to find an agent policy that minimizes that distance. +Optimal transport methods have been widely used in imitation learning as they +provide ways to measure meaningful distances between agent and expert +trajectories. However, the problem of how to optimally combine multiple expert +demonstrations has not been widely studied. The standard method is to simply +concatenate state(-action) trajectories, which is problematic when +trajectories are multi-modal. We propose an alternative method that uses a +multi-marginal optimal transport distance and enables the combination of +multiple and diverse state-trajectories in the OT sense, providing a more +sensible geometric average of the demonstrations. Our approach enables an agent +to learn from several experts; its efficiency is analyzed on OpenAI Gym +control environments, and the results demonstrate that the standard method is not always +optimal. + +
+
+
+
+
+ + ☆ Communication-Efficient Split Learning via Adaptive Feature-Wise + Compression + + +
+ This paper proposes a novel communication-efficient split learning (SL) +framework, named SplitFC, which reduces the communication overhead required for +transmitting intermediate feature and gradient vectors during the SL training +process. The key idea of SplitFC is to leverage the different dispersion degrees +exhibited in the columns of the intermediate feature and gradient matrices. SplitFC incorporates two compression +strategies: (i) adaptive feature-wise dropout and (ii) adaptive feature-wise +quantization. In the first strategy, the intermediate feature vectors are +dropped with adaptive dropout probabilities determined based on the standard +deviation of these vectors. Then, by the chain rule, the intermediate gradient +vectors associated with the dropped feature vectors are also dropped. In the +second strategy, the non-dropped intermediate feature and gradient vectors are +quantized using adaptive quantization levels determined based on the ranges of +the vectors. To minimize the quantization error, the optimal quantization +levels of this strategy are derived in closed form. Simulation +results on the MNIST, CIFAR-10, and CelebA datasets demonstrate that SplitFC +provides more than a 5.6% increase in classification accuracy compared to +state-of-the-art SL frameworks, while requiring 320 times less communication +overhead than the vanilla SL framework without compression. + +
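+ An illustrative numpy sketch of the two compression ideas as described in the abstract: (i) drop feature columns with probabilities tied to their standard deviation, and (ii) uniformly quantize the surviving columns using their own value ranges. The probability mapping and 4-bit width below are arbitrary placeholders, not the paper's closed-form solution.

```python
import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(32, 64)) * rng.uniform(0.1, 2.0, size=64)   # batch x feature dims

# (i) adaptive feature-wise dropout: high-dispersion columns are kept more often
std = features.std(axis=0)
keep_prob = np.clip(std / std.max(), 0.2, 1.0)
kept = rng.random(64) < keep_prob
compressed = features[:, kept] / keep_prob[kept]     # inverted-dropout style rescaling

# (ii) adaptive feature-wise quantization over each surviving column's range
bits = 4
lo, hi = compressed.min(axis=0), compressed.max(axis=0)
levels = 2 ** bits - 1
quantized = np.round((compressed - lo) / (hi - lo + 1e-12) * levels)
dequantized = quantized / levels * (hi - lo) + lo
print(kept.sum(), "columns kept;",
      f"max abs quantization error {np.abs(dequantized - compressed).max():.4f}")
```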
+
+
+
+
+ + ☆ Spatial-Temporal Data Mining for Ocean Science: Data, Methodologies, and + Opportunities + + +
+ With the increasing amount of spatial-temporal (ST) ocean data, numerous +spatial-temporal data mining (STDM) studies have been conducted to address +various oceanic issues, e.g., climate forecasting and disaster warning. +Compared with typical ST data (e.g., traffic data), ST ocean data is more +complicated, with some unique characteristics, e.g., diverse regionality and +high sparsity. These characteristics make it difficult to design and train STDM +models. Unfortunately, an overview of these studies is still missing, hindering +computer scientists from identifying research issues in ocean science and discouraging +researchers in ocean science from applying advanced STDM techniques. To remedy +this situation, we provide a comprehensive survey to summarize existing STDM +studies in ocean science. Concretely, we first summarize the widely-used ST ocean +datasets and identify their unique characteristics. Then, typical ST ocean data +quality enhancement techniques are discussed. Next, we classify existing STDM +studies in ocean science into four types of tasks, i.e., prediction, event detection, +pattern mining, and anomaly detection, and elaborate on the techniques for these +tasks. Finally, promising research opportunities are highlighted. This survey +will help scientists from the fields of both computer science and ocean science +have a better understanding of the fundamental concepts, key techniques, and +open challenges of STDM in ocean science. + +
+
+
+
+
+ + ☆ Meta-Transformer: A Unified Framework for Multimodal Learning + + +
+ Multimodal learning aims to build models that can process and relate +information from multiple modalities. Despite years of development in this +field, it still remains challenging to design a unified network for processing +various modalities ($\textit{e.g.}$ natural language, 2D images, 3D point +clouds, audio, video, time series, tabular data) due to the inherent gaps among +them. In this work, we propose a framework, named Meta-Transformer, that +leverages a $\textbf{frozen}$ encoder to perform multimodal perception without +any paired multimodal training data. In Meta-Transformer, the raw input data +from various modalities are mapped into a shared token space, allowing a +subsequent encoder with frozen parameters to extract high-level semantic +features of the input data. Composed of three main components: a unified data +tokenizer, a modality-shared encoder, and task-specific heads for downstream +tasks, Meta-Transformer is the first framework to perform unified learning +across 12 modalities with unpaired data. Experiments on different benchmarks +reveal that Meta-Transformer can handle a wide range of tasks including +fundamental perception (text, image, point cloud, audio, video), practical +application (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph, +tabular, and time-series). Meta-Transformer indicates a promising future for +developing unified multimodal intelligence with transformers. Code will be +available at https://github.com/invictus717/MetaTransformer + +
+
+ comment: Project website: https://kxgong.github.io/meta_transformer/ +
+
+
+
+
+ + ☆ Optimizing PatchCore for Few/many-shot Anomaly Detection + + +
+ Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and +tries to distinguish between normal and anomalous data using only a few selected +samples. While newly proposed few-shot AD methods do compare against +pre-existing algorithms developed for the full-shot domain as baselines, they +do not specifically optimize them for the few-shot setting. It thus remains +unclear if the performance of such pre-existing algorithms can be further +improved. We address this question in this work. Specifically, we present a +study on the AD/anomaly segmentation (AS) performance of PatchCore, the current +state-of-the-art full-shot AD/AS algorithm, in both the few-shot and the +many-shot settings. We hypothesize that further performance improvements can be +realized by (I) optimizing its various hyperparameters, and by (II) +transferring techniques known to improve few-shot supervised learning to the AD +domain. Exhaustive experiments on the public VisA and MVTec AD datasets reveal +that (I) significant performance improvements can be realized by optimizing +hyperparameters such as the underlying feature extractor, and that (II) +image-level augmentations can, but are not guaranteed to, improve performance. +Based on these findings, we achieve a new state of the art in few-shot AD on +VisA, further demonstrating the merit of adapting pre-existing AD/AS methods to +the few-shot setting. Lastly, we identify the investigation of feature extractors +with a strong inductive bias as a potential future research direction for +(few-shot) AD/AS. + +
+
+
+
+
+ + ☆ Adversarial attacks for mixtures of classifiers + + +
+ Mixtures of classifiers (a.k.a. randomized ensembles) have been proposed as a +way to improve robustness against adversarial attacks. However, it has been +shown that existing attacks are not well suited for this kind of classifiers. +In this paper, we discuss the problem of attacking a mixture in a principled +way and introduce two desirable properties of attacks based on a geometrical +analysis of the problem (effectiveness and maximality). We then show that +existing attacks do not meet both of these properties. Finally, we introduce a +new attack called lattice climber attack with theoretical guarantees on the +binary linear setting, and we demonstrate its performance by conducting +experiments on synthetic and real datasets. + +
+
+ comment: 7 pages + 4 pages of appendix. 5 figures in main text +
+
+
+
+
+ + ☆ Feed-Forward Source-Free Domain Adaptation via Class Prototypes ECCV 2022 + + +
+ Source-free domain adaptation has become popular because of its practical +usefulness and no need to access source data. However, the adaptation process +still takes a considerable amount of time and is predominantly based on +optimization that relies on back-propagation. In this work we present a simple +feed-forward approach that challenges the need for back-propagation based +adaptation. Our approach is based on computing prototypes of classes under the +domain shift using a pre-trained model. It achieves strong improvements in +accuracy compared to the pre-trained model and requires only a small fraction +of time of existing domain adaptation methods. + +
+
+ comment: ECCV 2022 Workshop on Out of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ Efficient Beam Tree Recursion + + +
+ Beam Tree Recursive Neural Network (BT-RvNN) was recently proposed as a +simple extension of Gumbel Tree RvNN and it was shown to achieve +state-of-the-art length generalization performance in ListOps while maintaining +comparable performance on other tasks. However, although not the worst in its +kind, BT-RvNN can be still exorbitantly expensive in memory usage. In this +paper, we identify the main bottleneck in BT-RvNN's memory usage to be the +entanglement of the scorer function and the recursive cell function. We propose +strategies to remove this bottleneck and further simplify its memory usage. +Overall, our strategies not only reduce the memory usage of BT-RvNN by +$10$-$16$ times but also create a new state-of-the-art in ListOps while +maintaining similar performance in other tasks. In addition, we also propose a +strategy to utilize the induced latent-tree node representations produced by +BT-RvNN to turn BT-RvNN from a sentence encoder of the form $f:\mathbb{R}^{n +\times d} \rightarrow \mathbb{R}^{d}$ into a sequence contextualizer of the +form $f:\mathbb{R}^{n \times d} \rightarrow \mathbb{R}^{n \times d}$. Thus, our +proposals not only open up a path for further scalability of RvNNs but also +standardize a way to use BT-RvNNs as another building block in the deep +learning toolkit that can be easily stacked or interfaced with other popular +models such as Transformers and Structured State Space models. + +
+
+
+
+
+ + ☆ Assessing the Use of AutoML for Data-Driven Software Engineering + + +
+ Background. Due to the widespread adoption of Artificial Intelligence (AI) +and Machine Learning (ML) for building software applications, companies are +struggling to recruit employees with a deep understanding of such technologies. +In this scenario, AutoML is soaring as a promising solution to fill the AI/ML +skills gap since it promises to automate the building of end-to-end AI/ML +pipelines that would normally be engineered by specialized team members. Aims. +Despite the growing interest and high expectations, there is a dearth of +information about the extent to which AutoML is currently adopted by teams +developing AI/ML-enabled systems and how it is perceived by practitioners and +researchers. Method. To fill these gaps, in this paper, we present a +mixed-method study comprising a benchmark of 12 end-to-end AutoML tools on two +SE datasets and a user survey with follow-up interviews to further our +understanding of AutoML adoption and perception. Results. We found that AutoML +solutions can generate models that outperform those trained and optimized by +researchers to perform classification tasks in the SE domain. Also, our +findings show that the currently available AutoML solutions do not live up to +their names as they do not equally support automation across the stages of the +ML development workflow and for all the team members. Conclusions. We derive +insights to inform the SE research community on how AutoML can facilitate their +activities and tool builders on how to design the next generation of AutoML +technologies. + +
+
+
+
+
+ + ☆ Music Genre Classification with ResNet and Bi-GRU Using Visual + Spectrograms + + +
+ Music recommendation systems have emerged as a vital component to enhance +user experience and satisfaction for music streaming services, which dominate music consumption. The key challenge in improving these recommender +systems lies in comprehending the complexity of music data, specifically the underpinning task of music genre +classification. The limitations of manual genre +classification have highlighted the need for a more advanced system, namely the +Automatic Music Genre Classification (AMGC) system. While traditional machine +learning techniques have shown potential in genre classification, they heavily +rely on manually engineered features and feature selection, failing to capture +the full complexity of music data. On the other hand, deep learning +classification architectures like the traditional Convolutional Neural Networks +(CNN) are effective in capturing spatial hierarchies but struggle to +capture the temporal dynamics inherent in music data. To address these +challenges, this study proposes a novel approach that uses visual spectrograms as +input and a hybrid model combining the strengths of the Residual +Neural Network (ResNet) and the Gated Recurrent Unit (GRU). This model is +designed to provide a more comprehensive analysis of music data, offering the +potential for more accurate genre classification and, in turn, improved music +recommender systems. + +
+
+
+
+
+ + ☆ Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of + Working Memory + + +
+ Working memory (WM), a fundamental cognitive process facilitating the +temporary storage, integration, manipulation, and retrieval of information, +plays a vital role in reasoning and decision-making tasks. Robust benchmark +datasets that capture the multifaceted nature of WM are crucial for the +effective development and evaluation of AI WM models. Here, we introduce a +comprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM +comprises 10 tasks and a total of 1 million trials, assessing 4 +functionalities, 3 domains, and 11 behavioral and neural characteristics of WM. +We jointly trained and tested state-of-the-art recurrent neural networks and +transformers on all these tasks. We also include human behavioral benchmarks as +an upper bound for comparison. Our results suggest that AI models replicate +some characteristics of WM in the brain, most notably primacy and recency +effects, and neural clusters and correlates specialized for different domains +and functionalities of WM. In the experiments, we also reveal some limitations +in existing models to approximate human behavior. This dataset serves as a +valuable resource for communities in cognitive psychology, neuroscience, and +AI, offering a standardized framework to compare and enhance WM models, +investigate WM's neural underpinnings, and develop WM models with human-like +capabilities. Our source code and data are available at +https://github.com/ZhangLab-DeepNeuroCogLab/WorM. + +
+
+
+
+
+ + ☆ MSQNet: Actor-agnostic Action Recognition with Multi-modal Query + + +
+ Existing action recognition methods are typically actor-specific due to the +intrinsic topological and apparent differences among the actors. This requires +actor-specific pose estimation (e.g., humans vs. animals), leading to +cumbersome model design complexity and high maintenance costs. Moreover, they +often focus on learning the visual modality alone and single-label +classification whilst neglecting other available information sources (e.g., +class name text) and the concurrent occurrence of multiple actions. To overcome +these limitations, we propose a new approach called 'actor-agnostic multi-modal +multi-label action recognition,' which offers a unified solution for various +types of actors, including humans and animals. We further formulate a novel +Multi-modal Semantic Query Network (MSQNet) model in a transformer-based object +detection framework (e.g., DETR), characterized by leveraging visual and +textual modalities to represent the action classes better. The elimination of +actor-specific model designs is a key advantage, as it removes the need for +actor pose estimation altogether. Extensive experiments on five publicly +available benchmarks show that our MSQNet consistently outperforms the prior +arts of actor-specific alternatives on human and animal single- and multi-label +action recognition tasks by up to 50%. Code will be released at +https://github.com/mondalanindya/MSQNet. + +
+
+
+
+
+ + ☆ Mitigating Voter Attribute Bias for Fair Opinion Aggregation + + +
+ The aggregation of multiple opinions plays a crucial role in decision-making, +such as in hiring and loan review, and in labeling data for supervised +learning. Although majority voting and existing opinion aggregation models are +effective for simple tasks, they are inappropriate for tasks without +objectively true labels in which disagreements may occur. In particular, when +voter attributes such as gender or race introduce bias into opinions, the +aggregation results may vary depending on the composition of voter attributes. +A balanced group of voters is desirable for fair aggregation results but may be +difficult to prepare. In this study, we consider methods to achieve fair +opinion aggregation based on voter attributes and evaluate the fairness of the +aggregated results. To this end, we consider an approach that combines opinion +aggregation models such as majority voting and the Dawid and Skene model (D&S +model) with fairness options such as sample weighting. To evaluate the fairness +of opinion aggregation, probabilistic soft labels are preferred over discrete +class labels. First, we address the problem of soft label estimation without +considering voter attributes and identify some issues with the D&S model. To +address these limitations, we propose a new Soft D&S model with improved +accuracy in estimating soft labels. Moreover, we evaluated the fairness of an +opinion aggregation model, including Soft D&S, in combination with different +fairness options using synthetic and semi-synthetic data. The experimental +results suggest that the combination of Soft D&S and data splitting as a +fairness option is effective for dense data, whereas weighted majority voting +is effective for sparse data. These findings should prove particularly valuable +in supporting decision-making by human and machine-learning models with +balanced opinion aggregation. + +
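+ A small sketch of the "sample weighting" fairness option combined with plain majority voting, under made-up assumptions: two voter attribute groups with an 80/20 imbalance, and weights chosen so both groups contribute equally to the aggregate. The D&S and Soft D&S models themselves are not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)
votes = rng.integers(0, 2, size=200)                 # binary opinions from 200 voters
group = (rng.random(200) < 0.8).astype(int)          # imbalanced voter attribute (80% group 1)

# Weight each voter so that each attribute group carries equal total weight.
weights = np.where(group == 1, 1.0 / (group == 1).sum(), 1.0 / (group == 0).sum())
weighted_share = np.sum(weights * votes) / weights.sum()
plain_share = votes.mean()
print(f"plain majority share: {plain_share:.2f}, attribute-balanced share: {weighted_share:.2f}")
```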
+
+
+
+
+ + ☆ Fairness-Aware Client Selection for Federated Learning ICME 2023 + + +
+ Federated learning (FL) has enabled multiple data owners (a.k.a. FL clients) +to train machine learning models collaboratively without revealing private +data. Since the FL server can only engage a limited number of clients in each +training round, FL client selection has become an important research problem. +Existing approaches generally focus on either enhancing FL model performance or +enhancing the fair treatment of FL clients. The problem of balancing +performance and fairness considerations when selecting FL clients remains open. +To address this problem, we propose the Fairness-aware Federated Client +Selection (FairFedCS) approach. Based on Lyapunov optimization, it dynamically +adjusts FL clients' selection probabilities by jointly considering their +reputations, times of participation in FL tasks and contributions to the +resulting model performance. By not using threshold-based reputation filtering, +it provides FL clients with opportunities to redeem their reputations after a +perceived poor performance, thereby further enhancing fair client treatment. +Extensive experiments based on real-world multimedia datasets show that +FairFedCS achieves 19.6% higher fairness and 0.73% higher test accuracy on +average than the best-performing state-of-the-art approach. + +
+
+ comment: Accepted by ICME 2023 +
+
+
+
+
+ + ☆ Long-Tail Theory under Gaussian Mixtures ECAI 2023 + + +
+ We suggest a simple Gaussian mixture model for data generation that complies +with Feldman's long tail theory (2020). We demonstrate that a linear classifier +cannot decrease the generalization error below a certain level in the proposed +model, whereas a nonlinear classifier with a memorization capacity can. This +confirms that for long-tailed distributions, rare training examples must be +considered for optimal generalization to new data. Finally, we show that the +performance gap between linear and nonlinear models can be lessened as the tail +becomes shorter in the subpopulation frequency distribution, as confirmed by +experiments on synthetic and real data. + +
+
+ comment: accepted to ECAI 2023 +
+
+
+
+
+ + ☆ Differences Between Hard and Noisy-labeled Samples: An Empirical Study + + +
+ Extracting noisy or incorrectly labeled samples from a labeled dataset with +hard/difficult samples is an important yet under-explored topic. Two general +and often independent lines of work exist, one focuses on addressing noisy +labels, and another deals with hard samples. However, when both types of data +are present, most existing methods treat them equally, which results in a +decline in the overall performance of the model. In this paper, we first design +various synthetic datasets with custom hardness and noisiness levels for +different samples. Our proposed systematic empirical study enables us to better +understand the similarities and more importantly the differences between +hard-to-learn samples and incorrectly-labeled samples. These controlled +experiments pave the way for the development of methods that distinguish +between hard and noisy samples. Through our study, we introduce a simple yet +effective metric that filters out noisy-labeled samples while keeping the hard +samples. We study various data partitioning methods in the presence of label +noise and observe that filtering out noisy samples from hard samples with this +proposed metric results in the best datasets as evidenced by the high test +accuracy achieved after models are trained on the filtered datasets. We +demonstrate this for both our created synthetic datasets and for datasets with +real-world label noise. Furthermore, our proposed data partitioning method +significantly outperforms other methods when employed within a semi-supervised +learning framework. + +
+
+
+
+
+ + ☆ Reparameterized Policy Learning for Multimodal Trajectory Optimization + + +
+ We investigate the challenge of parametrizing policies for reinforcement +learning (RL) in high-dimensional continuous action spaces. Our objective is to +develop a multimodal policy that overcomes limitations inherent in the +commonly-used Gaussian parameterization. To achieve this, we propose a +principled framework that models the continuous RL policy as a generative model +of optimal trajectories. By conditioning the policy on a latent variable, we +derive a novel variational bound as the optimization objective, which promotes +exploration of the environment. We then present a practical model-based RL +method, called Reparameterized Policy Gradient (RPG), which leverages the +multimodal policy parameterization and learned world model to achieve strong +exploration capabilities and high data efficiency. Empirical results +demonstrate that our method can help agents evade local optima in tasks with +dense rewards and solve challenging sparse-reward environments by incorporating +an object-centric intrinsic reward. Our method consistently outperforms +previous approaches across a range of tasks. Code and supplementary materials +are available on the project page https://haosulab.github.io/RPG/ + +
+
+
+
+
+ + ☆ TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and + Lane Segmentation in Self-Driving Cars + + +
+ Semantic segmentation is a common task in autonomous driving to understand +the surrounding environment. Driveable Area Segmentation and Lane Detection are +particularly important for safe and efficient navigation on the road. However, +original semantic segmentation models are computationally expensive and require +high-end hardware, which is not feasible for embedded systems in autonomous +vehicles. This paper proposes a lightweight model for driveable area and +lane line segmentation. TwinLiteNet is designed to be computationally cheap yet achieves accurate +and efficient segmentation results. We evaluate TwinLiteNet on the BDD100K +dataset and compare it with modern models. Experimental results show that our +TwinLiteNet performs similarly to existing approaches while requiring significantly +fewer computational resources. Specifically, TwinLiteNet achieves a mIoU score +of 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task +with only 0.4 million parameters and achieves 415 FPS on an RTX A5000 GPU. +Furthermore, TwinLiteNet can run in real-time on embedded devices with limited +computing power, especially since it achieves 60 FPS on a Jetson Xavier NX, making +it an ideal solution for self-driving vehicles. Code is available at +\url{https://github.com/chequanghuy/TwinLiteNet}. + +
+
+
+
+
+ + ☆ Decentralized Smart Charging of Large-Scale EVs using Adaptive + Multi-Agent Multi-Armed Bandits + + +
+ The drastic growth of electric vehicles and photovoltaics can introduce new +challenges, such as electrical current congestion and voltage limit violations +due to peak load demands. These issues can be mitigated by controlling the +operation of electric vehicles i.e., smart charging. Centralized smart charging +solutions have already been proposed in the literature. But such solutions may +lack scalability and suffer from inherent drawbacks of centralization, such as +a single point of failure, and data privacy concerns. Decentralization can help +tackle these challenges. In this paper, a fully decentralized smart charging +system is proposed using the philosophy of adaptive multi-agent systems. The +proposed system utilizes multi-armed bandit learning to handle uncertainties in +the system. The presented system is decentralized, scalable, real-time, +model-free, and takes fairness among different players into account. A detailed +case study is also presented for performance evaluation. + +
+
+ comment: CIRED 2023 International Conference & Exhibition on Electricity + Distribution, Jun 2023, Rome, Italy +
+
+
+
+
+ + ☆ Graphs in State-Space Models for Granger Causality in Climate Science + + +
+ Granger causality (GC) is often considered not an actual form of causality. +Still, it is arguably the most widely used method to assess the predictability +of a time series from another one. Granger causality has been widely used in +many applied disciplines, from neuroscience and econometrics to Earth sciences. +We revisit GC under a graphical perspective of state-space models. For that, we +use GraphEM, a recently presented expectation-maximisation algorithm for +estimating the linear matrix operator in the state equation of a +linear-Gaussian state-space model. Lasso regularisation is included in the +M-step, which is solved using a proximal splitting Douglas-Rachford algorithm. +Experiments in toy examples and challenging climate problems illustrate the +benefits of the proposed model and inference technique over standard Granger +causality methods. + +
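+ As background, a minimal sketch of the regression view of Granger causality that such methods build on (this is the standard lagged-regression idea with lasso sparsity, not GraphEM or its state-space inference); the two time series below are synthetic.

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
T = 500
y = rng.normal(size=T)
x = np.zeros(T)
for t in range(1, T):
    # x is driven by its own past and by lagged y, so y Granger-causes x.
    x[t] = 0.5 * x[t - 1] + 0.8 * y[t - 1] + 0.1 * rng.normal()

lags = np.column_stack([x[:-1], y[:-1]])     # predictors: x_{t-1}, y_{t-1}
target = x[1:]
coef = Lasso(alpha=0.01).fit(lags, target).coef_
print(f"coefficient on lagged y: {coef[1]:.3f}")   # clearly non-zero -> evidence of GC
```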
+
+ comment: 4 pages, 2 figures, 3 tables, CausalStats23: When Causal Inference + meets Statistical Analysis, April 17-21, 2023, Paris, France +
+
+
+
+
+ + ☆ Self2Self+: Single-Image Denoising with Self-Supervised Learning and + Image Quality Assessment Loss + + +
+ Recently, denoising methods based on supervised learning have exhibited +promising performance. However, their reliance on external datasets containing +noisy-clean image pairs restricts their applicability. To address this +limitation, researchers have focused on training denoising networks using +solely a set of noisy inputs. To improve the feasibility of denoising +procedures, in this study, we proposed a single-image self-supervised learning +method in which only the noisy input image is used for network training. Gated +convolution was used for feature extraction and no-reference image quality +assessment was used for guiding the training process. Moreover, the proposed +method sampled instances from the input image dataset using Bernoulli sampling +with a certain dropout rate for training. The corresponding result was produced +by averaging the generated predictions from various instances of the trained +network with dropouts. The experimental results indicated that the proposed +method achieved state-of-the-art denoising performance on both synthetic and +real-world datasets. This highlights the effectiveness and practicality of our +method as a potential solution for various noise removal tasks. + +
+
+ comment: Technical report and supplementary materials are combined into one + paper. - Technical report: pages 1-7 - Supplementary materials: pages 8-18 +
+
+
+
+
+ + ☆ Fractional Denoising for 3D Molecular Pre-training + + +
+ Coordinate denoising is a promising 3D molecular pre-training method, which +has achieved remarkable performance in various downstream drug discovery tasks. +Theoretically, the objective is equivalent to learning the force field, which +has been shown to be helpful for downstream tasks. Nevertheless, there are two +challenges for coordinate denoising to learn an effective force field, i.e., low-coverage samples and an isotropic force field. The underlying reason is that +molecular distributions assumed by existing denoising methods fail to capture +the anisotropic characteristic of molecules. To tackle these challenges, we +propose a novel hybrid noise strategy, including noise on both dihedral angles +and coordinates. However, denoising such hybrid noise in a traditional way is no +longer equivalent to learning the force field. Through theoretical deductions, we +find that the problem is caused by the dependency of the input conformation for +covariance. To this end, we propose to decouple the two types of noise and +design a novel fractional denoising method (Frad), which only denoises the +latter coordinate part. In this way, Frad enjoys both the merits of sampling +more low-energy structures and the force field equivalence. Extensive +experiments show the effectiveness of Frad in molecular representation, with a +new state-of-the-art on 9 out of 12 tasks of QM9 and on 7 out of 8 targets of +MD17. + +
+
+
+
+
+ + ☆ Deep learning for classification of noisy QR codes + + +
+ We wish to define the limits of a classical classification model based on +deep learning when applied to abstract images, which do not represent visually +identifiable objects. QR codes (Quick Response codes) fall into this category of +abstract images: with one bit corresponding to one encoded character, QR codes were +not designed to be decoded manually. To understand the limitations of a deep +learning-based model for abstract image classification, we train an image +classification model on QR codes generated from information obtained when +reading a health pass. We compare a classification model with a classical +(deterministic) decoding method in the presence of noise. This study allows us +to conclude that a model based on deep learning can be relevant for the +understanding of abstract images. + +
+
+ comment: in French language. RFIAP 2022 - Reconnaissance des Formes, Image, + Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France +
+
+
+
+
+ + ☆ A Survey of What to Share in Federated Learning: Perspectives on Model + Utility, Privacy Leakage, and Communication Efficiency + + +
+ Federated learning (FL) has emerged as a highly effective paradigm for privacy-preserving collaborative training among different parties. Unlike traditional centralized learning, which requires collecting data from each party, FL allows clients to share privacy-preserving information without exposing private datasets. This approach not only guarantees enhanced privacy protection but also facilitates more efficient and secure collaboration among multiple participants. Therefore, FL has gained considerable attention from researchers, prompting numerous surveys to summarize the related works. However, the majority of these surveys concentrate on methods sharing model parameters during the training process, while overlooking the potential of sharing other forms of local information. In this paper, we present a systematic survey from a new perspective, i.e., what to share in FL, with an emphasis on the model utility, privacy leakage, and communication efficiency. This survey differs from previous ones due to four distinct contributions. First, we present a new taxonomy of FL methods in terms of the sharing methods, which includes three categories of shared information: model sharing, synthetic data sharing, and knowledge sharing. Second, we analyze the vulnerability of different sharing methods to privacy attacks and review the defense mechanisms that provide certain privacy guarantees. Third, we conduct extensive experiments to compare the performance and communication overhead of various sharing methods in FL. Besides, we assess the potential privacy leakage through model inversion and membership inference attacks, while comparing the effectiveness of various defense approaches. Finally, we discuss potential deficiencies in current methods and outline future directions for improvement. + +
+
+
+
+
+ + ☆ Conditional expectation network for SHAP + + +
+ A very popular model-agnostic technique for explaining predictive models is the SHapley Additive exPlanation (SHAP). The two most popular versions of SHAP are a conditional expectation version and an unconditional expectation version (the latter is also known as interventional SHAP). Except for tree-based methods, the unconditional version is usually used, for computational reasons. We provide a (surrogate) neural network approach which allows us to efficiently calculate the conditional version for both neural networks and other regression models, and which properly considers the dependence structure in the feature components. This proposal is also useful for providing drop1 and ANOVA analyses in complex regression models, similar to their generalized linear model (GLM) counterparts, and we provide a partial dependence plot (PDP) counterpart that considers the right dependence structure in the feature components. + +
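One way to read the surrogate idea is as a network trained on randomly masked inputs so that, by the optimality of the conditional mean under squared loss, it approximates E[f(X) | X_S = x_S] for coalitions S. The sketch below makes that reading concrete; the architecture, masking scheme, and names are illustrative assumptions, not the paper's specification.

```python
import torch
import torch.nn as nn

class CondExpSurrogate(nn.Module):
    """Surrogate g(x * s, s) intended to approximate E[f(X) | X_S = x_S]."""
    def __init__(self, d):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(2 * d, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x, s):
        return self.net(torch.cat([x * s, s], dim=-1)).squeeze(-1)

def train_surrogate(g, f, X, epochs=200, lr=1e-3):
    opt = torch.optim.Adam(g.parameters(), lr=lr)
    y = f(X).detach()                                 # predictions of the model to explain
    for _ in range(epochs):
        s = torch.randint(0, 2, X.shape).float()     # random coalitions S
        loss = ((g(X, s) - y) ** 2).mean()           # MSE -> conditional expectation
        opt.zero_grad()
        loss.backward()
        opt.step()
    return g
```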
+
+ comment: 24 pages, 9 figures +
+
+
+
+
+ + ☆ Refining the Optimization Target for Automatic Univariate Time Series + Anomaly Detection in Monitoring Services IJCAI + + +
+ Time series anomaly detection is crucial for industrial monitoring services +that handle a large volume of data, aiming to ensure reliability and optimize +system performance. Existing methods often require extensive labeled resources +and manual parameter selection, highlighting the need for automation. This +paper proposes a comprehensive framework for automatic parameter optimization +in time series anomaly detection models. The framework introduces three +optimization targets: prediction score, shape score, and sensitivity score, +which can be easily adapted to different model backbones without prior +knowledge or manual labeling efforts. The proposed framework has been +successfully applied online for over six months, serving more than 50,000 time +series every minute. It simplifies the user's experience by requiring only an +expected sensitive value, offering a user-friendly interface, and achieving +desired detection results. Extensive evaluations conducted on public datasets +and comparison with other methods further confirm the effectiveness of the +proposed framework. + +
+
+ comment: Accepted by 2023 IJCAI Workshop +
+
+
+
+
+ + ☆ Data-Driven Latency Probability Prediction for Wireless Networks: + Focusing on Tail Probabilities + + +
+ With the emergence of new application areas, such as cyber-physical systems and human-in-the-loop applications, there is a need to guarantee a certain level of end-to-end network latency with extremely high reliability, e.g., 99.999%. While mechanisms specified under IEEE 802.1AS time-sensitive networking (TSN) can be used to achieve these requirements for switched Ethernet networks, implementing TSN mechanisms in wireless networks is challenging due to their stochastic nature. To conform the wireless link to a reliability level of 99.999%, the behavior of extremely rare outliers in the latency probability distribution, or the tail of the distribution, must be analyzed and controlled. This work proposes predicting the tail of the latency distribution using state-of-the-art data-driven approaches, such as mixture density networks (MDN) and extreme value mixture models, to estimate the likelihood of rare latencies conditioned on the network parameters, which can be used to make more informed decisions in wireless transmission. Actual latency measurements from IEEE 802.11g (WiFi), a commercial private 5G network, and a software-defined 5G network are used to benchmark the proposed approaches and evaluate their sensitivities concerning the tail probabilities. + +
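As an illustration of the mixture-density-network route mentioned above, the sketch below parameterizes a Gaussian mixture over latency given network features and evaluates the tail probability P(latency > threshold). The extreme-value mixture variant is not shown; layer sizes and names are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class LatencyMDN(nn.Module):
    """Predicts a K-component Gaussian mixture over latency given network features."""
    def __init__(self, d_in, k=5, hidden=64):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(d_in, hidden), nn.ReLU())
        self.pi = nn.Linear(hidden, k)         # mixture logits
        self.mu = nn.Linear(hidden, k)         # component means
        self.log_sigma = nn.Linear(hidden, k)  # component log standard deviations

    def forward(self, x):
        h = self.backbone(x)
        return self.pi(h), self.mu(h), self.log_sigma(h)

def mdn_nll(pi_logits, mu, log_sigma, y):
    # Negative log-likelihood of the observed latencies under the mixture.
    dist = torch.distributions.Normal(mu, log_sigma.exp())
    log_probs = dist.log_prob(y.unsqueeze(-1)) + F.log_softmax(pi_logits, dim=-1)
    return -torch.logsumexp(log_probs, dim=-1).mean()

def tail_probability(pi_logits, mu, log_sigma, threshold):
    # P(latency > threshold | x), the quantity of interest for 99.999% reliability.
    dist = torch.distributions.Normal(mu, log_sigma.exp())
    weights = F.softmax(pi_logits, dim=-1)
    return (weights * (1.0 - dist.cdf(torch.tensor(threshold)))).sum(dim=-1)
```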
+
+ comment: Submitted to IEEE Global Communications (GLOBECOM) 2023 conference +
+
+
+
+
+ + ☆ Fisher-Rao distance and pullback SPD cone distances between multivariate + normal distributions + + +
+ Data sets of multivariate normal distributions abound in many scientific areas like diffusion tensor imaging, structure tensor computer vision, radar signal processing, and machine learning, just to name a few. In order to process those normal data sets for downstream tasks like filtering, classification or clustering, one needs to define proper notions of dissimilarities between normals and paths joining them. The Fisher-Rao distance, defined as the Riemannian geodesic distance induced by the Fisher information metric, is such a principled metric distance; however, it is not known in closed form except for a few particular cases. In this work, we first report a fast and robust method to approximate arbitrarily finely the Fisher-Rao distance between multivariate normal distributions. Second, we introduce a class of distances based on diffeomorphic embeddings of the normal manifold into a submanifold of the higher-dimensional symmetric positive-definite cone corresponding to the manifold of centered normal distributions. We show that the projective Hilbert distance on the cone yields a metric on the embedded normal submanifold, and we pull back that cone distance with its associated straight-line Hilbert cone geodesics to obtain a distance and smooth paths between normal distributions. Compared to the Fisher-Rao distance approximation, the pullback Hilbert cone distance is computationally light since it only requires computing the extreme (minimal and maximal) eigenvalues of matrices. Finally, we show how to use those distances in clustering tasks. + +
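A small sketch of the pullback cone distance follows, assuming the standard Calvo-Oller-style embedding of N(mu, Sigma) as the (d+1)x(d+1) SPD matrix [[Sigma + mu mu^T, mu], [mu^T, 1]]; the paper's exact embedding and normalization may differ. As the abstract notes, only the extreme generalized eigenvalues are needed.

```python
import numpy as np
from scipy.linalg import eigh

def embed_normal(mu, Sigma):
    """Embed N(mu, Sigma) as a (d+1)x(d+1) SPD matrix (one common embedding)."""
    mu = np.asarray(mu, dtype=float).reshape(-1, 1)
    top = np.hstack([Sigma + mu @ mu.T, mu])
    bottom = np.hstack([mu.T, np.ones((1, 1))])
    return np.vstack([top, bottom])

def hilbert_cone_distance(mu1, S1, mu2, S2):
    A, B = embed_normal(mu1, S1), embed_normal(mu2, S2)
    # Hilbert projective distance on the SPD cone: log of the ratio of the extreme
    # generalized eigenvalues of (B, A), i.e. the eigenvalues of A^{-1} B.
    lam = eigh(B, A, eigvals_only=True)   # ascending order
    return float(np.log(lam[-1] / lam[0]))

# Example: distance between two bivariate normals.
d = hilbert_cone_distance([0.0, 0.0], np.eye(2), [1.0, 0.0], 2.0 * np.eye(2))
```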
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ SciBench: Evaluating College-Level Scientific Problem-Solving Abilities + of Large Language Models + + +
+ Recent advances in large language models (LLMs) have demonstrated notable +progress on many mathematical benchmarks. However, most of these benchmarks +only feature problems grounded in junior and senior high school subjects, +contain only multiple-choice questions, and are confined to a limited scope of +elementary arithmetic operations. To address these issues, this paper +introduces an expansive benchmark suite SciBench that aims to systematically +examine the reasoning capabilities required for complex scientific problem +solving. SciBench contains two carefully curated datasets: an open set +featuring a range of collegiate-level scientific problems drawn from +mathematics, chemistry, and physics textbooks, and a closed set comprising +problems from undergraduate-level exams in computer science and mathematics. +Based on the two datasets, we conduct an in-depth benchmark study of two +representative LLMs with various prompting strategies. The results reveal that +current LLMs fall short of delivering satisfactory performance, with an overall +score of merely 35.80%. Furthermore, through a detailed user study, we +categorize the errors made by LLMs into ten problem-solving abilities. Our +analysis indicates that no single prompting strategy significantly outperforms +others and some strategies that demonstrate improvements in certain +problem-solving skills result in declines in other skills. We envision that +SciBench will catalyze further developments in the reasoning abilities of LLMs, +thereby ultimately contributing to scientific research and discovery. + +
+
+ comment: Work in progress, 18 pages +
+
+
+
+
+ + ☆ Generative Language Models on Nucleotide Sequences of Human Genes + + +
+ Language models, primarily transformer-based ones, have obtained colossal success in NLP. To be more precise, studies like BERT in NLU and works such as GPT-3 for NLG have been crucial. DNA sequences are very close to natural language in terms of structure, and in the DNA-related bioinformatics domain, discriminative models such as DNABert already exist. Yet, to the best of our knowledge, the generative side of the coin remains mainly unexplored. Consequently, we focused on developing an autoregressive generative language model, in the style of GPT-3, for DNA sequences. Because working with whole DNA sequences is challenging without substantial computational resources, we carried out our study on a smaller scale, focusing on nucleotide sequences of human genes, i.e., unique parts of DNA with specific functionalities, instead of whole DNA. This decision does not change the problem structure much, since both DNA and genes can be viewed as 1D sequences of four different nucleotides without losing much information or oversimplifying. First, we systematically examined this almost entirely unexplored problem and observed that RNNs performed best, while simple techniques such as N-grams were also promising. Another benefit was learning how to work with generative models on languages we do not understand, unlike natural language, and we observed how essential it is to use real-life tasks beyond classical metrics such as perplexity. Furthermore, we examined whether the data-hungry nature of these models can be mitigated by choosing a language with a minimal vocabulary size (four, owing to the four different nucleotides), since such a language might make the problem easier. However, we observed that this did not substantially change the amount of data needed. + +
+
+
+
+
+ + ☆ Multi-Method Self-Training: Improving Code Generation With Text, And + Vice Versa + + +
+ Large Language Models have many methods for solving the same problem. This +introduces novel strengths (different methods may work well for different +problems) and weaknesses (it may be difficult for users to know which method to +use). In this paper, we introduce Multi-Method Self-Training (MMST), where one +method is trained on the filtered outputs of another, allowing us to augment +the strengths and ameliorate the weaknesses of each method. Using a 176B +parameter model trained on both language and code, we show that MMST can 1) +improve the less performant method (up to 30%) making the model easier to use, +2) improve the more performant method (up to 32.2%) making the model more +performant, and 3) improve the performance of related but distinct tasks (up to +10.3%) by improving the ability of the model to generate rationales. We then +conduct ablation analyses to explore why MMST works. We show that MMST +generates more data than traditional self-training, but the improvement in +performance is driven by the use of multiple methods. We also analyze +prompt-engineering and anti-correlated performance between methods as means of +making MMST more effective. We hope the evidence from our paper motivates +machine learning researchers to explore ways in which advances in language +models allow for new forms of training. + +
+
+ comment: 23 pages, 3 figures +
+
+
+
+
+ + ☆ Detecting deceptive reviews using text classification + + +
+ In recent years, online reviews have played a vital role in promoting any kind of product or service. Businesses may embed fake reviews in order to attract customers to purchase their products. They may even highlight the benefits of their own product or criticize the competition's products. Marketers, advertisers, and other online business users have an incentive to create fake positive reviews for products they want to promote and fake negative reviews for products they dislike. Writing deceptive reviews has thus become a common practice for promoting one's own business or degrading a competitor's reputation, and identifying deceptive reviews is an intense and ongoing research area. This paper proposes a machine learning approach to identify deceptive reviews. It investigates the performance of several experiments conducted on a Deceptive Opinion Spam Corpus dataset of restaurant reviews. We developed an n-gram model with maximum features to identify deceptive content, with a particular focus on fake reviews. Further, we conduct a benchmark study to investigate the performance of two different feature extraction techniques and apply five machine learning classification techniques. The experimental results show that the passive aggressive classifier outperforms the other algorithms, reaching the highest accuracy not only in text classification but also in fake review detection. We also study data augmentation and implement different deep learning techniques. + +
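A minimal scikit-learn pipeline in the spirit of this setup, combining TF-IDF n-gram features capped at a maximum feature count with a passive aggressive classifier, is sketched below; dataset loading, the second feature-extraction technique, and the deep learning variants are left out, and the variable names are illustrative.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def train_fake_review_detector(reviews, labels):
    """reviews: list of review texts; labels: 1 = deceptive, 0 = truthful (user-provided)."""
    X_tr, X_te, y_tr, y_te = train_test_split(reviews, labels, test_size=0.2, random_state=0)
    clf = make_pipeline(
        TfidfVectorizer(ngram_range=(1, 2), max_features=5000),  # n-grams + max features
        PassiveAggressiveClassifier(max_iter=1000),
    )
    clf.fit(X_tr, y_tr)
    print("test accuracy:", accuracy_score(y_te, clf.predict(X_te)))
    return clf
```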
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Heterogeneous Federated Learning: State-of-the-art and Research + Challenges + + +
+ Federated learning (FL) has drawn increasing attention owing to its potential +use in large-scale industrial applications. Existing federated learning works +mainly focus on model homogeneous settings. However, practical federated +learning typically faces the heterogeneity of data distributions, model +architectures, network environments, and hardware devices among participant +clients. Heterogeneous Federated Learning (HFL) is much more challenging, and +corresponding solutions are diverse and complex. Therefore, a systematic survey +on this topic about the research challenges and state-of-the-art is essential. +In this survey, we firstly summarize the various research challenges in HFL +from five aspects: statistical heterogeneity, model heterogeneity, +communication heterogeneity, device heterogeneity, and additional challenges. +In addition, recent advances in HFL are reviewed and a new taxonomy of existing +HFL methods is proposed with an in-depth analysis of their pros and cons. We +classify existing methods from three different levels according to the HFL +procedure: data-level, model-level, and server-level. Finally, several critical +and promising future research directions in HFL are discussed, which may +facilitate further developments in this field. A periodically updated +collection on HFL is available at https://github.com/marswhu/HFL_Survey. + +
+
+ comment: 42 pages, 11 figures, and 4 tables +
+
+
+
+
+ + ☆ Ensemble Learning based Anomaly Detection for IoT Cybersecurity via + Bayesian Hyperparameters Sensitivity Analysis + + +
+ The Internet of Things (IoT) integrates billions of intelligent devices across the globe with the capability of communicating with other connected devices with little to no human intervention. IoT enables data aggregation and analysis on a large scale to improve life quality in many domains. In particular, data collected by IoT contain a tremendous amount of information for anomaly detection. The heterogeneous nature of IoT is both a challenge and an opportunity for cybersecurity. Traditional approaches in cybersecurity monitoring often require different kinds of data pre-processing and handling for various data types, which might be problematic for datasets that contain heterogeneous features. However, heterogeneous types of network devices can often capture a more diverse set of signals than a single type of device, which is particularly useful for anomaly detection. In this paper, we present a comprehensive study on using ensemble machine learning methods for enhancing IoT cybersecurity via anomaly detection. Rather than using one single machine learning model, ensemble learning combines the predictive power of multiple models, enhancing predictive accuracy on heterogeneous datasets. We propose a unified framework with ensemble learning that utilises Bayesian hyperparameter optimisation to adapt to a network environment that contains multiple IoT sensor readings. Experimentally, we illustrate its high predictive power when compared to traditional methods. + +
+
+
+
+
+ + ☆ Forecasting Battery Electric Vehicle Charging Behavior: A Deep Learning + Approach Equipped with Micro-Clustering and SMOTE Techniques + + +
+ Energy systems, climate change, and public health are among the primary reasons for moving toward electrification in transportation. Transportation electrification is being promoted worldwide to reduce emissions. As a result, many automakers will soon start making only battery electric vehicles (BEVs). BEV adoption rates are rising in California, mainly due to climate change and air pollution concerns. While great for climate and pollution goals, improperly managed BEV charging can lead to insufficient charging infrastructure and power outages. This study develops a novel Micro Clustering Deep Neural Network (MCDNN), an artificial neural network algorithm that is highly effective at learning BEV trip and charging data to forecast BEV charging events, information that is essential for electricity load aggregators and utility managers to provide charging stations and electricity capacity effectively. The MCDNN is configured using a robust dataset of trips and charges that occurred in California between 2015 and 2020 from 132 BEVs, spanning 5 BEV models, for a total of 1,570,167 vehicle miles traveled. The numerical findings revealed that the proposed MCDNN is more effective than benchmark approaches in this field, such as support vector machines, k-nearest neighbors, decision trees, and other neural network-based models, in predicting the charging events. + +
+
+ comment: 18 pages,8 figures, 4 tables +
+
+
+
+
+ + ☆ A Holistic Assessment of the Reliability of Machine Learning Systems + + +
+ As machine learning (ML) systems increasingly permeate high-stakes settings +such as healthcare, transportation, military, and national security, concerns +regarding their reliability have emerged. Despite notable progress, the +performance of these systems can significantly diminish due to adversarial +attacks or environmental changes, leading to overconfident predictions, +failures to detect input faults, and an inability to generalize in unexpected +scenarios. This paper proposes a holistic assessment methodology for the +reliability of ML systems. Our framework evaluates five key properties: +in-distribution accuracy, distribution-shift robustness, adversarial +robustness, calibration, and out-of-distribution detection. A reliability score +is also introduced and used to assess the overall system reliability. To +provide insights into the performance of different algorithmic approaches, we +identify and categorize state-of-the-art techniques, then evaluate a selection +on real-world tasks using our proposed reliability metrics and reliability +score. Our analysis of over 500 models reveals that designing for one metric +does not necessarily constrain others but certain algorithmic techniques can +improve reliability across multiple metrics simultaneously. This study +contributes to a more comprehensive understanding of ML reliability and +provides a roadmap for future research and development. + +
+
+
+
+
+ + ☆ Intelligent model for offshore China sea fog forecasting + + +
+ Accurate and timely prediction of sea fog is very important for effectively +managing maritime and coastal economic activities. Given the intricate nature +and inherent variability of sea fog, traditional numerical and statistical +forecasting methods are often proven inadequate. This study aims to develop an +advanced sea fog forecasting method embedded in a numerical weather prediction +model using the Yangtze River Estuary (YRE) coastal area as a case study. Prior +to training our machine learning model, we employ a time-lagged correlation +analysis technique to identify key predictors and decipher the underlying +mechanisms driving sea fog occurrence. In addition, we implement ensemble +learning and a focal loss function to address the issue of imbalanced data, +thereby enhancing the predictive ability of our model. To verify the accuracy +of our method, we evaluate its performance using a comprehensive dataset +spanning one year, which encompasses both weather station observations and +historical forecasts. Remarkably, our machine learning-based approach surpasses +the predictive performance of two conventional methods, the weather research +and forecasting nonhydrostatic mesoscale model (WRF-NMM) and the algorithm +developed by the National Oceanic and Atmospheric Administration (NOAA) +Forecast Systems Laboratory (FSL). Specifically, in regard to predicting sea +fog with a visibility of less than or equal to 1 km with a lead time of 60 +hours, our methodology achieves superior results by increasing the probability +of detection (POD) while simultaneously reducing the false alarm ratio (FAR). + +
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning + + +
+ SecureBoost is a tree-boosting algorithm that leverages homomorphic encryption to protect data privacy in the vertical federated learning setting. It is widely used in fields such as finance and healthcare due to its interpretability, effectiveness, and privacy-preserving capability. However, SecureBoost suffers from high computational complexity and a risk of label leakage. To harness the full potential of SecureBoost, its hyperparameters should be carefully chosen to strike an optimal balance between utility, efficiency, and privacy. Existing methods set hyperparameters either empirically or heuristically, which is far from optimal. To fill this gap, we propose a Constrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto-optimal solutions, each of which is a set of hyperparameters achieving an optimal tradeoff between utility loss, training cost, and privacy leakage. We design measurements of the three objectives. In particular, privacy leakage is measured using our proposed instance clustering attack. Experimental results demonstrate that CMOSB yields not only hyperparameters superior to the baseline but also optimal sets of hyperparameters that can support the flexible requirements of FL participants. + +
+
+
+
+
+ + ☆ Boosting Federated Learning Convergence with Prototype Regularization + + +
+ As a distributed machine learning technique, federated learning (FL) requires +clients to collaboratively train a shared model with an edge server without +leaking their local data. However, the heterogeneous data distribution among +clients often leads to a decrease in model performance. To tackle this issue, +this paper introduces a prototype-based regularization strategy to address the +heterogeneity in the data distribution. Specifically, the regularization +process involves the server aggregating local prototypes from distributed +clients to generate a global prototype, which is then sent back to the +individual clients to guide their local training. The experimental results on +MNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3% +and 8.9% in average test accuracy, respectively, compared to the most popular +baseline FedAvg. Furthermore, our approach has a fast convergence rate in +heterogeneous settings. + +
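The sketch below illustrates the prototype-regularization loop in PyTorch: clients compute class-wise mean embeddings, the server averages them into global prototypes, and each client adds a prototype-alignment term to its loss. The function names and the choice of an MSE alignment term are assumptions for illustration, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def local_prototypes(features, labels, num_classes):
    """Mean feature embedding per class on one client (zeros for absent classes)."""
    return torch.stack([
        features[labels == c].mean(dim=0) if (labels == c).any()
        else features.new_zeros(features.size(1))
        for c in range(num_classes)
    ])

def aggregate_prototypes(client_protos):
    # Server step: average the local prototypes into one global prototype per class.
    return torch.stack(client_protos).mean(dim=0)

def local_loss(logits, features, labels, global_protos, lam=1.0):
    ce = F.cross_entropy(logits, labels)
    # Pull each sample's embedding toward the global prototype of its class.
    reg = F.mse_loss(features, global_protos[labels])
    return ce + lam * reg
```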
+
+
+
+
+ + ☆ Deceptive Alignment Monitoring ICML + + +
+ As the capabilities of large machine learning models continue to grow, and as +the autonomy afforded to such models continues to expand, the spectre of a new +adversary looms: the models themselves. The threat that a model might behave in +a seemingly reasonable manner, while secretly and subtly modifying its behavior +for ulterior reasons is often referred to as deceptive alignment in the AI +Safety & Alignment communities. Consequently, we call this new direction +Deceptive Alignment Monitoring. In this work, we identify emerging directions +in diverse machine learning subfields that we believe will become increasingly +important and intertwined in the near future for deceptive alignment +monitoring, and we argue that advances in these fields present both long-term +challenges and new research opportunities. We conclude by advocating for +greater involvement by the adversarial machine learning community in these +emerging directions. + +
+
+ comment: Accepted as BlueSky Oral to 2023 ICML AdvML Workshop +
+
+
+
+
+ + ☆ FACADE: A Framework for Adversarial Circuit Anomaly Detection and + Evaluation ICML + + +
+ We present FACADE, a novel probabilistic and geometric framework designed for unsupervised mechanistic anomaly detection in deep neural networks. Its primary goal is advancing the understanding and mitigation of adversarial attacks. FACADE aims to generate probabilistic distributions over circuits, which provide critical insights into their contribution to changes in the manifold properties of pseudo-classes, or high-dimensional modes in activation space, yielding a powerful tool for uncovering and combating adversarial attacks. Our approach seeks to improve model robustness and enhance scalable model oversight, and it demonstrates promising applications in real-world deployment settings. + +
+
+ comment: Accepted as BlueSky Poster at 2023 ICML AdvML Workshop +
+
+
+
+
+ + ☆ Shared Adversarial Unlearning: Backdoor Mitigation by Unlearning Shared + Adversarial Examples + + +
+ Backdoor attacks are serious security threats to machine learning models, where an adversary can inject poisoned samples into the training set, causing a backdoored model that predicts poisoned samples containing particular triggers as particular target classes while behaving normally on benign samples. In this paper, we explore the task of purifying a backdoored model using a small clean dataset. By establishing the connection between backdoor risk and adversarial risk, we derive a novel upper bound for backdoor risk, which mainly captures the risk on the shared adversarial examples (SAEs) between the backdoored model and the purified model. This upper bound further suggests a novel bi-level optimization problem for mitigating backdoors using adversarial training techniques. To solve it, we propose Shared Adversarial Unlearning (SAU). Specifically, SAU first generates SAEs and then unlearns them so that they are either correctly classified by the purified model or classified differently by the two models, such that the backdoor effect in the backdoored model is mitigated in the purified model. Experiments on various benchmark datasets and network architectures show that our proposed method achieves state-of-the-art performance for backdoor defense. + +
+
+
+
+
+ + ☆ Post-variational quantum neural networks + + +
+ Quantum computing has the potential to provide substantial computational +advantages over current state-of-the-art classical supercomputers. However, +current hardware is not advanced enough to execute fault-tolerant quantum +algorithms. An alternative of using hybrid quantum-classical computing with +variational algorithms can exhibit barren plateau issues, causing slow +convergence of gradient-based optimization techniques. In this paper, we +discuss "post-variational strategies", which shift tunable parameters from the +quantum computer to the classical computer, opting for ensemble strategies when +optimizing quantum models. We discuss various strategies and design principles +for constructing individual quantum circuits, where the resulting ensembles can +be optimized with convex programming. Further, we discuss architectural designs +of post-variational quantum neural networks and analyze the propagation of +estimation errors throughout such neural networks. Lastly, we show that our +algorithm can be applied to real-world applications such as image +classification on handwritten digits, producing a 96% classification accuracy. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Air Traffic Controller Workload Level Prediction using Conformalized + Dynamical Graph Learning + + +
+ Air traffic control (ATC) is a safety-critical service system that demands constant attention from ground air traffic controllers (ATCos) to maintain daily aviation operations. The workload of the ATCos can have negative effects on operational safety and airspace usage. To avoid overloading and ensure an acceptable workload level for the ATCos, it is important to predict the ATCos' workload accurately for mitigation actions. In this paper, we first perform a review of research on ATCo workload, mostly from the air traffic perspective. Then, we briefly introduce the setup of the human-in-the-loop (HITL) simulations with retired ATCos, where the air traffic data and workload labels are obtained. The simulations are conducted under three Phoenix approach scenarios, and the human ATCos are requested to self-evaluate their workload ratings (i.e., low-1 to high-7). Preliminary data analysis is conducted. Next, we propose a graph-based deep-learning framework with conformal prediction to identify the ATCo workload levels. The number of aircraft under the controller's control varies both spatially and temporally, resulting in dynamically evolving graphs. The experiment results suggest that (a) besides the traffic density feature, the traffic conflict feature (i.e., minimum horizontal/vertical separation distance) contributes to the workload prediction capabilities; (b) directly learning from the spatiotemporal graph layout of airspace with a graph neural network can achieve higher prediction accuracy, compared to hand-crafted traffic complexity features; and (c) conformal prediction is a valuable tool to further boost model prediction accuracy, resulting in a range of predicted workload labels. The code used is available at \href{https://github.com/ymlasu/para-atm-collection/blob/master/air-traffic-prediction/ATC-Workload-Prediction/}{$\mathsf{Link}$}. + +
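For the conformal-prediction component, a generic split-conformal sketch for classification is shown below: calibration nonconformity scores yield a quantile, and each test point then receives a set of candidate workload labels rather than a single prediction. This is a standard recipe under an exchangeability assumption, not necessarily the authors' exact procedure.

```python
import numpy as np

def conformal_prediction_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    """cal_probs: (n, C) predicted probabilities on a calibration split;
    cal_labels: (n,) integer labels; test_probs: (m, C). Returns one label set per test point
    with marginal coverage >= 1 - alpha under exchangeability."""
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]        # nonconformity of true labels
    k = min(int(np.ceil((n + 1) * (1.0 - alpha))) - 1, n - 1)
    q = np.sort(scores)[k]                                    # conformal quantile
    return [np.where(1.0 - p <= q)[0] for p in test_probs]   # set of plausible workload labels
```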
+
+
+
+
+ + ☆ SC VALL-E: Style-Controllable Zero-Shot Text to Speech Synthesizer + + +
+ Expressive speech synthesis models are trained by adding corpora with diverse speakers, various emotions, and different speaking styles to the dataset, in order to control various characteristics of speech and generate the desired voice. In this paper, we propose a style control (SC) VALL-E model based on the neural codec language model (called VALL-E), which follows the structure of the generative pretrained transformer 3 (GPT-3). The proposed SC VALL-E takes input from text sentences and prompt audio and is designed to generate controllable speech by not simply mimicking the characteristics of the prompt audio but by controlling the attributes to produce diverse voices. We identify tokens in the style embedding matrix of the newly designed style network that represent attributes such as emotion, speaking rate, pitch, and voice intensity, and design a model that can control these attributes. To evaluate the performance of SC VALL-E, we conduct comparative experiments with three representative expressive speech synthesis models: global style token (GST) Tacotron2, variational autoencoder (VAE) Tacotron2, and the original VALL-E. We measure word error rate (WER), F0 voiced error (FVE), and F0 gross pitch error (F0GPE) as evaluation metrics to assess the accuracy of generated sentences. For comparing the quality of synthesized speech, we measure the comparative mean opinion score (CMOS) and similarity mean opinion score (SMOS). To evaluate the style control ability of the generated speech, we observe the changes in F0 and the mel-spectrogram when modifying the trained tokens. When using prompt audio that is not present in the training data, SC VALL-E generates a variety of expressive sounds and demonstrates competitive performance compared to the existing models. Our implementation, pretrained models, and audio samples are located on GitHub. + +
+
+
+
+
+ + ☆ Differentially Flat Learning-based Model Predictive Control Using a + Stability, State, and Input Constraining Safety Filter + + +
+ Learning-based optimal control algorithms control unknown systems using past +trajectory data and a learned model of the system dynamics. These controllers +use either a linear approximation of the learned dynamics, trading performance +for faster computation, or nonlinear optimization methods, which typically +perform better but can limit real-time applicability. In this work, we present +a novel nonlinear controller that exploits differential flatness to achieve +similar performance to state-of-the-art learning-based controllers but with +significantly less computational effort. Differential flatness is a property of +dynamical systems whereby nonlinear systems can be exactly linearized through a +nonlinear input mapping. Here, the nonlinear transformation is learned as a +Gaussian process and is used in a safety filter that guarantees, with high +probability, stability as well as input and flat state constraint satisfaction. +This safety filter is then used to refine inputs from a flat model predictive +controller to perform constrained nonlinear learning-based optimal control +through two successive convex optimizations. We compare our method to +state-of-the-art learning-based control strategies and achieve similar +performance, but with significantly better computational efficiency, while also +respecting flat state and input constraints, and guaranteeing stability. + +
+
+ comment: 6 pages, 5 figures, Published in IEEE Control Systems Letters +
+
+
+
+
+ + ☆ Fast Unsupervised Deep Outlier Model Selection with Hypernetworks + + +
+ Outlier detection (OD) finds many applications with a rich literature of numerous techniques. Deep neural network based OD (DOD) has seen a recent surge of attention thanks to the many advances in deep learning. In this paper, we consider a critical-yet-understudied challenge with unsupervised DOD, that is, effective hyperparameter (HP) tuning/model selection. While several prior works report the sensitivity of OD models to HPs, the issue becomes ever more critical for modern DOD models that exhibit a long list of HPs. We introduce HYPER for tuning DOD models, tackling two fundamental challenges: (1) validation without supervision (due to the lack of labeled anomalies), and (2) efficient search of the HP/model space (due to the exponential growth in the number of HPs). A key idea is to design and train a novel hypernetwork (HN) that maps HPs onto the optimal weights of the main DOD model. In turn, HYPER capitalizes on a single HN that can dynamically generate weights for many DOD models (corresponding to varying HPs), which offers significant speed-up. In addition, it employs meta-learning on historical OD tasks with labels to train a proxy validation function, likewise trained efficiently with our proposed HN. Extensive experiments on 35 OD tasks show that HYPER achieves high performance against 8 baselines with significant efficiency gains. + +
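The core trick can be sketched as a hypernetwork that maps a hyperparameter vector to the weights of a small detector, so that scoring a new HP configuration only requires a forward pass instead of retraining. The toy below generates the weights of a one-hidden-layer autoencoder whose reconstruction error serves as the outlier score; the sizes and names are illustrative, and HYPER's meta-learned proxy validation function is not shown.

```python
import torch
import torch.nn as nn

class HyperNet(nn.Module):
    """Maps a hyperparameter vector (e.g., dropout rate, width scale) to the weights
    of a small one-hidden-layer autoencoder used as an outlier detector."""
    def __init__(self, hp_dim, d_in, hidden):
        super().__init__()
        self.d_in, self.hidden = d_in, hidden
        n_weights = d_in * hidden + hidden + hidden * d_in + d_in
        self.net = nn.Sequential(nn.Linear(hp_dim, 128), nn.ReLU(), nn.Linear(128, n_weights))

    def forward(self, hp, x):
        w = self.net(hp)                                   # generated detector weights
        i = 0
        W1 = w[i:i + self.d_in * self.hidden].view(self.hidden, self.d_in)
        i += self.d_in * self.hidden
        b1 = w[i:i + self.hidden]
        i += self.hidden
        W2 = w[i:i + self.hidden * self.d_in].view(self.d_in, self.hidden)
        i += self.hidden * self.d_in
        b2 = w[i:]
        h = torch.relu(x @ W1.t() + b1)                    # encoder of the generated detector
        recon = h @ W2.t() + b2                            # decoder
        return ((recon - x) ** 2).mean(dim=-1)             # reconstruction error = outlier score
```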
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Beyond Black-Box Advice: Learning-Augmented Algorithms for MDPs with + Q-Value Predictions + + +
+ We study the tradeoff between consistency and robustness in the context of a single-trajectory time-varying Markov Decision Process (MDP) with untrusted machine-learned advice. Our work departs from the typical approach of treating advice as coming from black-box sources by instead considering a setting where additional information about how the advice is generated is available. We prove a first-of-its-kind consistency and robustness tradeoff given Q-value advice under a general MDP model that includes both continuous and discrete state/action spaces. Our results highlight that utilizing Q-value advice enables dynamic pursuit of the better of machine-learned advice and a robust baseline, thus resulting in near-optimal performance guarantees, which provably improve upon what can be obtained solely with black-box advice. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ FedSoup: Improving Generalization and Personalization in Federated + Learning via Selective Model Interpolation MICCAI2023 + + +
+ Cross-silo federated learning (FL) enables the development of machine +learning models on datasets distributed across data centers such as hospitals +and clinical research laboratories. However, recent research has found that +current FL algorithms face a trade-off between local and global performance +when confronted with distribution shifts. Specifically, personalized FL methods +have a tendency to overfit to local data, leading to a sharp valley in the +local model and inhibiting its ability to generalize to out-of-distribution +data. In this paper, we propose a novel federated model soup method (i.e., +selective interpolation of model parameters) to optimize the trade-off between +local and global performance. Specifically, during the federated training +phase, each client maintains its own global model pool by monitoring the +performance of the interpolated model between the local and global models. This +allows us to alleviate overfitting and seek flat minima, which can +significantly improve the model's generalization performance. We evaluate our +method on retinal and pathological image classification tasks, and our proposed +method achieves significant improvements for out-of-distribution +generalization. Our code is available at https://github.com/ubc-tea/FedSoup. + +
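A minimal sketch of the selective interpolation step, assuming a client-supplied validation metric: parameters of the local and global models are averaged at several mixing ratios and the best-scoring "soup" is kept. The candidate ratios and the selection rule here are illustrative, not the paper's exact procedure.

```python
import copy
import torch

def interpolate_models(local_model, global_model, alpha):
    """Parameter-wise interpolation between the local and global models ("model soup")."""
    soup = copy.deepcopy(local_model)
    with torch.no_grad():
        for p_s, p_l, p_g in zip(soup.parameters(), local_model.parameters(),
                                 global_model.parameters()):
            p_s.copy_(alpha * p_l + (1.0 - alpha) * p_g)
    return soup

def select_soup(local_model, global_model, val_metric, alphas=(0.0, 0.25, 0.5, 0.75, 1.0)):
    # Each client keeps the interpolation that maximizes its own validation metric.
    candidates = [interpolate_models(local_model, global_model, a) for a in alphas]
    return max(candidates, key=val_metric)
```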
+
+ comment: Accepted by MICCAI2023 +
+
+
+
+
+ + ☆ Identifying Interpretable Subspaces in Image Representations ICML 2023 + + +
+ We propose Automatic Feature Explanation using Contrasting Concepts (FALCON), +an interpretability framework to explain features of image representations. For +a target feature, FALCON captions its highly activating cropped images using a +large captioning dataset (like LAION-400m) and a pre-trained vision-language +model like CLIP. Each word among the captions is scored and ranked leading to a +small number of shared, human-understandable concepts that closely describe the +target feature. FALCON also applies contrastive interpretation using lowly +activating (counterfactual) images, to eliminate spurious concepts. Although +many existing approaches interpret features independently, we observe in +state-of-the-art self-supervised and supervised models, that less than 20% of +the representation space can be explained by individual features. We show that +features in larger spaces become more interpretable when studied in groups and +can be explained with high-order scoring concepts through FALCON. We discuss +how extracted concepts can be used to explain and debug failures in downstream +tasks. Finally, we present a technique to transfer concepts from one +(explainable) representation space to another unseen representation space by +learning a simple linear transformation. + +
+
+ comment: Published at ICML 2023 +
+
+
+
+
+ + ☆ GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition + in Surgical Videos MICCAI 2023 + + +
+ Automated surgical step recognition is an important task that can significantly improve patient safety and decision-making during surgeries. Existing state-of-the-art methods for surgical step recognition either rely on separate, multi-stage modeling of spatial and temporal information or operate on short-range temporal resolution when learned jointly. However, the benefits of jointly modeling spatio-temporal features and long-range information are not taken into account. In this paper, we propose a vision transformer-based approach to jointly learn spatio-temporal features directly from sequences of frame-level patches. Our method incorporates a gated-temporal attention mechanism that intelligently combines short-term and long-term spatio-temporal feature representations. We extensively evaluate our approach on two cataract surgery video datasets, namely Cataract-101 and D99, and demonstrate superior performance compared to various state-of-the-art methods. These results validate the suitability of our proposed approach for automated surgical step recognition. Our code is released at: https://github.com/nisargshah1999/GLSFormer + +
+
+ comment: Accepted to MICCAI 2023 (Early Accept) +
+
+
+
+
+ + ☆ Amortized Variational Inference: When and Why? + + +
+ Amortized variational inference (A-VI) is a method for approximating the +intractable posterior distributions that arise in probabilistic models. The +defining feature of A-VI is that it learns a global inference function that +maps each observation to its local latent variable's approximate posterior. +This stands in contrast to the more classical factorized (or mean-field) +variational inference (F-VI), which directly learns the parameters of the +approximating distribution for each latent variable. In deep generative models, +A-VI is used as a computational trick to speed up inference for local latent +variables. In this paper, we study A-VI as a general alternative to F-VI for +approximate posterior inference. A-VI cannot produce an approximation with a +lower Kullback-Leibler divergence than F-VI's optimal solution, because the +amortized family is a subset of the factorized family. Thus a central +theoretical problem is to characterize when A-VI still attains F-VI's optimal +solution. We derive conditions on both the model and the inference function +under which A-VI can theoretically achieve F-VI's optimum. We show that for a +broad class of hierarchical models, including deep generative models, it is +possible to close the gap between A-VI and F-VI. Further, for an even broader +class of models, we establish when and how to expand the domain of the +inference function to make amortization a feasible strategy. Finally, we prove +that for certain models -- including hidden Markov models and Gaussian +processes -- A-VI cannot match F-VI's solution, no matter how expressive the +inference function is. We also study A-VI empirically. On several examples, we +corroborate our theoretical results and investigate the performance of A-VI +when varying the complexity of the inference function. When the gap between +A-VI and F-VI can be closed, we find that the required complexity of the +function need not scale with the number of observations, and that A-VI often +converges faster than F-VI. + +
+
+
+
+
+ + ☆ On the Fisher-Rao Gradient of the Evidence Lower Bound + + +
+ This article studies the Fisher-Rao gradient, also referred to as the natural gradient, of the evidence lower bound (ELBO), which plays a crucial role within the theory of the Variational Autoencoder, the Helmholtz Machine, and the Free Energy Principle. The natural gradient of the ELBO is related to the natural gradient of the Kullback-Leibler divergence from a target distribution, the prime objective function of learning. Based on invariance properties of gradients within information geometry, conditions on the underlying model are provided that ensure the equivalence of minimising the prime objective function and maximising the ELBO. + +
+
+
+
+
+ + ☆ On-Sensor Data Filtering using Neuromorphic Computing for High Energy + Physics Experiments + + +
+ This work describes the investigation of neuromorphic computing-based spiking +neural network (SNN) models used to filter data from sensor electronics in high +energy physics experiments conducted at the High Luminosity Large Hadron +Collider. We present our approach for developing a compact neuromorphic model +that filters out the sensor data based on the particle's transverse momentum +with the goal of reducing the amount of data being sent to the downstream +electronics. The incoming charge waveforms are converted to streams of +binary-valued events, which are then processed by the SNN. We present our +insights on the various system design choices - from data encoding to optimal +hyperparameters of the training algorithm - for an accurate and compact SNN +optimized for hardware deployment. Our results show that an SNN trained with an +evolutionary algorithm and an optimized set of hyperparameters obtains a signal +efficiency of about 91% with nearly half as many parameters as a deep neural +network. + +
+
+ comment: Manuscript accepted at ICONS'23 +
+
+
+
+
+ + ☆ Edgewise outliers of network indexed signals + + +
+ We consider models for network indexed multivariate data involving a dependence between variables as well as across graph nodes. In the framework of these models, we focus on outlier detection and introduce the concept of edgewise outliers. For this purpose, we first derive the distribution of some sums of squares, in particular squared Mahalanobis distances, that can be used to set detection rules and thresholds for outlier detection. We then propose a robust version of the deterministic MCD algorithm that we call edgewise MCD. An application on simulated data demonstrates the benefit of taking the dependence structure into account. We also illustrate the utility of the proposed method with a real data set. + +
+
+
+
+
+ + ☆ QDC: Quantum Diffusion Convolution Kernels on Graphs + + +
+ Graph convolutional neural networks (GCNs) operate by aggregating messages over local neighborhoods given the prediction task of interest. Many GCNs can be understood as a form of generalized diffusion of input features on the graph, and significant work has been dedicated to improving predictive accuracy by altering the ways of message passing. In this work, we propose a new convolution kernel that effectively rewires the graph according to the occupation correlations of the vertices by trading the generalized diffusion paradigm for the propagation of a quantum particle over the graph. We term this new convolution kernel the Quantum Diffusion Convolution (QDC) operator. In addition, we introduce a multiscale variant that combines messages from the QDC operator and the traditional combinatorial Laplacian. To understand our method, we explore the spectral dependence of homophily and the importance of quantum dynamics in the construction of a bandpass filter. Through these studies, as well as experiments on a range of datasets, we observe that QDC improves predictive performance on the widely used benchmark datasets when compared to similar methods. + +
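One simplified reading of the QDC rewiring is sketched below: propagate a continuous-time quantum walk for time t, take the squared magnitudes of the propagator entries as occupation probabilities, and use the sparsified, row-normalized result as the convolution kernel. The thresholding and normalization choices are assumptions, not the authors' implementation.

```python
import numpy as np
from scipy.linalg import expm

def quantum_diffusion_kernel(adjacency, t=1.0, threshold=1e-3):
    """Rewire a graph via the occupation probabilities of a quantum walk of duration t."""
    deg = np.diag(adjacency.sum(axis=1))
    laplacian = deg - adjacency
    propagator = expm(-1j * t * laplacian)       # continuous-time quantum walk operator
    occupation = np.abs(propagator) ** 2         # transition probabilities |U_ij|^2
    occupation[occupation < threshold] = 0.0     # sparsify the rewired graph
    row_sums = occupation.sum(axis=1, keepdims=True)
    return occupation / np.clip(row_sums, 1e-12, None)   # row-normalized kernel
```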
+
+
+
+
+ + ☆ From Adaptive Query Release to Machine Unlearning ICML 2023 + + +
+ We formalize the problem of machine unlearning as design of efficient +unlearning algorithms corresponding to learning algorithms which perform a +selection of adaptive queries from structured query classes. We give efficient +unlearning algorithms for linear and prefix-sum query classes. As applications, +we show that unlearning in many problems, in particular, stochastic convex +optimization (SCO), can be reduced to the above, yielding improved guarantees +for the problem. In particular, for smooth Lipschitz losses and any $\rho>0$, +our results yield an unlearning algorithm with excess population risk of +$\tilde O\big(\frac{1}{\sqrt{n}}+\frac{\sqrt{d}}{n\rho}\big)$ with unlearning +query (gradient) complexity $\tilde O(\rho \cdot \text{Retraining +Complexity})$, where $d$ is the model dimensionality and $n$ is the initial +number of samples. For non-smooth Lipschitz losses, we give an unlearning +algorithm with excess population risk $\tilde +O\big(\frac{1}{\sqrt{n}}+\big(\frac{\sqrt{d}}{n\rho}\big)^{1/2}\big)$ with the +same unlearning query (gradient) complexity. Furthermore, in the special case +of Generalized Linear Models (GLMs), such as those in linear and logistic +regression, we get dimension-independent rates of $\tilde +O\big(\frac{1}{\sqrt{n}} +\frac{1}{(n\rho)^{2/3}}\big)$ and $\tilde +O\big(\frac{1}{\sqrt{n}} +\frac{1}{(n\rho)^{1/3}}\big)$ for smooth Lipschitz +and non-smooth Lipschitz losses respectively. Finally, we give generalizations +of the above from one unlearning request to \textit{dynamic} streams consisting +of insertions and deletions. + +
+
+ comment: Accepted to ICML 2023 +
+
+
+
+
+ + ☆ Jina Embeddings: A Novel Set of High-Performance Sentence Embedding + Models EMNLP 2023 + + +
+ Jina Embeddings constitutes a set of high-performance sentence embedding +models adept at translating various textual inputs into numerical +representations, thereby capturing the semantic essence of the text. While +these models are not exclusively designed for text generation, they excel in +applications such as dense retrieval and semantic textual similarity. This +paper details the development of Jina Embeddings, starting with the creation of +a high-quality pairwise and triplet dataset. It underlines the crucial role of +data cleaning in dataset preparation, gives in-depth insights into the model +training process, and concludes with a comprehensive performance evaluation +using the Massive Textual Embedding Benchmark (MTEB). + +
+
+ comment: 9 pages, 2 page appendix, EMNLP 2023 Industrial Track +
+
+
+
+
+ + ☆ FairMobi-Net: A Fairness-aware Deep Learning Model for Urban Mobility + Flow Generation + + +
+ Generating realistic human flows across regions is essential for our +understanding of urban structures and population activity patterns, enabling +important applications in the fields of urban planning and management. However, +a notable shortcoming of most existing mobility generation methodologies is +neglect of prediction fairness, which can result in underestimation of mobility +flows across regions with vulnerable population groups, potentially resulting +in inequitable resource distribution and infrastructure development. To +overcome this limitation, our study presents a novel, fairness-aware deep +learning model, FairMobi-Net, for inter-region human flow prediction. The +FairMobi-Net model uniquely incorporates fairness loss into the loss function +and employs a hybrid approach, merging binary classification and numerical +regression techniques for human flow prediction. We validate the FairMobi-Net +model using comprehensive human mobility datasets from four U.S. cities, +predicting human flow at the census-tract level. Our findings reveal that the +FairMobi-Net model outperforms state-of-the-art models (such as the DeepGravity +model) in producing more accurate and equitable human flow predictions across a +variety of region pairs, regardless of regional income differences. The model +maintains a high degree of accuracy consistently across diverse regions, +addressing the previous fairness concern. Further analysis of feature +importance elucidates the impact of physical distances and road network +structures on human flows across regions. With fairness as its touchstone, the +model and results provide researchers and practitioners across the fields of +urban sciences, transportation engineering, and computing with an effective +tool for accurate generation of human mobility flows across regions. + +
+
+
+
+
+ + ☆ The Effect of Epidemiological Cohort Creation on the Machine Learning + Prediction of Homelessness and Police Interaction Outcomes Using + Administrative Health Care Data + + +
+ Background: Mental illness can lead to adverse outcomes such as homelessness +and police interaction and understanding of the events leading up to these +adverse outcomes is important. Predictive models may help identify individuals +at risk of such adverse outcomes. Using a fixed observation window cohort with +logistic regression (LR) or machine learning (ML) models can result in lower +performance when compared with adaptive and parcellated windows. Method: An +administrative healthcare dataset was used, comprising of 240,219 individuals +in Calgary, Alberta, Canada who were diagnosed with addiction or mental health +(AMH) between April 1, 2013, and March 31, 2018. The cohort was followed for 2 +years to identify factors associated with homelessness and police interactions. +To understand the benefit of flexible windows to predictive models, an +alternative cohort was created. Then LR and ML models, including random forests +(RF), and extreme gradient boosting (XGBoost) were compared in the two cohorts. +Results: Among 237,602 individuals, 0.8% (1,800) experienced first +homelessness, while 0.32% (759) reported initial police interaction among +237,141 individuals. Male sex (AORs: H=1.51, P=2.52), substance disorder (AORs: +H=3.70, P=2.83), psychiatrist visits (AORs: H=1.44, P=1.49), and drug abuse +(AORs: H=2.67, P=1.83) were associated with initial homelessness (H) and police +interaction (P). XGBoost showed superior performance using the flexible method +(sensitivity =91%, AUC =90% for initial homelessness, and sensitivity =90%, +AUC=89% for initial police interaction) + Conclusion: This study identified key features associated with initial +homelessness and police interaction and demonstrated that flexible windows can +improve predictive modeling. + +
+
+ comment: to be published in Frontiers in Digital Health, Health Informatics +
+
+
+
+
+ + ☆ Clinical Trial Active Learning + + +
+ This paper presents a novel approach to active learning that takes into account the non-independent and identically distributed (non-i.i.d.) structure of a clinical trial setting. There exist two types of clinical trials: retrospective and prospective. Retrospective clinical trials analyze data after treatment has been performed; prospective clinical trials collect data as treatment is ongoing. Typically, active learning approaches assume the dataset is i.i.d. when selecting training samples; however, in the case of clinical trials, treatment results in a dependency between the data collected at the current and past visits. Thus, we propose prospective active learning to overcome the limitations present in traditional active learning methods and apply it to disease detection in optical coherence tomography (OCT) images, where we condition on the time an image was collected to enforce the i.i.d. assumption. We compare our proposed method to the traditional active learning paradigm, which we refer to as retrospective in nature. We demonstrate that prospective active learning outperforms retrospective active learning in two different types of test settings. + +
+
+ comment: Accepted at 14th ACM International Conference on Bioinformatics, + Computational Biology and Health Informatics (ACM-BCB) +
+
+
+
+
+ + ☆ Heuristic Hyperparameter Choice for Image Anomaly Detection + + +
+ Anomaly detection (AD) in images is a fundamental computer vision problem in which deep neural networks are used to identify images deviating significantly from normality. The deep features extracted from pretrained models have proved to be essential for AD based on multivariate Gaussian distribution analysis. However, since the models are usually pretrained on a large dataset for classification tasks such as ImageNet, they may produce many redundant features for AD, which increases computational cost and degrades performance. We aim to reduce the dimensionality of these features using Negated Principal Component Analysis (NPCA). We therefore propose heuristics for choosing the hyperparameters of the NPCA algorithm so as to retain as few feature components as possible while ensuring good performance. + +
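The sketch below shows one plausible form of negated PCA for this setting: discard the leading principal directions and project the features onto the remaining minor components. The specific rule for choosing how much variance to discard is a stand-in for the heuristics proposed in the paper.

```python
import numpy as np
from sklearn.decomposition import PCA

def npca_projection(features, var_to_discard=0.9):
    """Project features onto the least-variant ("negated") principal components.
    `var_to_discard` is an illustrative hyperparameter: the fraction of variance
    carried by the leading components that are dropped."""
    pca = PCA().fit(features)
    cum = np.cumsum(pca.explained_variance_ratio_)
    k = int(np.searchsorted(cum, var_to_discard)) + 1   # number of leading components to drop
    tail = pca.components_[k:]                           # minor components kept for AD
    return (features - pca.mean_) @ tail.T
```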
+
+
+
+
+ + ☆ Exploring reinforcement learning techniques for discrete and continuous + control tasks in the MuJoCo environment + + +
+ We leverage the fast physics simulator MuJoCo to run tasks in a continuous
+control environment and describe details such as the observation space, action
+space, and rewards for each task. We benchmark value-based methods for
+continuous control by comparing Q-learning and SARSA through a discretization
+approach and, using them as baselines, progressively move to one of the
+state-of-the-art deep policy gradient methods, DDPG. Over a large number of
+episodes, Q-learning outscored SARSA, but DDPG outperformed both within a small
+number of episodes. Lastly, we also fine-tuned the model hyperparameters,
+expecting to squeeze out more performance while using less time and fewer
+resources. We anticipated that the new design for DDPG would vastly improve
+performance, yet after only a few episodes we were able to achieve decent
+average rewards. We expect performance to improve further given adequate time
+and computational resources.
+
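+ The two value-based baselines above differ only in their bootstrapping target.
+A minimal sketch of the tabular update rules on a discretized state space
+(state/action counts and learning rates are illustrative assumptions):
+
+    import numpy as np
+
+    n_states, n_actions = 100, 5        # discretized observation / action bins
+    alpha, gamma, eps = 0.1, 0.99, 0.1
+    Q = np.zeros((n_states, n_actions))
+
+    def eps_greedy(s):
+        if np.random.rand() < eps:
+            return np.random.randint(n_actions)
+        return int(Q[s].argmax())
+
+    def q_learning_update(s, a, r, s_next):
+        # Off-policy: bootstrap with the greedy value of the next state.
+        Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])
+
+    def sarsa_update(s, a, r, s_next, a_next):
+        # On-policy: bootstrap with the action actually selected next.
+        Q[s, a] += alpha * (r + gamma * Q[s_next, a_next] - Q[s, a])
+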
+
+ comment: Released @ Dec 2021. For associated project files, see + https://github.com/chakrabortyde/mujoco-control-tasks +
+
+
+
+
+ + ♻ ☆ Frequency Domain Adversarial Training for Robust Volumetric Medical + Segmentation MICCAI 2023 + + +
+ It is imperative to ensure the robustness of deep learning models in critical
+applications such as healthcare. While recent advances in deep learning have
+improved the performance of volumetric medical image segmentation models, these
+models cannot be deployed for real-world applications immediately due to their
+vulnerability to adversarial attacks. We present a 3D frequency domain
+adversarial attack for volumetric medical image segmentation models and
+demonstrate its advantages over conventional input or voxel domain attacks.
+Using our proposed attack, we introduce a novel frequency domain adversarial
+training approach for optimizing a robust model against voxel and frequency
+domain attacks. Moreover, we propose a frequency consistency loss to regularize
+our frequency domain adversarial training, achieving a better trade-off between
+the model's performance on clean and adversarial samples. Code is publicly
+available at https://github.com/asif-hanif/vafa.
+
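+ The paper's optimized attack is not reproduced here, but the sketch below
+illustrates the basic idea of perturbing a volume in the frequency domain
+rather than voxel space: take the 3D FFT, apply a bounded perturbation to the
+coefficients, and transform back. The perturbation scheme and bound are
+illustrative assumptions.
+
+    import numpy as np
+
+    def frequency_domain_perturbation(volume, epsilon=0.05, seed=0):
+        """Bounded random perturbation of the 3D Fourier coefficients of a
+        volume, mapped back to voxel space (illustration only, not the
+        paper's adversarially optimized attack)."""
+        rng = np.random.default_rng(seed)
+        spectrum = np.fft.fftn(volume)
+        noise = rng.uniform(-epsilon, epsilon, size=spectrum.shape)
+        perturbed = spectrum * (1.0 + noise)    # relative change per frequency
+        return np.real(np.fft.ifftn(perturbed))
+
+    vol = np.random.rand(32, 32, 32).astype(np.float32)
+    adv = frequency_domain_perturbation(vol)
+    print("max voxel change:", float(np.abs(adv - vol).max()))
+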
+
+ comment: This paper has been accepted in MICCAI 2023 conference +
+
+
+
+
+ + ♻ ☆ Mathematical Capabilities of ChatGPT + + +
+ We investigate the mathematical capabilities of two iterations of ChatGPT +(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on +publicly available datasets, as well as hand-crafted ones, using a novel +methodology. In contrast to formal mathematics, where large databases of formal +proofs are available (e.g., the Lean Mathematical Library), current datasets of +natural-language mathematics, used to benchmark language models, either cover +only elementary mathematics or are very small. We address this by publicly +releasing two new datasets: GHOSTS and miniGHOSTS. These are the first +natural-language datasets curated by working researchers in mathematics that +(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of +the mathematical capabilities of language models, and (3) distinguish multiple +dimensions of mathematical reasoning. These datasets also test whether ChatGPT +and GPT-4 can be helpful assistants to professional mathematicians by emulating +use cases that arise in the daily professional activities of mathematicians. We +benchmark the models on a range of fine-grained performance metrics. For +advanced mathematics, this is the most detailed evaluation effort to date. We +find that ChatGPT can be used most successfully as a mathematical assistant for +querying facts, acting as a mathematical search engine and knowledge base +interface. GPT-4 can additionally be used for undergraduate-level mathematics +but fails on graduate-level difficulty. Contrary to many positive reports in +the media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of +selection bias), their overall mathematical performance is well below the level +of a graduate student. Hence, if your goal is to use ChatGPT to pass a +graduate-level math exam, you would be better off copying from your average +peer! + +
+
+ comment: Added further evaluations on another ChatGPT version and on GPT-4. + The GHOSTS and miniGHOSTS datasets are available at + https://github.com/xyfrieder/science-GHOSTS +
+
+
+
+
+ + ♻ ☆ Torchhd: An Open Source Python Library to Support Research on + Hyperdimensional Computing and Vector Symbolic Architectures + + +
+ Hyperdimensional computing (HD), also known as vector symbolic architectures +(VSA), is a framework for computing with distributed representations by +exploiting properties of random high-dimensional vector spaces. The commitment +of the scientific community to aggregate and disseminate research in this +particularly multidisciplinary area has been fundamental for its advancement. +Joining these efforts, we present Torchhd, a high-performance open source +Python library for HD/VSA. Torchhd seeks to make HD/VSA more accessible and +serves as an efficient foundation for further research and application +development. The easy-to-use library builds on top of PyTorch and features +state-of-the-art HD/VSA functionality, clear documentation, and implementation +examples from well-known publications. Comparing publicly available code with +their corresponding Torchhd implementation shows that experiments can run up to +100x faster. Torchhd is available at: +https://github.com/hyperdimensional-computing/torchhd. + +
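+ For readers unfamiliar with HD/VSA, the sketch below shows the two core
+operations such a library provides -- binding (element-wise product) and
+bundling (element-wise majority) of random bipolar hypervectors -- in plain
+NumPy; it deliberately does not use the Torchhd API, whose exact interface is
+not described in this abstract.
+
+    import numpy as np
+
+    D = 10_000                                    # hypervector dimensionality
+    rng = np.random.default_rng(0)
+
+    def random_hv():
+        return rng.choice([-1, 1], size=D)        # random bipolar hypervector
+
+    def bind(a, b):
+        return a * b                              # binding: element-wise product
+
+    def bundle(*vs):
+        return np.sign(np.sum(vs, axis=0))        # bundling: element-wise majority
+
+    def similarity(a, b):
+        return float(a @ b) / D                   # normalized dot product
+
+    country, currency = random_hv(), random_hv()
+    usa, dollar = random_hv(), random_hv()
+    record = bundle(bind(country, usa), bind(currency, dollar))
+
+    # Unbinding with the 'country' role recovers something similar to 'usa'.
+    print(similarity(bind(record, country), usa))     # clearly above chance
+    print(similarity(bind(record, country), dollar))  # close to zero
+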
+
+
+
+
+ + ♻ ☆ Understanding Uncertainty Sampling + + +
+ Uncertainty sampling is a prevalent active learning algorithm that queries +sequentially the annotations of data samples which the current prediction model +is uncertain about. However, the usage of uncertainty sampling has been largely +heuristic: (i) There is no consensus on the proper definition of "uncertainty" +for a specific task under a specific loss; (ii) There is no theoretical +guarantee that prescribes a standard protocol to implement the algorithm, for +example, how to handle the sequentially arrived annotated data under the +framework of optimization algorithms such as stochastic gradient descent. In +this work, we systematically examine uncertainty sampling algorithms under both +stream-based and pool-based active learning. We propose a notion of equivalent +loss which depends on the used uncertainty measure and the original loss +function and establish that an uncertainty sampling algorithm essentially +optimizes against such an equivalent loss. The perspective verifies the +properness of existing uncertainty measures from two aspects: surrogate +property and loss convexity. Furthermore, we propose a new notion for designing +uncertainty measures called \textit{loss as uncertainty}. The idea is to use +the conditional expected loss given the features as the uncertainty measure. +Such an uncertainty measure has nice analytical properties and generality to +cover both classification and regression problems, which enable us to provide +the first generalization bound for uncertainty sampling algorithms under both +stream-based and pool-based settings, in the full generality of the underlying +model and problem. Lastly, we establish connections between certain variants of +the uncertainty sampling algorithms with risk-sensitive objectives and +distributional robustness, which can partly explain the advantage of +uncertainty sampling algorithms when the sample size is small. + +
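+ A pool-based sketch of the "loss as uncertainty" idea discussed above: with
+log loss, the conditional expected loss under the model's own predictive
+distribution is simply the predictive entropy, which is then used to rank the
+unlabeled pool. The dataset, model, and batch size are illustrative
+assumptions.
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+
+    rng = np.random.default_rng(0)
+    X_pool = rng.normal(size=(2000, 15))
+    y_pool = (X_pool @ rng.normal(size=15) > 0).astype(int)  # hidden labels
+
+    labeled = list(rng.choice(len(X_pool), size=20, replace=False))
+    model = LogisticRegression(max_iter=500)
+
+    def expected_log_loss(probs):
+        """Expected log loss given the features, taking the model's predictive
+        distribution as the label distribution: the predictive entropy."""
+        p = np.clip(probs, 1e-12, 1 - 1e-12)
+        return -(p * np.log(p)).sum(axis=1)
+
+    for _ in range(10):                            # 10 rounds of querying
+        model.fit(X_pool[labeled], y_pool[labeled])
+        unlabeled = np.setdiff1d(np.arange(len(X_pool)), labeled)
+        u = expected_log_loss(model.predict_proba(X_pool[unlabeled]))
+        labeled.extend(unlabeled[np.argsort(u)[-10:]].tolist())
+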
+
+ comment: Update: add numerical illustrations and experiments; correct some + typos and modify the numbering +
+
+
+
+
+ + ♻ ☆ Can point cloud networks learn statistical shape models of anatomies? MICCAI 2023 + + +
+ Statistical Shape Modeling (SSM) is a valuable tool for investigating and +quantifying anatomical variations within populations of anatomies. However, +traditional correspondence-based SSM generation methods have a prohibitive +inference process and require complete geometric proxies (e.g., high-resolution +binary volumes or surface meshes) as input shapes to construct the SSM. +Unordered 3D point cloud representations of shapes are more easily acquired +from various medical imaging practices (e.g., thresholded images and surface +scanning). Point cloud deep networks have recently achieved remarkable success +in learning permutation-invariant features for different point cloud tasks +(e.g., completion, semantic segmentation, classification). However, their +application to learning SSM from point clouds is to-date unexplored. In this +work, we demonstrate that existing point cloud encoder-decoder-based completion +networks can provide an untapped potential for SSM, capturing population-level +statistical representations of shapes while reducing the inference burden and +relaxing the input requirement. We discuss the limitations of these techniques +to the SSM application and suggest future improvements. Our work paves the way +for further exploration of point cloud deep learning for SSM, a promising +avenue for advancing shape analysis literature and broadening SSM to diverse +use cases. + +
+
+ comment: Accepted to MICCAI 2023. 13 pages, 5 figures, appendix +
+
+
+
+
+ + ♻ ☆ Data-Driven Modeling of Noise Time Series with Convolutional Generative + Adversarial Networks + + +
+ Random noise arising from physical processes is an inherent characteristic of +measurements and a limiting factor for most signal processing and data analysis +tasks. Given the recent interest in generative adversarial networks (GANs) for +data-driven modeling, it is important to determine to what extent GANs can +faithfully reproduce noise in target data sets. In this paper, we present an +empirical investigation that aims to shed light on this issue for time series. +Namely, we assess two general-purpose GANs for time series that are based on +the popular deep convolutional GAN (DCGAN) architecture, a direct time-series +model and an image-based model that uses a short-time Fourier transform (STFT) +data representation. The GAN models are trained and quantitatively evaluated +using distributions of simulated noise time series with known ground-truth +parameters. Target time series distributions include a broad range of noise +types commonly encountered in physical measurements, electronics, and +communication systems: band-limited thermal noise, power law noise, shot noise, +and impulsive noise. We find that GANs are capable of learning many noise +types, although they predictably struggle when the GAN architecture is not well +suited to some aspects of the noise, e.g., impulsive time-series with extreme +outliers. Our findings provide insights into the capabilities and potential +limitations of current approaches to time-series GANs and highlight areas for +further research. In addition, our battery of tests provides a useful benchmark +to aid the development of deep generative models for time series. + +
+
+ comment: 27 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Variational Mixture of HyperGenerators for Learning Distributions Over + Functions ICML 2023 + + +
+ Recent approaches build on implicit neural representations (INRs) to propose
+generative models over function spaces. However, they are computationally
+costly when dealing with inference tasks, such as missing data imputation, or
+cannot tackle them at all. In this work, we propose a novel deep generative
+model, named VAMoH. VAMoH combines the capabilities of modeling continuous
+functions using INRs and the inference capabilities of Variational Autoencoders
+(VAEs). In addition, VAMoH relies on a normalizing flow to define the prior,
+and a mixture of hypernetworks to parametrize the data log-likelihood. This
+gives VAMoH a high expressive capability and interpretability. Through
+experiments on a diverse range of data types, such as images, voxels, and
+climate data, we show that VAMoH can effectively learn rich distributions over
+continuous functions. Furthermore, it can perform inference-related tasks, such
+as conditional super-resolution generation and in-painting, as well as or
+better than previous approaches, while being less computationally demanding.
+
+
+ comment: Accepted at ICML 2023. Camera ready version +
+
+
+
+
+ + ♻ ☆ Perceptron Theory Can Predict the Accuracy of Neural Networks + + +
+ Multilayer neural networks set the current state of the art for many
+technical classification problems. But these networks remain, essentially,
+black boxes when it comes to analyzing them and predicting their performance.
+Here, we develop a statistical theory for the one-layer perceptron and show
+that it can predict the performance of a surprisingly large variety of neural
+networks with different architectures. A general theory of classification with
+perceptrons is developed by generalizing an existing theory for analyzing
+reservoir computing models and connectionist models for symbolic reasoning
+known as vector symbolic architectures. Our statistical theory offers three
+formulas leveraging the signal statistics with increasing detail. The formulas
+are analytically intractable, but can be evaluated numerically. The description
+level that captures the most detail requires stochastic sampling methods.
+Depending on the network model, the simpler formulas already yield high
+prediction accuracy. The quality of the theory's predictions is assessed in
+three experimental settings: a memorization task for echo state networks (ESNs)
+from the reservoir computing literature, a collection of classification
+datasets for shallow randomly connected networks, and the ImageNet dataset for
+deep convolutional neural networks. We find that the second description level
+of the perceptron theory can predict the performance of types of ESNs that
+could not be described previously. The theory can predict the performance of
+deep multilayer neural networks by being applied to their output layer. While
+other methods for predicting neural network performance commonly require
+training an estimator model, the proposed theory requires only the first two
+moments of the distribution of the postsynaptic sums in the output neurons. The
+perceptron theory compares favorably to other methods that do not rely on
+training an estimator model.
+
+
+ comment: 16 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Tuning Stochastic Gradient Algorithms for Statistical Inference via + Large-Sample Asymptotics + + +
+ The tuning of stochastic gradient algorithms (SGAs) for optimization and +sampling is often based on heuristics and trial-and-error rather than +generalizable theory. We address this theory--practice gap by characterizing +the large-sample statistical asymptotics of SGAs via a joint +step-size--sample-size scaling limit. We show that iterate averaging with a +large fixed step size is robust to the choice of tuning parameters and +asymptotically has covariance proportional to that of the MLE sampling +distribution. We also prove a Bernstein--von Mises-like theorem to guide +tuning, including for generalized posteriors that are robust to model +misspecification. Numerical experiments validate our results and +recommendations in realistic finite-sample regimes. Our work lays the +foundation for a systematic analysis of other stochastic gradient Markov chain +Monte Carlo algorithms for a wide range of models. + +
+
+ comment: 42 pgs +
+
+
+
+
+ + ♻ ☆ Impatient Bandits: Optimizing Recommendations for the Long-Term Without + Delay KDD + + +
+ Recommender systems are a ubiquitous feature of online platforms. +Increasingly, they are explicitly tasked with increasing users' long-term +satisfaction. In this context, we study a content exploration task, which we +formalize as a multi-armed bandit problem with delayed rewards. We observe that +there is an apparent trade-off in choosing the learning signal: Waiting for the +full reward to become available might take several weeks, hurting the rate at +which learning happens, whereas measuring short-term proxy rewards reflects the +actual long-term goal only imperfectly. We address this challenge in two steps. +First, we develop a predictive model of delayed rewards that incorporates all +information obtained to date. Full observations as well as partial (short or +medium-term) outcomes are combined through a Bayesian filter to obtain a +probabilistic belief. Second, we devise a bandit algorithm that takes advantage +of this new predictive model. The algorithm quickly learns to identify content +aligned with long-term success by carefully balancing exploration and +exploitation. We apply our approach to a podcast recommendation problem, where +we seek to identify shows that users engage with repeatedly over two months. We +empirically validate that our approach results in substantially better +performance compared to approaches that either optimize for short-term proxies, +or wait for the long-term outcome to be fully realized. + +
+
+ comment: Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery + and Data Mining (KDD '23) +
+
+
+
+
+ + ♻ ☆ Improving Code Example Recommendations on Informal Documentation Using + BERT and Query-Aware LSH: A Comparative Study + + +
+ Our research investigates the recommendation of code examples to aid software +developers, a practice that saves developers significant time by providing +ready-to-use code snippets. The focus of our study is Stack Overflow, a +commonly used resource for coding discussions and solutions, particularly in +the context of the Java programming language. We applied BERT, a powerful Large +Language Model (LLM) that enables us to transform code examples into numerical +vectors by extracting their semantic information. Once these numerical +representations are prepared, we identify Approximate Nearest Neighbors (ANN) +using Locality-Sensitive Hashing (LSH). Our research employed two variants of +LSH: Random Hyperplane-based LSH and Query-Aware LSH. We rigorously compared +these two approaches across four parameters: HitRate, Mean Reciprocal Rank +(MRR), Average Execution Time, and Relevance. Our study revealed that the +Query-Aware (QA) approach showed superior performance over the Random +Hyperplane-based (RH) method. Specifically, it exhibited a notable improvement +of 20% to 35% in HitRate for query pairs compared to the RH approach. +Furthermore, the QA approach proved significantly more time-efficient, with its +speed in creating hashing tables and assigning data samples to buckets being at +least four times faster. It can return code examples within milliseconds, +whereas the RH approach typically requires several seconds to recommend code +examples. Due to the superior performance of the QA approach, we tested it +against PostFinder and FaCoY, the state-of-the-art baselines. Our QA method +showed comparable efficiency proving its potential for effective code +recommendation. + +
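+ A minimal sketch of the random hyperplane LSH stage of such a pipeline,
+assuming the code snippets have already been embedded by a BERT-style encoder
+(random vectors stand in for those embeddings here); the bit width, corpus
+size, and scoring are illustrative assumptions rather than the paper's setup.
+
+    import numpy as np
+    from collections import defaultdict
+
+    rng = np.random.default_rng(0)
+    dim, n_snippets, n_bits = 768, 10_000, 8      # 768 mimics a BERT vector size
+
+    embeddings = rng.normal(size=(n_snippets, dim))  # stand-in for BERT vectors
+    hyperplanes = rng.normal(size=(n_bits, dim))     # one random hyperplane per bit
+
+    def lsh_signature(vec):
+        """Sign of the projection onto each hyperplane -> an n_bits bucket key."""
+        return ((hyperplanes @ vec) > 0).tobytes()
+
+    buckets = defaultdict(list)
+    for idx, vec in enumerate(embeddings):
+        buckets[lsh_signature(vec)].append(idx)
+
+    def recommend(query_vec, top_k=5):
+        """Rank only the snippets sharing the query's bucket (approximate NN)."""
+        candidates = buckets.get(lsh_signature(query_vec), [])
+        sims = [(i, float(query_vec @ embeddings[i])) for i in candidates]
+        return sorted(sims, key=lambda t: -t[1])[:top_k]
+
+    print(recommend(embeddings[0]))
+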
+
+
+
+
+ + ♻ ☆ Class-Incremental Learning based on Label Generation ACL 2023 + + +
+ Despite the great success of pre-trained language models, it is still a +challenge to use these models for continual learning, especially for the +class-incremental learning (CIL) setting due to catastrophic forgetting (CF). +This paper reports our finding that if we formulate CIL as a continual label +generation problem, CF is drastically reduced and the generalizable +representations of pre-trained models can be better retained. We thus propose a +new CIL method (VAG) that also leverages the sparsity of vocabulary to focus +the generation and creates pseudo-replay samples by using label semantics. +Experimental results show that VAG outperforms baselines by a large margin. + +
+
+ comment: 12 pages, ACL 2023 Main Conference +
+
+
+
+
+ + ♻ ☆ When are Local Queries Useful for Robust Learning? NeurIPS 2022 + + +
+ Distributional assumptions have been shown to be necessary for the robust +learnability of concept classes when considering the exact-in-the-ball robust +risk and access to random examples by Gourdeau et al. (2019). In this paper, we +study learning models where the learner is given more power through the use of +local queries, and give the first distribution-free algorithms that perform +robust empirical risk minimization (ERM) for this notion of robustness. The +first learning model we consider uses local membership queries (LMQ), where the +learner can query the label of points near the training sample. We show that, +under the uniform distribution, LMQs do not increase the robustness threshold +of conjunctions and any superclass, e.g., decision lists and halfspaces. Faced +with this negative result, we introduce the local equivalence query +($\mathsf{LEQ}$) oracle, which returns whether the hypothesis and target +concept agree in the perturbation region around a point in the training sample, +as well as a counterexample if it exists. We show a separation result: on the +one hand, if the query radius $\lambda$ is strictly smaller than the +adversary's perturbation budget $\rho$, then distribution-free robust learning +is impossible for a wide variety of concept classes; on the other hand, the +setting $\lambda=\rho$ allows us to develop robust ERM algorithms. We then +bound the query complexity of these algorithms based on online learning +guarantees and further improve these bounds for the special case of +conjunctions. We finish by giving robust learning algorithms for halfspaces on +$\{0,1\}^n$ and then obtaining robustness guarantees for halfspaces in +$\mathbb{R}^n$ against precision-bounded adversaries. + +
+
+ comment: Accepted to NeurIPS 2022; V2 contains new results (Section 3.6) and + an erratum from the previous version (Appendix C) +
+
+
+
+
+ + ♻ ☆ A Review of Machine Learning Methods Applied to Structural Dynamics and + Vibroacoustic + + +
+ The use of Machine Learning (ML) has rapidly spread across several fields,
+having encountered many applications in Structural Dynamics and Vibroacoustic
+(SD\&V). The increasing capabilities of ML to unveil insights from data, driven
+by unprecedented data availability, algorithmic advances and computational
+power, enhance decision making, uncertainty handling, pattern recognition and
+real-time assessment. Three main applications in SD\&V have taken advantage of
+these benefits. In Structural Health Monitoring, ML detection and prognosis
+lead to safe operation and optimized maintenance schedules. System
+identification and control design are leveraged by ML techniques in Active
+Noise Control and Active Vibration Control. Finally, the so-called ML-based
+surrogate models provide fast alternatives to costly simulations, enabling
+robust and optimized product design. Despite the many works in the area, they
+have not yet been systematically reviewed and analyzed. Therefore, to keep
+track of and understand this ongoing integration of fields, this paper presents
+a survey of ML applications in SD\&V analyses, shedding light on the current
+state of implementation and emerging opportunities. The main methodologies,
+advantages, limitations, and recommendations based on scientific knowledge were
+identified for each of the three applications. Moreover, the paper considers
+the role of Digital Twins and Physics Guided ML to overcome current challenges
+and power future research progress. As a result, the survey provides a broad
+overview of the present landscape of ML applied in SD\&V and guides the reader
+to an advanced understanding of progress and prospects in the field.
+
+
+
+
+
+ + ♻ ☆ Gaussian Process Priors for Systems of Linear Partial Differential + Equations with Constant Coefficients ICML 2023 + + +
+ Partial differential equations (PDEs) are important tools to model physical +systems and including them into machine learning models is an important way of +incorporating physical knowledge. Given any system of linear PDEs with constant +coefficients, we propose a family of Gaussian process (GP) priors, which we +call EPGP, such that all realizations are exact solutions of this system. We +apply the Ehrenpreis-Palamodov fundamental principle, which works as a +non-linear Fourier transform, to construct GP kernels mirroring standard +spectral methods for GPs. Our approach can infer probable solutions of linear +PDE systems from any data such as noisy measurements, or pointwise defined +initial and boundary conditions. Constructing EPGP-priors is algorithmic, +generally applicable, and comes with a sparse version (S-EPGP) that learns the +relevant spectral frequencies and works better for big data sets. We +demonstrate our approach on three families of systems of PDEs, the heat +equation, wave equation, and Maxwell's equations, where we improve upon the +state of the art in computation time and precision, in some experiments by +several orders of magnitude. + +
+
+ comment: 26 pages, 8 figures; ICML 2023 (oral); updated with expanded + appendices and ancillary files. Code available at + https://github.com/haerski/EPGP. For animations, see + https://mathrepo.mis.mpg.de/EPGP/index.html +
+
+
+
+
+ + ♻ ☆ Provably Efficient UCB-type Algorithms For Learning Predictive State + Representations + + +
+ The general sequential decision-making problem, which includes Markov +decision processes (MDPs) and partially observable MDPs (POMDPs) as special +cases, aims at maximizing a cumulative reward by making a sequence of decisions +based on a history of observations and actions over time. Recent studies have +shown that the sequential decision-making problem is statistically learnable if +it admits a low-rank structure modeled by predictive state representations +(PSRs). Despite these advancements, existing approaches typically involve +oracles or steps that are not computationally efficient. On the other hand, the +upper confidence bound (UCB) based approaches, which have served successfully +as computationally efficient methods in bandits and MDPs, have not been +investigated for more general PSRs, due to the difficulty of optimistic bonus +design in these more challenging settings. This paper proposes the first known +UCB-type approach for PSRs, featuring a novel bonus term that upper bounds the +total variation distance between the estimated and true models. We further +characterize the sample complexity bounds for our designed UCB-type algorithms +for both online and offline PSRs. In contrast to existing approaches for PSRs, +our UCB-type algorithms enjoy computational efficiency, last-iterate guaranteed +near-optimal policy, and guaranteed model accuracy. + +
+
+
+
+
+ + ♻ ☆ High-order Tensor Pooling with Attention for Action Recognition + + +
+ We aim at capturing high-order statistics of feature vectors formed by a +neural network, and propose end-to-end second- and higher-order pooling to form +a tensor descriptor. Tensor descriptors require a robust similarity measure due +to low numbers of aggregated vectors and the burstiness phenomenon, when a +given feature appears more/less frequently than statistically expected. The +Heat Diffusion Process (HDP) on a graph Laplacian is closely related to the +Eigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix, +whose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN +play the same role, i.e., to boost or dampen the magnitude of the eigenspectrum +thus preventing the burstiness. We equip higher-order tensors with EPN which +acts as a spectral detector of higher-order occurrences to prevent burstiness. +We also prove that for a tensor of order r built from d dimensional feature +descriptors, such a detector gives the likelihood if at least one higher-order +occurrence is 'projected' into one of binom(d,r) subspaces represented by the +tensor; thus forming a tensor power normalization metric endowed with +binom(d,r) such 'detectors'. For experimental contributions, we apply several +second- and higher-order pooling variants to action recognition, provide +previously not presented comparisons of such pooling variants, and show +state-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities. + +
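+ A minimal sketch of second-order pooling followed by Eigenvalue Power
+Normalization as described above: the eigenvalues of the autocorrelation matrix
+are raised to a power gamma in (0, 1], which dampens dominant (bursty)
+eigendirections. Descriptor sizes and gamma are illustrative assumptions.
+
+    import numpy as np
+
+    def second_order_pool_with_epn(features, gamma=0.5):
+        """features: (N, d) local descriptors. Returns an EPN-normalized
+        second-order (autocorrelation) descriptor of shape (d, d)."""
+        M = features.T @ features / len(features)      # second-order pooling
+        eigvals, eigvecs = np.linalg.eigh(M)           # symmetric PSD matrix
+        damped = np.clip(eigvals, 0.0, None) ** gamma  # eigenvalue power norm.
+        return (eigvecs * damped) @ eigvecs.T          # reassemble the matrix
+
+    rng = np.random.default_rng(0)
+    local_descriptors = rng.normal(size=(200, 64))     # hypothetical CNN features
+    descriptor = second_order_pool_with_epn(local_descriptors)
+    print(descriptor.shape)                            # (64, 64)
+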
+
+
+
+
+ + ♻ ☆ $ν^2$-Flows: Fast and improved neutrino reconstruction in + multi-neutrino final states with conditional normalizing flows + + +
+ In this work we introduce $\nu^2$-Flows, an extension of the $\nu$-Flows
+method to final states containing multiple neutrinos. The architecture can
+natively scale to all combinations of object types and multiplicities in the
+final state for any desired neutrino multiplicities. In $t\bar{t}$ dilepton
+events, the momenta of both neutrinos and the correlations between them are
+reconstructed more accurately than with the most popular standard analytical
+techniques, and solutions are found for all events. Inference time is
+significantly faster than for competing methods, and can be reduced further by
+evaluating in parallel on graphics processing units. We apply $\nu^2$-Flows to
+$t\bar{t}$ dilepton events and show that the per-bin uncertainties in unfolded
+distributions are much closer to the limit of performance set by perfect
+neutrino reconstruction than standard techniques. For the chosen double
+differential observables, $\nu^2$-Flows improves the statistical precision in
+each bin by a factor of 1.5 to 2 in comparison to the Neutrino Weighting method
+and by up to a factor of four in comparison to the Ellipse approach.
+
+
+ comment: 20 pages, 16 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Topological Point Cloud Clustering ICML + + +
+ We present Topological Point Cloud Clustering (TPCC), a new method to cluster
+points in an arbitrary point cloud based on their contribution to global
+topological features. TPCC synthesizes desirable features from spectral
+clustering and topological data analysis and is based on the spectral
+properties of a simplicial complex associated with the point cloud under
+consideration. Because it relies on sparse eigenvector computations, TPCC is as
+easy to interpret and implement as spectral clustering. However, by focusing
+not just on a single matrix associated with a graph created from the point
+cloud data, but on a whole set of Hodge Laplacians associated with an
+appropriately constructed simplicial complex, we can leverage a far richer set
+of topological features to characterize the data points within the point cloud
+and benefit from the relative robustness of topological techniques against
+noise. We test the performance of TPCC on both synthetic and real-world data
+and compare it with classical spectral clustering.
+
+
+ comment: Accepted at the 40th International Conference on Machine Learning + (ICML), 2023. Code available at + https://git.rwth-aachen.de/netsci/publication-2023-topological-point-cloud-clustering +
+
+
+
+
+ + ♻ ☆ My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models + and Evaluation Benchmarks + + +
+ Research on code-mixed data is limited due to the unavailability of dedicated
+code-mixed datasets and pre-trained language models. In this work, we focus on
+the low-resource Indian language Marathi, which lacks any prior work in
+code-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English
+(Mr-En) corpus with 10 million social media sentences for pretraining. We also
+release L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models
+pre-trained on MeCorpus. Furthermore, for benchmarking, we present three
+supervised datasets, MeHate, MeSent, and MeLID, for the downstream tasks of
+code-mixed Mr-En hate speech detection, sentiment analysis, and language
+identification, respectively. Each of these evaluation datasets consists of
+approximately 12,000 manually annotated Marathi-English code-mixed tweets.
+Ablations show that the models trained on this novel corpus significantly
+outperform the existing state-of-the-art BERT models. This is the first work
+that presents artifacts for code-mixed Marathi research. All datasets and
+models are publicly released at https://github.com/l3cube-pune/MarathiNLP.
+
+
+
+
+
+ + ♻ ☆ Context-Conditional Navigation with a Learning-Based Terrain- and + Robot-Aware Dynamics Model + + +
+ In autonomous navigation settings, several quantities can be subject to +variations. Terrain properties such as friction coefficients may vary over time +depending on the location of the robot. Also, the dynamics of the robot may +change due to, e.g., different payloads, changing the system's mass, or wear +and tear, changing actuator gains or joint friction. An autonomous agent should +thus be able to adapt to such variations. In this paper, we develop a novel +probabilistic, terrain- and robot-aware forward dynamics model, termed TRADYN, +which is able to adapt to the above-mentioned variations. It builds on recent +advances in meta-learning forward dynamics models based on Neural Processes. We +evaluate our method in a simulated 2D navigation setting with a unicycle-like +robot and different terrain layouts with spatially varying friction +coefficients. In our experiments, the proposed model exhibits lower prediction +error for the task of long-horizon trajectory prediction, compared to +non-adaptive ablation models. We also evaluate our model on the downstream task +of navigation planning, which demonstrates improved performance in planning +control-efficient paths by taking robot and terrain properties into account. + +
+
+ comment: \copyright 2023 IEEE. Accepted for publication in European Conference + on Mobile Robots (ECMR), 2023. Updated copyright statement +
+
+
+
+
+ + ♻ ☆ Leveraging Offline Data in Online Reinforcement Learning + + +
+ Two central paradigms have emerged in the reinforcement learning (RL) +community: online RL and offline RL. In the online RL setting, the agent has no +prior knowledge of the environment, and must interact with it in order to find +an $\epsilon$-optimal policy. In the offline RL setting, the learner instead +has access to a fixed dataset to learn from, but is unable to otherwise +interact with the environment, and must obtain the best policy it can from this +offline data. Practical scenarios often motivate an intermediate setting: if we +have some set of offline data and, in addition, may also interact with the +environment, how can we best use the offline data to minimize the number of +online interactions necessary to learn an $\epsilon$-optimal policy? + In this work, we consider this setting, which we call the \textsf{FineTuneRL} +setting, for MDPs with linear structure. We characterize the necessary number +of online samples needed in this setting given access to some offline dataset, +and develop an algorithm, \textsc{FTPedel}, which is provably optimal, up to +$H$ factors. We show through an explicit example that combining offline data +with online interactions can lead to a provable improvement over either purely +offline or purely online RL. Finally, our results illustrate the distinction +between \emph{verifiable} learning, the typical setting considered in online +RL, and \emph{unverifiable} learning, the setting often considered in offline +RL, and show that there is a formal separation between these regimes. + +
+
+
+
+
+ + ♻ ☆ Instance-Dependent Near-Optimal Policy Identification in Linear MDPs via + Online Experiment Design + + +
+ While much progress has been made in understanding the minimax sample +complexity of reinforcement learning (RL) -- the complexity of learning on the +"worst-case" instance -- such measures of complexity often do not capture the +true difficulty of learning. In practice, on an "easy" instance, we might hope +to achieve a complexity far better than that achievable on the worst-case +instance. In this work we seek to understand the "instance-dependent" +complexity of learning near-optimal policies (PAC RL) in the setting of RL with +linear function approximation. We propose an algorithm, \textsc{Pedel}, which +achieves a fine-grained instance-dependent measure of complexity, the first of +its kind in the RL with function approximation setting, thereby capturing the +difficulty of learning on each particular problem instance. Through an explicit +example, we show that \textsc{Pedel} yields provable gains over low-regret, +minimax-optimal algorithms and that such algorithms are unable to hit the +instance-optimal rate. Our approach relies on a novel online experiment +design-based procedure which focuses the exploration budget on the "directions" +most relevant to learning a near-optimal policy, and may be of independent +interest. + +
+
+
+
+
+ + ♻ ☆ Quantitative CLTs in Deep Neural Networks + + +
+ We study the distribution of a fully connected neural network with random +Gaussian weights and biases in which the hidden layer widths are proportional +to a large constant $n$. Under mild assumptions on the non-linearity, we obtain +quantitative bounds on normal approximations valid at large but finite $n$ and +any fixed network depth. Our theorems show both for the finite-dimensional +distributions and the entire process, that the distance between a random fully +connected network (and its derivatives) to the corresponding infinite width +Gaussian process scales like $n^{-\gamma}$ for $\gamma>0$, with the exponent +depending on the metric used to measure discrepancy. Our bounds are strictly +stronger in terms of their dependence on network width than any previously +available in the literature; in the one-dimensional case, we also prove that +they are optimal, i.e., we establish matching lower bounds. + +
+
+
+
+
+ + ♻ ☆ Neural Network Complexity of Chaos and Turbulence + + +
+ Chaos and turbulence are complex physical phenomena, yet a precise definition +of the complexity measure that quantifies them is still lacking. In this work +we consider the relative complexity of chaos and turbulence from the +perspective of deep neural networks. We analyze a set of classification +problems, where the network has to distinguish images of fluid profiles in the +turbulent regime from other classes of images such as fluid profiles in the +chaotic regime, various constructions of noise and real world images. We +analyze incompressible as well as weakly compressible fluid flows. We quantify +the complexity of the computation performed by the network via the intrinsic +dimensionality of the internal feature representations, and calculate the +effective number of independent features which the network uses in order to +distinguish between classes. In addition to providing a numerical estimate of +the complexity of the computation, the measure also characterizes the neural +network processing at intermediate and final stages. We construct adversarial +examples and use them to identify the two point correlation spectra for the +chaotic and turbulent vorticity as the feature used by the network for +classification. + +
+
+
+
+
+ + ♻ ☆ Warming up recurrent neural networks to maximise reachable + multistability greatly improves learning + + +
+ Training recurrent neural networks is known to be difficult when time +dependencies become long. In this work, we show that most standard cells only +have one stable equilibrium at initialisation, and that learning on tasks with +long time dependencies generally occurs once the number of network stable +equilibria increases; a property known as multistability. Multistability is +often not easily attained by initially monostable networks, making learning of +long time dependencies between inputs and outputs difficult. This insight leads +to the design of a novel way to initialise any recurrent cell connectivity +through a procedure called "warmup" to improve its capability to learn +arbitrarily long time dependencies. This initialisation procedure is designed +to maximise network reachable multistability, i.e., the number of equilibria +within the network that can be reached through relevant input trajectories, in +few gradient steps. We show on several information restitution, sequence +classification, and reinforcement learning benchmarks that warming up greatly +improves learning speed and performance, for multiple recurrent cells, but +sometimes impedes precision. We therefore introduce a double-layer architecture +initialised with a partial warmup that is shown to greatly improve learning of +long time dependencies while maintaining high levels of precision. This +approach provides a general framework for improving learning abilities of any +recurrent cell when long time dependencies are present. We also show +empirically that other initialisation and pretraining procedures from the +literature implicitly foster reachable multistability of recurrent cells. + +
+
+ comment: 20 pages, 35 pages total, 38 figures +
+
+
+
+
+ + ♻ ☆ Implicit Multidimensional Projection of Local Subspaces + + +
+ We propose a visualization method to understand the effect of +multidimensional projection on local subspaces, using implicit function +differentiation. Here, we understand the local subspace as the multidimensional +local neighborhood of data points. Existing methods focus on the projection of +multidimensional data points, and the neighborhood information is ignored. Our +method is able to analyze the shape and directional information of the local +subspace to gain more insights into the global structure of the data through +the perception of local structures. Local subspaces are fitted by +multidimensional ellipses that are spanned by basis vectors. An accurate and +efficient vector transformation method is proposed based on analytical +differentiation of multidimensional projections formulated as implicit +functions. The results are visualized as glyphs and analyzed using a full set +of specifically-designed interactions supported in our efficient web-based +visualization tool. The usefulness of our method is demonstrated using various +multi- and high-dimensional benchmark datasets. Our implicit differentiation +vector transformation is evaluated through numerical comparisons; the overall +method is evaluated through exploration examples and use cases. + +
+
+
+
+
+ + ♻ ☆ Pre-trained Perceptual Features Improve Differentially Private Image + Generation + + +
+ Training even moderately-sized generative models with differentially-private +stochastic gradient descent (DP-SGD) is difficult: the required level of noise +for reasonable levels of privacy is simply too large. We advocate instead +building off a good, relevant representation on an informative public dataset, +then learning to model the private data with that representation. In +particular, we minimize the maximum mean discrepancy (MMD) between private +target data and a generator's distribution, using a kernel based on perceptual +features learned from a public dataset. With the MMD, we can simply privatize +the data-dependent term once and for all, rather than introducing noise at each +step of optimization as in DP-SGD. Our algorithm allows us to generate +CIFAR10-level images with $\epsilon \approx 2$ which capture distinctive +features in the distribution, far surpassing the current state of the art, +which mostly focuses on datasets such as MNIST and FashionMNIST at a large +$\epsilon \approx 10$. Our work introduces simple yet powerful foundations for +reducing the gap between private and non-private deep generative models. Our +code is available at \url{https://github.com/ParkLabML/DP-MEPF}. + +
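+ A simplified sketch of the "privatize once" idea described above: the private
+data's mean perceptual-feature embedding is released a single time with
+Gaussian noise, and the generator is then trained to match that fixed target,
+so no per-step noise is needed. Feature dimensions, noise scale, and the
+omitted (epsilon, delta) calibration are assumptions, not the authors'
+implementation.
+
+    import torch
+
+    def privatized_mean_embedding(private_feats, noise_std):
+        """Mean feature embedding of the private data, privatized once with
+        Gaussian noise. Assumes each feature vector is clipped to norm <= 1;
+        calibrating noise_std to a target (epsilon, delta) is omitted here."""
+        mean_emb = private_feats.mean(dim=0)
+        noise = torch.randn_like(mean_emb) * noise_std / len(private_feats)
+        return mean_emb + noise   # released once, reused at every generator step
+
+    def generator_loss(generated_feats, target_embedding):
+        """Squared distance between the generator's mean feature embedding and
+        the fixed privatized target (an MMD with a linear feature kernel)."""
+        return ((generated_feats.mean(dim=0) - target_embedding) ** 2).sum()
+
+    # Hypothetical perceptual features, e.g. from a frozen public encoder.
+    private_feats = torch.nn.functional.normalize(torch.randn(1000, 256), dim=1)
+    target = privatized_mean_embedding(private_feats, noise_std=2.0)
+
+    fake_feats = torch.randn(64, 256, requires_grad=True)
+    loss = generator_loss(fake_feats, target)
+    loss.backward()    # gradients flow only through the generated features
+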
+
+
+
+
+ + ♻ ☆ Model Selection for Generic Contextual Bandits + + +
+ We consider the problem of model selection for general stochastic contextual
+bandits under the realizability assumption. We propose a successive
+refinement based algorithm called Adaptive Contextual Bandit ({\ttfamily ACB})
+that works in phases and successively eliminates model classes that are too
+simple to fit the given instance. We prove that this algorithm is adaptive,
+i.e., its regret rate order-wise matches that of any provable contextual bandit
+algorithm (e.g., \cite{falcon}) that requires knowledge of the true model
+class. The price of not knowing the correct model class turns out to be only an
+additive contribution to the second-order term in the regret bound. This cost
+possesses the intuitive property that it becomes smaller as the model class
+becomes easier to identify, and vice versa. We also show that a much simpler
+explore-then-commit (ETC) style algorithm obtains a similar regret bound,
+despite not knowing the true model class. However, the cost of model selection
+is higher for ETC than for {\ttfamily ACB}, as expected. Furthermore, for the
+special case of linear contextual bandits, we propose specialized algorithms
+that obtain sharper guarantees compared to the generic setup.
+
+
+ comment: Accepted at IEEE Transactions on Information Theory. arXiv admin + note: text overlap with arXiv:2006.02612 +
+
+
+
+
+ + ♻ ☆ Multi-view self-supervised learning for multivariate variable-channel + time series + + +
+ Labeling of multivariate biomedical time series data is a laborious and +expensive process. Self-supervised contrastive learning alleviates the need for +large, labeled datasets through pretraining on unlabeled data. However, for +multivariate time series data, the set of input channels often varies between +applications, and most existing work does not allow for transfer between +datasets with different sets of input channels. We propose learning one encoder +to operate on all input channels individually. We then use a message passing +neural network to extract a single representation across channels. We +demonstrate the potential of this method by pretraining our model on a dataset +with six EEG channels and then fine-tuning it on a dataset with two different +EEG channels. We compare models with and without the message passing neural +network across different contrastive loss functions. We show that our method, +combined with the TS2Vec loss, outperforms all other methods in most settings. + +
+
+ comment: To appear in proceedings of 2023 IEEE International workshop on + Machine Learning for Signal Processing +
+
+
+
+
+ + ♻ ☆ Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis MICCAI 2023 + + +
+ Regular group convolutional neural networks (G-CNNs) have been shown to +increase model performance and improve equivariance to different geometrical +symmetries. This work addresses the problem of SE(3), i.e., roto-translation +equivariance, on volumetric data. Volumetric image data is prevalent in many +medical settings. Motivated by the recent work on separable group convolutions, +we devise a SE(3) group convolution kernel separated into a continuous SO(3) +(rotation) kernel and a spatial kernel. We approximate equivariance to the +continuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel +is parameterized via RBF interpolation on similarly uniform grids. We +demonstrate the advantages of our approach in volumetric medical image +analysis. Our SE(3) equivariant models consistently outperform CNNs and regular +discrete G-CNNs on challenging medical classification tasks and show +significantly improved generalization capabilities. Our approach achieves up to +a 16.5% gain in accuracy over regular CNNs. + +
+
+ comment: 10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated + version to camera ready version 1 +
+
+
+
+
+ + ♻ ☆ HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory + Prediction via Scene Encoding + + +
+ Encoding a driving scene into vector representations has been an essential
+task for autonomous driving that can benefit downstream tasks, e.g., trajectory
+prediction. The driving scene often involves heterogeneous elements such as
+different types of objects (agents, lanes, traffic signs), and the semantic
+relations between objects are rich and diverse. Meanwhile, there is also
+relativity across elements: the spatial relation is a relative concept and
+needs to be encoded in an ego-centric manner instead of in a global coordinate
+system. Based on these observations, we propose the Heterogeneous Driving Graph
+Transformer (HDGT), a backbone modelling the driving scene as a heterogeneous
+graph with different types of nodes and edges. For heterogeneous graph
+construction, we connect different types of nodes according to diverse semantic
+relations. For spatial relation encoding, the coordinates of a node as well as
+its in-edges are expressed in the local node-centric coordinate system. For the
+aggregation module in the graph neural network (GNN), we adopt the transformer
+structure in a hierarchical way to fit the heterogeneous nature of the inputs.
+Experimental results show that HDGT achieves state-of-the-art performance for
+the task of trajectory prediction on the INTERACTION Prediction Challenge and
+the Waymo Open Motion Challenge.
+
+
+ comment: Accepted at IEEE TPAMI in 2023. Code url: + https://github.com/OpenDriveLab/HDGT +
+
+
+
+
+ + ♻ ☆ Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation + for autonomous vehicles + + +
+ Autonomous driving (AD) perception today relies heavily on deep-learning-based
+architectures requiring large-scale annotated datasets, with their associated
+costs for curation and annotation. 3D semantic data are useful for core
+perception tasks such as obstacle detection and ego-vehicle localization. We
+propose a new dataset, Navya 3D Segmentation (Navya3DSeg), with a diverse label
+space corresponding to a large-scale, production-grade operational domain,
+including rural, urban, and industrial sites and universities from 13
+countries. It contains 23 labeled sequences and 25 supplementary sequences
+without labels, designed to explore self-supervised and semi-supervised
+semantic segmentation benchmarks on point clouds. We also propose a novel
+method for sequential dataset split generation based on iterative multi-label
+stratification, which is demonstrated to achieve a +1.2% mIoU improvement over
+the original split proposed by the SemanticKITTI dataset. A complete benchmark
+for the semantic segmentation task was performed with state-of-the-art methods.
+Finally, we demonstrate an Active Learning (AL) based dataset distillation
+framework. We introduce a novel heuristic-free sampling method called ego-pose
+distance based sampling in the context of AL. A detailed presentation of the
+dataset is available at https://www.youtube.com/watch?v=5m6ALIs-s20.
+
+
+ comment: Accepted version to IEEE RA-L. Version with supplementary materials +
+
+
+
+
+ + ♻ ☆ Efficient Action Robust Reinforcement Learning with Probabilistic Policy + Execution Uncertainty + + +
+ Robust reinforcement learning (RL) aims to find a policy that optimizes
+worst-case performance in the face of uncertainties. In this paper, we focus on
+action robust RL with probabilistic policy execution uncertainty, in which,
+instead of always carrying out the action specified by the policy, the agent
+takes the action specified by the policy with probability $1-\rho$ and an
+alternative adversarial action with probability $\rho$. We establish the
+existence of an optimal policy for action robust MDPs with probabilistic policy
+execution uncertainty and provide the action robust Bellman optimality equation
+for its solution. We then develop the Action Robust Reinforcement Learning with
+Certificates (ARRLC) algorithm, which achieves minimax optimal regret and
+sample complexity. Finally, we conduct numerical experiments to validate our
+approach's robustness, demonstrating that ARRLC outperforms non-robust RL
+algorithms and converges faster than the robust TD algorithm in the presence of
+action perturbations.
+
+
+
+
+
+ + ♻ ☆ AUC Optimization from Multiple Unlabeled Datasets + + +
+ Weakly supervised learning aims to empower machine learning when the perfect +supervision is unavailable, which has drawn great attention from researchers. +Among various types of weak supervision, one of the most challenging cases is +to learn from multiple unlabeled (U) datasets with only a little knowledge of +the class priors, or U$^m$ learning for short. In this paper, we study the +problem of building an AUC (area under ROC curve) optimization model from +multiple unlabeled datasets, which maximizes the pairwise ranking ability of +the classifier. We propose U$^m$-AUC, an AUC optimization approach that +converts the U$^m$ data into a multi-label AUC optimization problem, and can be +trained efficiently. We show that the proposed U$^m$-AUC is effective +theoretically and empirically. + +
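+ For readers unfamiliar with AUC optimization, the sketch below shows the basic
+differentiable pairwise surrogate that "maximizing the pairwise ranking
+ability" refers to; it is not the U$^m$-AUC reduction itself, and the scores
+here are placeholder tensors rather than model outputs.
+
+    import torch
+
+    def pairwise_auc_surrogate(scores_pos, scores_neg):
+        """Logistic surrogate for 1 - AUC over all positive/negative pairs.
+        AUC is the fraction of pairs where a positive outranks a negative."""
+        diff = scores_pos.unsqueeze(1) - scores_neg.unsqueeze(0)   # all pairs
+        return torch.nn.functional.softplus(-diff).mean()
+
+    scores_pos = torch.randn(32, requires_grad=True)  # scores of positives
+    scores_neg = torch.randn(64, requires_grad=True)  # scores of negatives
+    loss = pairwise_auc_surrogate(scores_pos, scores_neg)
+    loss.backward()
+
+    # Non-differentiable empirical AUC, for comparison.
+    with torch.no_grad():
+        auc = (scores_pos.unsqueeze(1) > scores_neg.unsqueeze(0)).float().mean()
+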
+
+
+
+
+ + ♻ ☆ Opinion Market Model: Stemming Far-Right Opinion Spread using Positive + Interventions AAAI + + +
+ Online extremism has severe societal consequences, including normalizing hate +speech, user radicalization, and increased social divisions. Various mitigation +strategies have been explored to address these consequences. One such strategy +uses positive interventions: controlled signals that add attention to the +opinion ecosystem to boost certain opinions. To evaluate the effectiveness of +positive interventions, we introduce the Opinion Market Model (OMM), a two-tier +online opinion ecosystem model that considers both inter-opinion interactions +and the role of positive interventions. The size of the opinion attention +market is modeled in the first tier using the multivariate discrete-time Hawkes +process; in the second tier, opinions cooperate and compete for market share, +given limited attention using the market share attraction model. We demonstrate +the convergence of our proposed estimation scheme on a synthetic dataset. Next, +we test OMM on two learning tasks, applying to two real-world datasets to +predict attention market shares and uncover latent relationships between online +items. The first dataset comprises Facebook and Twitter discussions containing +moderate and far-right opinions about bushfires and climate change. The second +dataset captures popular VEVO artists' YouTube and Twitter attention volumes. +OMM outperforms the state-of-the-art predictive models on both datasets and +captures latent cooperation-competition relations. We uncover (1) self- and +cross-reinforcement between far-right and moderate opinions on the bushfires +and (2) pairwise artist relations that correlate with real-world interactions +such as collaborations and long-lasting feuds. Lastly, we use OMM as a testbed +for positive interventions and show how media coverage modulates the spread of +far-right opinions. + +
+
+ comment: accepted in the 18th AAAI International Conference on Web and Social + Media (ICWSM'24) +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ Convolutional Neural Networks (CNNs) have made significant strides in medical +image analysis in recent years. However, the local nature of the convolution +operator may pose a limitation for capturing global and long-range interactions +in CNNs. Recently, Transformers have gained popularity in the computer vision +community and also medical image segmentation due to their ability to process +global features effectively. The scalability issues of self-attention mechanism +and lack of the CNN-like inductive bias may have limited their adoption. +Therefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages +of both Convolution and Self-attention Mechanisms, have gained importance. In +this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision +transformer (CNN-Transformer) for medical image segmentation. The proposed +Hybrid Decoder, based on MaxViT-block, is designed to harness the power of both +the convolution and self-attention mechanisms at each decoding stage with +nominal computational burden. The inclusion of multi-axis self-attention, +within each decoder stage, significantly enhances the discriminating capacity +between the object and background regions, and thereby helps in improving the +segmentation efficiency. In the Hybrid Decoder block, the fusion process +commences by integrating the upsampled lower level decoder features, obtained +through transpose convolution, with the skip-connection features derived from +the hybrid encoder. Subsequently, the fused features undergo refinement through +the utilization of a multi-axis attention mechanism. The proposed decoder block +is repeated multiple times to progressively segment the nuclei regions. +Experimental results on MoNuSeg18 and MoNuSAC20 dataset demonstrates the +effectiveness of the proposed technique. + +
+
+
+
+
+ + ♻ ☆ Positive unlabeled learning with tensor networks + + +
+ Positive unlabeled learning is a binary classification problem with positive
+and unlabeled data. It is common in domains where negative labels are costly or
+impossible to obtain, e.g., medicine and personalized advertising. Most
+approaches to positive unlabeled learning apply to specific data types (e.g.,
+images, categorical data) and cannot generate new positive and negative
+samples. This work introduces a feature-space distance-based tensor network
+approach to the positive unlabeled learning problem. The presented method is
+not domain specific and significantly improves on the state-of-the-art results
+on the MNIST image dataset and 15 categorical/mixed datasets. The trained
+tensor network model is also a generative model and enables the generation of
+new positive and negative instances.
+
+
+ comment: 12 pages, 6 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Multimodal LLMs for health grounded in individual-specific data + + +
+ Foundation large language models (LLMs) have shown an impressive ability to +solve tasks across a wide range of fields including health. To effectively +solve personalized health tasks, LLMs need the ability to ingest a diversity of +data modalities that are relevant to an individual's health status. In this +paper, we take a step towards creating multimodal LLMs for health that are +grounded in individual-specific data by developing a framework (HeLM: Health +Large Language Model for Multimodal Understanding) that enables LLMs to use +high-dimensional clinical modalities to estimate underlying disease risk. HeLM +encodes complex data modalities by learning an encoder that maps them into the +LLM's token embedding space and for simple modalities like tabular data by +serializing the data into text. Using data from the UK Biobank, we show that +HeLM can effectively use demographic and clinical features in addition to +high-dimensional time-series data to estimate disease risk. For example, HeLM +achieves an AUROC of 0.75 for asthma prediction when combining tabular and +spirogram data modalities compared with 0.49 when only using tabular data. +Overall, we find that HeLM outperforms or performs at parity with classical +machine learning approaches across a selection of eight binary traits. +Furthermore, we investigate the downstream uses of this model such as its +generalizability to out-of-distribution traits and its ability to power +conversations around individual health and wellness. + +
+
+
+
+
+ + ♻ ☆ Drug Repurposing Targeting COVID-19 3CL Protease using Molecular Docking + and Machine Learning Regression Approach + + +
+ The COVID-19 pandemic has created a global health crisis, driving the need
+for the rapid identification of potential therapeutics. To meet this challenge,
+drug repurposing is a practical solution that saves cost, time, and labor. In
+this study, we used the ZINC database to screen 5903 world-approved drugs,
+including FDA-approved ones, for repurposing as potential COVID-19 treatments
+targeting the main protease 3CL of SARS-CoV-2. We performed molecular docking
+and assessed the efficacy of the drug molecules. To enhance the efficiency of
+the drug repurposing approach, we modeled the binding affinities using several
+machine learning regression approaches for QSAR modeling, such as decision
+tree, extra trees, MLP, KNN, XGBoost, and gradient boosting. The computational
+results demonstrated that the Decision Tree Regression (DTR) model yielded
+improved statistical measures of R2 and RMSE. These simulated results helped to
+identify drugs with high binding affinity. From the docking and other
+statistical analyses, we shortlisted six promising drugs with their respective
+ZINC IDs (ZINC3873365, ZINC85432544, ZINC203757351, ZINC85536956, ZINC8214470
+and ZINC261494640) within the binding-affinity range of -15 kcal/mol to -13
+kcal/mol. The repurposed drugs are novel, except for the antiviral compound
+ZINC203757351, which has already been identified against COVID-19 in other
+studies. Further, we analyzed the physicochemical and pharmacokinetic
+properties of these top-ranked selected drugs with respect to their best
+binding interactions with the specific target protease 3CLpro. Our study
+provides an efficient framework for drug repurposing against COVID-19,
+highlighting the potential of combining molecular docking with machine learning
+regression approaches to accelerate the identification of potential therapeutic
+candidates.
+
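+ For context on the QSAR regression step, the sketch below fits a decision
+tree regressor to synthetic descriptor/affinity pairs and reports R2 and RMSE;
+the features, targets, and hyperparameters are placeholders rather than the
+paper's screening data or settings.
+
+```python
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.metrics import r2_score, mean_squared_error
+
+rng = np.random.default_rng(42)
+
+# Placeholder molecular descriptors and docking-derived binding affinities
+# (kcal/mol); in the paper these come from the screened compound library.
+X = rng.normal(size=(5903, 32))
+y = -10.0 + X[:, 0] - 0.5 * X[:, 1] + rng.normal(scale=0.5, size=5903)
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+model = DecisionTreeRegressor(max_depth=8, random_state=0).fit(X_tr, y_tr)
+pred = model.predict(X_te)
+
+print("R2  :", r2_score(y_te, pred))
+print("RMSE:", np.sqrt(mean_squared_error(y_te, pred)))
+```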
+
+ comment: 27 Pages +
+
+
+
+
+ + ♻ ☆ Robust Principal Component Analysis: A Median of Means Approach + + +
+ Principal Component Analysis (PCA) is a fundamental tool for data +visualization, denoising, and dimensionality reduction. It is widely popular in +Statistics, Machine Learning, Computer Vision, and related fields. However, PCA +is well-known to fall prey to outliers and often fails to detect the true +underlying low-dimensional structure within the dataset. Following the Median +of Means (MoM) philosophy, recent supervised learning methods have shown great +success in dealing with outlying observations without much compromise to their +large sample theoretical properties. This paper proposes a PCA procedure based +on the MoM principle. Called the \textbf{M}edian of \textbf{M}eans +\textbf{P}rincipal \textbf{C}omponent \textbf{A}nalysis (MoMPCA), the proposed +method is not only computationally appealing but also achieves optimal +convergence rates under minimal assumptions. In particular, we explore the +non-asymptotic error bounds of the obtained solution via the aid of the +Rademacher complexities while granting absolutely no assumption on the outlying +observations. The derived concentration results are not dependent on the +dimension because the analysis is conducted in a separable Hilbert space, and +the results only depend on the fourth moment of the underlying distribution in +the corresponding norm. The proposal's efficacy is also thoroughly showcased +through simulations and real data applications. + +
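+ A toy illustration of the median-of-means idea applied to a PCA-style
+reconstruction objective is sketched below; it evaluates the MoM risk of
+candidate directions on data with a few gross outliers and is not the authors'
+MoMPCA estimator or its optimization procedure.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Inliers concentrated along one direction, plus a few gross outliers.
+n, d, K = 300, 5, 15                      # K = number of MoM blocks
+u_true = np.array([1.0, 0.0, 0.0, 0.0, 0.0])
+X = rng.normal(size=(n, 1)) * u_true * 3 + rng.normal(scale=0.3, size=(n, d))
+X[:6] += 50 * rng.normal(size=(6, d))     # outlying observations
+
+def mom_risk(v, X, K):
+    # Median over K blocks of the mean reconstruction error onto span(v).
+    resid = X - np.outer(X @ v, v)
+    errs = (resid ** 2).sum(axis=1)
+    blocks = np.array_split(rng.permutation(errs), K)
+    return np.median([b.mean() for b in blocks])
+
+# Crude search over random unit directions for the first component.
+cands = rng.normal(size=(2000, d))
+cands /= np.linalg.norm(cands, axis=1, keepdims=True)
+best = min(cands, key=lambda v: mom_risk(v, X, K))
+print("recovered leading direction:", np.round(best, 2))
+```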
+
+
+
+
+ + ♻ ☆ Friendly Noise against Adversarial Noise: A Powerful Defense against + Data Poisoning Attacks + + +
+ A powerful category of (invisible) data poisoning attacks modify a subset of +training examples by small adversarial perturbations to change the prediction +of certain test-time data. Existing defense mechanisms are not desirable to +deploy in practice, as they often either drastically harm the generalization +performance, or are attack-specific, and prohibitively slow to apply. Here, we +propose a simple but highly effective approach that unlike existing methods +breaks various types of invisible poisoning attacks with the slightest drop in +the generalization performance. We make the key observation that attacks +introduce local sharp regions of high training loss, which when minimized, +results in learning the adversarial perturbations and makes the attack +successful. To break poisoning attacks, our key idea is to alleviate the sharp +loss regions introduced by poisons. To do so, our approach comprises two +components: an optimized friendly noise that is generated to maximally perturb +examples without degrading the performance, and a randomly varying noise +component. The combination of both components builds a very light-weight but +extremely effective defense against the most powerful triggerless targeted and +hidden-trigger backdoor poisoning attacks, including Gradient Matching, +Bulls-eye Polytope, and Sleeper Agent. We show that our friendly noise is +transferable to other architectures, and adaptive attacks cannot break our +defense due to its random noise component. Our code is available at: +https://github.com/tianyu139/friendly-noise + +
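+ A minimal sketch of the friendly-noise idea, assuming a simple model and a
+KL-plus-norm objective chosen for illustration (the paper's exact formulation,
+budget, and hyperparameters may differ): the noise is optimized to be as large
+as the budget allows while keeping the model's predictions close to the clean
+ones, and a small random component is added on top.
+
+```python
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+torch.manual_seed(0)
+
+model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10)).eval()
+x = torch.randn(16, 32)                      # a batch of training inputs
+eps, lam = 0.5, 1.0                          # noise budget and trade-off weight
+
+with torch.no_grad():
+    clean_out = F.log_softmax(model(x), dim=1)
+
+delta = torch.zeros_like(x, requires_grad=True)
+opt = torch.optim.Adam([delta], lr=0.05)
+for _ in range(100):
+    opt.zero_grad()
+    noisy_out = F.log_softmax(model(x + delta), dim=1)
+    # Keep predictions close to the clean ones (KL term) while making the
+    # noise as large as possible (negative norm term).
+    kl = F.kl_div(noisy_out, clean_out, reduction="batchmean", log_target=True)
+    loss = kl - lam * delta.norm(p=2, dim=1).mean()
+    loss.backward()
+    opt.step()
+    with torch.no_grad():
+        delta.clamp_(-eps, eps)              # enforce the noise budget
+
+friendly = delta.detach()
+random_part = 0.1 * torch.randn_like(x)      # lightweight random component
+x_defended = x + friendly + random_part      # inputs used for training
+```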
+
+ comment: Code available at: https://github.com/tianyu139/friendly-noise +
+
+
+
+
+ + ♻ ☆ Data-Efficient Augmentation for Training Neural Networks + + +
+ Data augmentation is essential to achieve state-of-the-art performance in +many deep learning applications. However, the most effective augmentation +techniques become computationally prohibitive for even medium-sized datasets. +To address this, we propose a rigorous technique to select subsets of data +points that when augmented, closely capture the training dynamics of full data +augmentation. We first show that data augmentation, modeled as additive +perturbations, improves learning and generalization by relatively enlarging and +perturbing the smaller singular values of the network Jacobian, while +preserving its prominent directions. This prevents overfitting and enhances +learning the harder to learn information. Then, we propose a framework to +iteratively extract small subsets of training data that when augmented, closely +capture the alignment of the fully augmented Jacobian with labels/residuals. We +prove that stochastic gradient descent applied to the augmented subsets found +by our approach has similar training dynamics to that of fully augmented data. +Our experiments demonstrate that our method achieves 6.3x speedup on CIFAR10 +and 2.2x speedup on SVHN, and outperforms the baselines by up to 10% across +various subset sizes. Similarly, on TinyImageNet and ImageNet, our method beats +the baselines by up to 8%, while achieving up to 3.3x speedup across various +subset sizes. Finally, training on and augmenting 50% subsets using our method +on a version of CIFAR10 corrupted with label noise even outperforms using the +full dataset. Our code is available at: +https://github.com/tianyu139/data-efficient-augmentation + +
+
+ comment: Code available at: + https://github.com/tianyu139/data-efficient-augmentation +
+
+
+
+
+ + ♻ ☆ Pythae: Unifying Generative Autoencoders in Python -- A Benchmarking Use + Case NeurIPS 2022 + + +
+ In recent years, deep generative models have attracted increasing interest +due to their capacity to model complex distributions. Among those models, +variational autoencoders have gained popularity as they have proven both to be +computationally efficient and yield impressive results in multiple fields. +Following this breakthrough, extensive research has been done in order to +improve the original publication, resulting in a variety of different VAE +models in response to different tasks. In this paper we present Pythae, a +versatile open-source Python library providing both a unified implementation +and a dedicated framework allowing straightforward, reproducible and reliable +use of generative autoencoder models. We then propose to use this library to +perform a case study benchmark where we present and compare 19 generative +autoencoder models representative of some of the main improvements on +downstream tasks such as image reconstruction, generation, classification, +clustering and interpolation. The open-source library can be found at +https://github.com/clementchadebec/benchmark_VAE. + +
+
+ comment: Accepted to NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ Nonuniqueness and Convergence to Equivalent Solutions in Observer-based + Inverse Reinforcement Learning + + +
+ A key challenge in solving the deterministic inverse reinforcement learning +(IRL) problem online and in real-time is the existence of multiple solutions. +Nonuniqueness necessitates the study of the notion of equivalent solutions, +i.e., solutions that result in a different cost functional but same feedback +matrix, and convergence to such solutions. While offline algorithms that result +in convergence to equivalent solutions have been developed in the literature, +online, real-time techniques that address nonuniqueness are not available. In +this paper, a regularized history stack observer that converges to +approximately equivalent solutions of the IRL problem is developed. Novel +data-richness conditions are developed to facilitate the analysis and +simulation results are provided to demonstrate the effectiveness of the +developed technique. + +
+
+ comment: 16 pages, 7 figures, submitted to American Controls Conference 2023 +
+
+
+
+
+ + ♻ ☆ Tangent Transformers for Composition, Privacy and Removal + + +
+ We introduce Tangent Attention Fine-Tuning (TAFT), a method for fine-tuning +linearized transformers obtained by computing a First-order Taylor Expansion +around a pre-trained initialization. We show that the Jacobian-Vector Product +resulting from linearization can be computed efficiently in a single forward +pass, reducing training and inference cost to the same order of magnitude as +its original non-linear counterpart, while using the same number of parameters. +Furthermore, we show that, when applied to various downstream visual +classification tasks, the resulting Tangent Transformer fine-tuned with TAFT +can perform comparably with fine-tuning the original non-linear network. Since +Tangent Transformers are linear with respect to the new set of weights, and the +resulting fine-tuning loss is convex, we show that TAFT enjoys several +advantages compared to non-linear fine-tuning when it comes to model +composition, parallel training, machine unlearning, and differential privacy. + +
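+ A minimal sketch of the linearization behind tangent models, on a toy
+two-layer network rather than a transformer: the tangent prediction is the
+pre-trained output plus a single Jacobian-vector product with the weight
+update. This is an illustration of the first-order Taylor expansion, not the
+TAFT implementation.
+
+```python
+import torch
+from torch.autograd.functional import jvp
+
+torch.manual_seed(0)
+
+# A tiny stand-in for a pre-trained network; TAFT itself targets transformers.
+w0 = [torch.randn(4, 8), torch.randn(4), torch.randn(2, 4), torch.randn(2)]
+x = torch.randn(1, 8)
+
+def f(params, x):
+    W1, b1, W2, b2 = params
+    return (x @ W1.T + b1).relu() @ W2.T + b2
+
+# Fine-tuned weights = pre-trained weights + a small update dw.
+dw = [0.01 * torch.randn_like(p) for p in w0]
+
+# Tangent model: f(x; w0) + J_w f(x; w0) @ dw, obtained with one JVP.
+out0, jvp_out = jvp(lambda *p: f(p, x), tuple(w0), tuple(dw))
+tangent_pred = out0 + jvp_out
+
+print(tangent_pred)
+print(f([p + d for p, d in zip(w0, dw)], x))  # non-linear model, for comparison
+```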
+
+
+
+
+ + ♻ ☆ Emotion-Conditioned Melody Harmonization with Hierarchical Variational + Autoencoder + + +
+ Existing melody harmonization models have made great progress in improving +the quality of generated harmonies, but most of them ignored the emotions +beneath the music. Meanwhile, the variability of harmonies generated by +previous methods is insufficient. To solve these problems, we propose a novel +LSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the +influence of emotional conditions on melody harmonization, while improving the +quality of generated harmonies and capturing the abundant variability of chord +progressions. Specifically, LHVAE incorporates latent variables and emotional +conditions at different levels (piece- and bar-level) to model the global and +local music properties. Additionally, we introduce an attention-based melody +context vector at each step to better learn the correspondence between melodies +and harmonies. Objective experimental results show that our proposed model +outperforms other LSTM-based models. Through subjective evaluation, we conclude +that only altering the types of chords hardly changes the overall emotion of +the music. The qualitative analysis demonstrates the ability of our model to +generate variable harmonies. + +
+
+ comment: Accepted by IEEE SMC 2023 +
+
+
+
+
+ + ♻ ☆ Improving Uncertainty Quantification of Variance Networks by + Tree-Structured Learning + + +
+ To improve the uncertainty quantification of variance networks, we propose a +novel tree-structured local neural network model that partitions the feature +space into multiple regions based on uncertainty heterogeneity. A tree is built +upon giving the training data, whose leaf nodes represent different regions +where region-specific neural networks are trained to predict both the mean and +the variance for quantifying uncertainty. The proposed Uncertainty-Splitting +Neural Regression Tree (USNRT) employs novel splitting criteria. At each node, +a neural network is trained on the full data first, and a statistical test for +the residuals is conducted to find the best split, corresponding to the two +sub-regions with the most significant uncertainty heterogeneity between them. +USNRT is computationally friendly because very few leaf nodes are sufficient +and pruning is unnecessary. Furthermore, an ensemble version can be easily +constructed to estimate the total uncertainty including the aleatory and +epistemic. On extensive UCI datasets, USNRT or its ensemble shows superior +performance compared to some recent popular methods for quantifying uncertainty +with variances. Through comprehensive visualization and analysis, we uncover +how USNRT works and show its merits, revealing that uncertainty heterogeneity +does exist in many datasets and can be learned by USNRT. + +
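+ A toy sketch of the splitting idea: fit a simple model on the node, then pick
+the threshold whose two sides show the most significant difference in residual
+variance. The polynomial fit and Levene's test below stand in for the per-node
+neural network and the statistical test used in the paper.
+
+```python
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(0)
+
+# 1-D toy data whose noise level changes at x = 0 (uncertainty heterogeneity).
+x = rng.uniform(-3, 3, size=400)
+y = np.sin(x) + rng.normal(scale=np.where(x < 0, 0.1, 0.8))
+
+# Fit a mean model on the whole node and take residuals.
+coef = np.polyfit(x, y, deg=5)
+resid = y - np.polyval(coef, x)
+
+# Scan candidate thresholds; keep the split whose two sides have the most
+# significantly different residual variances.
+best_t, best_p = None, 1.0
+for t in np.quantile(x, np.linspace(0.1, 0.9, 17)):
+    left, right = resid[x <= t], resid[x > t]
+    p = stats.levene(left, right).pvalue
+    if p < best_p:
+        best_t, best_p = t, p
+
+print(f"best split at x = {best_t:.2f}, p-value = {best_p:.1e}")
+```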
+
+
+
+
+ + ♻ ☆ It Is All About Data: A Survey on the Effects of Data on Adversarial + Robustness + + +
+ Adversarial examples are inputs to machine learning models that an attacker +has intentionally designed to confuse the model into making a mistake. Such +examples pose a serious threat to the applicability of machine-learning-based +systems, especially in life- and safety-critical domains. To address this +problem, the area of adversarial robustness investigates mechanisms behind +adversarial attacks and defenses against these attacks. This survey reviews a +particular subset of this literature that focuses on investigating properties +of training data in the context of model robustness under evasion attacks. It +first summarizes the main properties of data leading to adversarial +vulnerability. It then discusses guidelines and techniques for improving +adversarial robustness by enhancing the data representation and learning +procedures, as well as techniques for estimating robustness guarantees given +particular data. Finally, it discusses gaps of knowledge and promising future +research directions in this area. + +
+
+ comment: 51 pages, 25 figures, under review +
+
+
+
+
+ + ♻ ☆ Deep-Q Learning with Hybrid Quantum Neural Network on Solving Maze + Problems + + +
+ Quantum computing holds great potential for overcoming the limitations of
+machine learning algorithms, allowing them to handle higher data dimensions and
+reducing the overall number of training parameters in deep neural network (DNN)
+models. This study uses a parameterized quantum circuit (PQC) on a gate-based
+quantum computer to investigate the potential for quantum advantage in a
+model-free reinforcement learning problem. Through a comprehensive
+investigation and evaluation of the current models and capabilities of quantum
+computers, we designed and trained a novel hybrid quantum neural network based
+on the latest Qiskit and PyTorch frameworks. We compared its performance with a
+fully classical DNN with and without an integrated PQC. Our research provides
+insights into the potential of deep quantum learning to solve a maze problem
+and, potentially, other reinforcement learning problems. We conclude that
+various reinforcement learning problems can be solved effectively within a
+reasonable number of training epochs. Moreover, we provide a comparative
+discussion of various quantum reinforcement learning models on maze problems to
+evaluate our research's overall potential and advantages.
+
+
+
+
+
+ + ♻ ☆ MultiRobustBench: Benchmarking Robustness Against Multiple Attacks ICML 2023 + + +
+ The bulk of existing research in defending against adversarial examples +focuses on defending against a single (typically bounded Lp-norm) attack, but +for a practical setting, machine learning (ML) models should be robust to a +wide variety of attacks. In this paper, we present the first unified framework +for considering multiple attacks against ML models. Our framework is able to +model different levels of learner's knowledge about the test-time adversary, +allowing us to model robustness against unforeseen attacks and robustness +against unions of attacks. Using our framework, we present the first +leaderboard, MultiRobustBench, for benchmarking multiattack evaluation which +captures performance across attack types and attack strengths. We evaluate the +performance of 16 defended models for robustness against a set of 9 different +attack types, including Lp-based threat models, spatial transformations, and +color changes, at 20 different attack strengths (180 attacks total). +Additionally, we analyze the state of current defenses against multiple +attacks. Our analysis shows that while existing defenses have made progress in +terms of average robustness across the set of attacks used, robustness against +the worst-case attack is still a big open problem as all existing models +perform worse than random guessing. + +
+
+ comment: ICML 2023 +
+
+
+
+
+ + ♻ ☆ AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide + for Simultaneous Speech Translation + + +
+ Attention is the core mechanism of today's most used architectures for
+natural language processing and has been analyzed from many perspectives,
+including its effectiveness for machine translation-related tasks. Among these
+studies, attention has proved to be a useful source of information for gaining
+insights into word alignment, even when the input text is replaced by audio
+segments, as in the speech translation (ST) task. In this paper, we propose
+AlignAtt, a novel policy for simultaneous ST (SimulST) that exploits the
+attention information to generate source-target alignments that guide the
+model during inference. Through experiments on the 8 language pairs of MuST-C
+v1.0, we show that AlignAtt outperforms previous state-of-the-art SimulST
+policies applied to offline-trained models, with gains of 2 BLEU points and
+latency reductions ranging from 0.5s to 0.8s across the 8 languages.
+
+
+ comment: Accepted at Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Solvent: A Framework for Protein Folding + + +
+ Consistency and reliability are crucial for conducting AI research. Many
+well-established research fields, such as object detection, have been compared
+and validated with solid benchmark frameworks. After AlphaFold2, the protein
+folding task has entered a new phase, and many methods have been proposed based
+on components of AlphaFold2. A unified research framework in protein folding is
+therefore important: it should contain implementations and benchmarks to
+consistently and fairly compare various approaches. To achieve this, we present
+Solvent, a protein folding framework that supports significant components of
+state-of-the-art models through an off-the-shelf interface. Solvent contains
+different models implemented in a unified codebase and supports training and
+evaluation of the defined models on the same dataset. We benchmark well-known
+algorithms and their components and provide experiments that give helpful
+insights into the protein structure modeling field. We hope that Solvent will
+increase the reliability and consistency of proposed models and improve
+efficiency in both speed and cost, thereby accelerating research on protein
+folding modeling. The code is available at
+https://github.com/kakaobrain/solvent, and the project will continue to be
+developed.
+
+
+ comment: preprint, 8 pages
+
+
+
+
+ + ♻ ☆ Efficient Guided Generation for Large Language Models + + +
+ In this article we describe an efficient approach to guiding language model +text generation with regular expressions and context-free grammars. Our +approach adds little to no overhead to the token sequence generation process, +and makes guided generation feasible in practice. An implementation is provided +in the open source Python library Outlines. + +
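+ A minimal sketch of constrained decoding by logit masking, using a
+hand-rolled prefix check for the pattern \d+\.\d+ in place of the
+regex-to-automaton index that a library such as Outlines builds; the toy
+vocabulary and the uniform "model" are placeholders.
+
+```python
+import math
+import re
+import numpy as np
+
+vocab = ["0", "1", "2", "3", "4", ".", "a", "b", "yes", "no", "<eos>"]
+
+def is_valid_prefix(text):
+    # True if `text` can still be extended to match digits '.' digits,
+    # e.g. '3.14'. A hand-rolled stand-in for a regex-derived automaton.
+    return re.fullmatch(r"\d*|\d+\.\d*", text) is not None
+
+def is_complete(text):
+    return re.fullmatch(r"\d+\.\d+", text) is not None
+
+rng = np.random.default_rng(0)
+out = ""
+for _ in range(8):
+    logits = rng.normal(size=len(vocab))      # toy stand-in for model logits
+    for i, tok in enumerate(vocab):           # mask disallowed continuations
+        ok = (tok == "<eos>" and is_complete(out)) or \
+             (tok != "<eos>" and is_valid_prefix(out + tok))
+        if not ok:
+            logits[i] = -math.inf
+    probs = np.exp(logits - logits.max())
+    probs /= probs.sum()
+    tok = vocab[rng.choice(len(vocab), p=probs)]
+    if tok == "<eos>":
+        break
+    out += tok
+
+print(out)   # always a valid prefix of \d+\.\d+; complete if <eos> fired
+```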
+
+
+
+
+ + ♻ ☆ Shift-Robust Molecular Relational Learning with Causal Substructure KDD 2023 + + +
+ Recently, molecular relational learning, whose goal is to predict the +interaction behavior between molecular pairs, got a surge of interest in +molecular sciences due to its wide range of applications. In this work, we +propose CMRL that is robust to the distributional shift in molecular relational +learning by detecting the core substructure that is causally related to +chemical reactions. To do so, we first assume a causal relationship based on +the domain knowledge of molecular sciences and construct a structural causal +model (SCM) that reveals the relationship between variables. Based on the SCM, +we introduce a novel conditional intervention framework whose intervention is +conditioned on the paired molecule. With the conditional intervention +framework, our model successfully learns from the causal substructure and +alleviates the confounding effect of shortcut substructures that are spuriously +correlated to chemical reactions. Extensive experiments on various tasks with +real-world and synthetic datasets demonstrate the superiority of CMRL over +state-of-the-art baseline models. Our code is available at +https://github.com/Namkyeong/CMRL. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Computing the gradients with respect to all parameters of a quantum + neural network using a single circuit + + +
+ When computing the gradients of a quantum neural network using the
+parameter-shift rule, the cost function needs to be calculated twice for the
+gradient with respect to a single adjustable parameter of the network. When the
+total number of parameters is high, the quantum circuit for the computation has
+to be adjusted and run many times. Here we propose an approach to compute all
+the gradients using a single circuit only, with a much reduced circuit depth
+and fewer classical registers. We also demonstrate experimentally, on both real
+quantum hardware and a simulator, that our approach has the advantage that the
+circuit takes a significantly shorter time to compile than with the
+conventional approach, resulting in a speedup of the total runtime.
+
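+ For reference, the conventional parameter-shift rule that this work seeks to
+improve upon can be illustrated on a single-qubit expectation value (a
+classical simulation, not the authors' single-circuit method): two circuit
+evaluations yield the exact gradient with respect to one parameter.
+
+```python
+import numpy as np
+
+def expectation(theta):
+    # <Z> after RY(theta) applied to |0>, which equals cos(theta).
+    # Stands in for one run of the quantum circuit.
+    state = np.array([np.cos(theta / 2), np.sin(theta / 2)])
+    Z = np.diag([1.0, -1.0])
+    return state @ Z @ state
+
+def parameter_shift_grad(theta, shift=np.pi / 2):
+    # Standard rule: two circuit evaluations per parameter.
+    return (expectation(theta + shift) - expectation(theta - shift)) / 2
+
+theta = 0.7
+print(parameter_shift_grad(theta))   # ~ -sin(0.7)
+print(-np.sin(theta))                # analytic gradient, for comparison
+```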
+
+ comment: Added a suggestion on improving real quantum computers +
+
+
+
+
+ + ♻ ☆ Identifying the Hazard Boundary of ML-enabled Autonomous Systems Using + Cooperative Co-Evolutionary Search + + +
+ In Machine Learning (ML)-enabled autonomous systems (MLASs), it is essential +to identify the hazard boundary of ML Components (MLCs) in the MLAS under +analysis. Given that such boundary captures the conditions in terms of MLC +behavior and system context that can lead to hazards, it can then be used to, +for example, build a safety monitor that can take any predefined fallback +mechanisms at runtime when reaching the hazard boundary. However, determining +such hazard boundary for an ML component is challenging. This is due to the +problem space combining system contexts (i.e., scenarios) and MLC behaviors +(i.e., inputs and outputs) being far too large for exhaustive exploration and +even to handle using conventional metaheuristics, such as genetic algorithms. +Additionally, the high computational cost of simulations required to determine +any MLAS safety violations makes the problem even more challenging. +Furthermore, it is unrealistic to consider a region in the problem space +deterministically safe or unsafe due to the uncontrollable parameters in +simulations and the non-linear behaviors of ML models (e.g., deep neural +networks) in the MLAS under analysis. To address the challenges, we propose +MLCSHE (ML Component Safety Hazard Envelope), a novel method based on a +Cooperative Co-Evolutionary Algorithm (CCEA), which aims to tackle a +high-dimensional problem by decomposing it into two lower-dimensional search +subproblems. Moreover, we take a probabilistic view of safe and unsafe regions +and define a novel fitness function to measure the distance from the +probabilistic hazard boundary and thus drive the search effectively. We +evaluate the effectiveness and efficiency of MLCSHE on a complex Autonomous +Vehicle (AV) case study. Our evaluation results show that MLCSHE is +significantly more effective and efficient compared to a standard genetic +algorithm and random search. + +
+
+
+
+
+ + ♻ ☆ Provably Faster Gradient Descent via Long Steps + + +
+ This work establishes provably faster convergence rates for gradient descent +in smooth convex optimization via a computer-assisted analysis technique. Our +theory allows nonconstant stepsize policies with frequent long steps +potentially violating descent by analyzing the overall effect of many +iterations at once rather than the typical one-iteration inductions used in +most first-order method analyses. We show that long steps, which may increase +the objective value in the short term, lead to provably faster convergence in +the long term. A conjecture towards proving a faster $O(1/T\log T)$ rate for +gradient descent is also motivated along with simple numerical validation. + +
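+ A toy comparison of constant 1/L steps against a periodic stepsize pattern
+that includes an occasional long step, on a random convex quadratic; the
+pattern below is illustrative and is not the certified stepsize sequence
+analyzed in the paper.
+
+```python
+import numpy as np
+
+# Smooth convex quadratic f(x) = 0.5 * x^T A x with L = largest eigenvalue.
+rng = np.random.default_rng(0)
+Q = rng.normal(size=(20, 20))
+A = Q.T @ Q / 20
+L = np.linalg.eigvalsh(A).max()
+
+def gd(stepsizes, T=300):
+    x = np.ones(20)
+    for t in range(T):
+        x = x - (stepsizes[t % len(stepsizes)] / L) * (A @ x)
+    return 0.5 * x @ A @ x
+
+constant = gd([1.0])                        # classical 1/L steps
+long_steps = gd([1.5, 1.5, 1.5, 1.5, 6.0])  # periodic pattern with a long step
+print(f"constant 1/L steps : {constant:.3e}")
+print(f"with long steps    : {long_steps:.3e}")
+```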
+
+ comment: Apologies for the several updates done shortly after first posting + this work: In these, I have added more references to excellent relevant works + I missed in my initial literature review, esp the Master's thesis of Jason + Altschuler +
+
+
+
+
+ + ♻ ☆ Preprocessors Matter! Realistic Decision-Based Attacks on Machine + Learning Systems ICML 2023 + + +
+ Decision-based attacks construct adversarial examples against a machine
+learning (ML) model by making only hard-label queries. These attacks have
+mainly been applied directly to standalone neural networks. However, in
+practice, ML models are just one component of a larger learning system. We find
+that by adding a single preprocessor in front of a classifier, state-of-the-art
+query-based attacks are up to 7$\times$ less effective at attacking a
+prediction pipeline than at attacking the model alone. We explain this
+discrepancy by the fact that most preprocessors introduce some notion of
+invariance to the input space. Hence, attacks that are unaware of this
+invariance inevitably waste a large number of queries to re-discover or
+overcome it. We, therefore, develop techniques to (i) reverse-engineer the
+preprocessor and then (ii) use this extracted information to attack the
+end-to-end system. Our preprocessor extraction method requires only a few
+hundred queries, and our preprocessor-aware attacks recover the same efficacy
+as when attacking the model alone. The code can be found at
+https://github.com/google-research/preprocessor-aware-black-box-attack.
+
+
+ comment: ICML 2023. Code can be found at + https://github.com/google-research/preprocessor-aware-black-box-attack +
+
+
+
+
+ + ♻ ☆ High-dimensional and Permutation Invariant Anomaly Detection + + +
+ Methods for anomaly detection of new physics processes are often limited to +low-dimensional spaces due to the difficulty of learning high-dimensional +probability densities. Particularly at the constituent level, incorporating +desirable properties such as permutation invariance and variable-length inputs +becomes difficult within popular density estimation methods. In this work, we +introduce a permutation-invariant density estimator for particle physics data +based on diffusion models, specifically designed to handle variable-length +inputs. We demonstrate the efficacy of our methodology by utilizing the learned +density as a permutation-invariant anomaly detection score, effectively +identifying jets with low likelihood under the background-only hypothesis. To +validate our density estimation method, we investigate the ratio of learned +densities and compare to those obtained by a supervised classification +algorithm. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Convergence Rate for Manifold Neural Networks + + +
+ High-dimensional data arises in numerous applications, and the rapidly +developing field of geometric deep learning seeks to develop neural network +architectures to analyze such data in non-Euclidean domains, such as graphs and +manifolds. Recent work by Z. Wang, L. Ruiz, and A. Ribeiro has introduced a +method for constructing manifold neural networks using the spectral +decomposition of the Laplace Beltrami operator. Moreover, in this work, the +authors provide a numerical scheme for implementing such neural networks when +the manifold is unknown and one only has access to finitely many sample points. +The authors show that this scheme, which relies upon building a data-driven +graph, converges to the continuum limit as the number of sample points tends to +infinity. Here, we build upon this result by establishing a rate of convergence +that depends on the intrinsic dimension of the manifold but is independent of +the ambient dimension. We also discuss how the rate of convergence depends on +the depth of the network and the number of filters used in each layer. + +
+
+
+
+
+ + ♻ ☆ Monotonic Risk Relationships under Distribution Shifts for Regularized + Risk Minimization + + +
+ Machine learning systems are often applied to data that is drawn from a +different distribution than the training distribution. Recent work has shown +that for a variety of classification and signal reconstruction problems, the +out-of-distribution performance is strongly linearly correlated with the +in-distribution performance. If this relationship or more generally a monotonic +one holds, it has important consequences. For example, it allows to optimize +performance on one distribution as a proxy for performance on the other. In +this paper, we study conditions under which a monotonic relationship between +the performances of a model on two distributions is expected. We prove an exact +asymptotic linear relation for squared error and a monotonic relation for +misclassification error for ridge-regularized general linear models under +covariate shift, as well as an approximate linear relation for linear inverse +problems. + +
+
+ comment: 34 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization + Using Floating-Point Formats + + +
+ In the complex domain of large language models (LLMs), striking a balance +between computational efficiency and maintaining model quality is a formidable +challenge. Navigating the inherent limitations of uniform quantization, +particularly when dealing with outliers, and motivated by the launch of +NVIDIA's H100 hardware, this study delves into the viability of floating-point +(FP) quantization, particularly focusing on FP8 and FP4, as a potential +solution. Our comprehensive investigation reveals that for LLMs, FP8 activation +consistently outshines its integer (INT8) equivalent, with the performance edge +becoming more noticeable in models possessing parameters beyond one billion. +For weight quantization, our findings indicate that FP4 exhibits comparable, if +not superior, performance to INT4, simplifying deployment on FP-supported +hardware like H100. To mitigate the overhead from precision alignment caused by +the disparity between weights and activations, we propose two scaling +constraints for weight quantization that negligibly impact the performance +compared to the standard W4A8 model. We additionally enhance our quantization +methods by integrating the Low Rank Compensation (LoRC) strategy, yielding +improvements especially in smaller models. The results of our investigation +emphasize the immense potential of FP quantization for LLMs, paving the way for +high-efficiency deployment in resource-limited settings. + +
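+ As a rough illustration of why a floating-point grid can suit outlier-heavy
+weights, the sketch below round-to-nearest quantizes a synthetic weight vector
+onto a 4-bit floating-point grid (one common E2M1-style value set, assumed
+here) and onto symmetric INT4, then compares the errors; it is not the
+ZeroQuant-FP pipeline.
+
+```python
+import numpy as np
+
+# Representable magnitudes of a 4-bit floating-point format (E2M1 variant)
+# and of symmetric INT4; both grids are illustrative.
+FP4_POS = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
+INT4_POS = np.arange(0, 8, dtype=float)
+
+def quantize(w, grid_pos):
+    scale = np.abs(w).max() / grid_pos.max()     # per-tensor absmax scaling
+    grid = np.concatenate([-grid_pos[::-1], grid_pos]) * scale
+    idx = np.abs(w[:, None] - grid[None, :]).argmin(axis=1)
+    return grid[idx]
+
+rng = np.random.default_rng(0)
+w = rng.standard_normal(4096) * 0.1
+w[:4] = [1.2, -0.9, 0.8, -1.1]                   # a few outlier weights
+
+for name, grid in [("FP4 ", FP4_POS), ("INT4", INT4_POS)]:
+    err = np.mean((w - quantize(w, grid)) ** 2)
+    print(f"{name} mean squared quantization error: {err:.2e}")
+```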
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Investigating VTubing as a Reconstruction of Streamer Self-Presentation: + Identity, Performance, and Gender SC + + +
+ VTubers, or Virtual YouTubers, are live streamers who create streaming
+content using animated 2D or 3D virtual avatars. In recent years, there has
+been a significant increase in the number of VTuber creators and viewers across
+the globe. This practice has drawn research attention to topics such as
+viewers' engagement behaviors and perceptions. However, as animated avatars
+offer more identity and performance flexibility than traditional live
+streaming, where one uses their own body, little research has focused on how
+this flexibility influences the way creators present themselves. This research
+thus seeks to fill this gap by presenting results from a qualitative study of
+16 Chinese-speaking VTubers' streaming practices. The data revealed that the
+virtual avatars used while live streaming afforded creators opportunities to
+present themselves using inflated presentations and resulted in inclusive
+interactions with viewers. The results also unveiled the inflated, and often
+sexualized, gender expressions of VTubers while they were situated in
+misogynistic environments. The socio-technical facets of VTubing were found to
+potentially reduce sexual harassment and sexism, whilst also raising
+self-objectification concerns.
+
+
+ comment: Under review at ACM CSCW after a Major Revision +
+
+
+
+
+ + ☆ Meta-Transformer: A Unified Framework for Multimodal Learning + + +
+ Multimodal learning aims to build models that can process and relate +information from multiple modalities. Despite years of development in this +field, it still remains challenging to design a unified network for processing +various modalities ($\textit{e.g.}$ natural language, 2D images, 3D point +clouds, audio, video, time series, tabular data) due to the inherent gaps among +them. In this work, we propose a framework, named Meta-Transformer, that +leverages a $\textbf{frozen}$ encoder to perform multimodal perception without +any paired multimodal training data. In Meta-Transformer, the raw input data +from various modalities are mapped into a shared token space, allowing a +subsequent encoder with frozen parameters to extract high-level semantic +features of the input data. Composed of three main components: a unified data +tokenizer, a modality-shared encoder, and task-specific heads for downstream +tasks, Meta-Transformer is the first framework to perform unified learning +across 12 modalities with unpaired data. Experiments on different benchmarks +reveal that Meta-Transformer can handle a wide range of tasks including +fundamental perception (text, image, point cloud, audio, video), practical +application (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph, +tabular, and time-series). Meta-Transformer indicates a promising future for +developing unified multimodal intelligence with transformers. Code will be +available at https://github.com/invictus717/MetaTransformer + +
+
+ comment: Project website: https://kxgong.github.io/meta_transformer/ +
+
+
+
+
+ + ☆ RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching + Detection + + +
+ The widespread use of face retouching filters on short-video platforms has +raised concerns about the authenticity of digital appearances and the impact of +deceptive advertising. To address these issues, there is a pressing need to +develop advanced face retouching techniques. However, the lack of large-scale +and fine-grained face retouching datasets has been a major obstacle to progress +in this field. In this paper, we introduce RetouchingFFHQ, a large-scale and +fine-grained face retouching dataset that contains over half a million +conditionally-retouched images. RetouchingFFHQ stands out from previous +datasets due to its large scale, high quality, fine-grainedness, and +customization. By including four typical types of face retouching operations +and different retouching levels, we extend the binary face retouching detection +into a fine-grained, multi-retouching type, and multi-retouching level +estimation problem. Additionally, we propose a Multi-granularity Attention +Module (MAM) as a plugin for CNN backbones for enhanced cross-scale +representation learning. Extensive experiments using different baselines as +well as our proposed method on RetouchingFFHQ show decent performance on face +retouching detection. With the proposed new dataset, we believe there is great +potential for future work to tackle the challenging problem of real-world +fine-grained face retouching detection. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model CVPR 2023 + + +
+ Multimodal semantic understanding often has to deal with uncertainty, which +means the obtained messages tend to refer to multiple targets. Such uncertainty +is problematic for our interpretation, including inter- and intra-modal +uncertainty. Little effort has studied the modeling of this uncertainty, +particularly in pre-training on unlabeled datasets and fine-tuning in +task-specific downstream datasets. In this paper, we project the +representations of all modalities as probabilistic distributions via a +Probability Distribution Encoder (PDE) by utilizing sequence-level +interactions. Compared to the existing deterministic methods, such uncertainty +modeling can convey richer multimodal semantic information and more complex +relationships. Furthermore, we integrate uncertainty modeling with popular +pre-training frameworks and propose suitable pre-training tasks: +Distribution-based Vision-Language Contrastive learning (D-VLC), +Distribution-based Masked Language Modeling (D-MLM), and Distribution-based +Image-Text Matching (D-ITM). The fine-tuned models are applied to challenging +downstream tasks, including image-text retrieval, visual question answering, +visual reasoning, and visual entailment, and achieve state-of-the-art results. + +
+
+ comment: CVPR 2023 Main Track Long Paper +
+
+
+
+
+ + ♻ ☆ Positive-Augmented Contrastive Learning for Image and Video Captioning + Evaluation CVPR 2023 + + +
+ The CLIP model has been recently proven to be very effective for a variety of +cross-modal tasks, including the evaluation of captions generated from +vision-and-language architectures. In this paper, we propose a new recipe for a +contrastive-based evaluation metric for image captioning, namely +Positive-Augmented Contrastive learning Score (PAC-S), that in a novel way +unifies the learning of a contrastive visual-semantic space with the addition +of generated images and text on curated data. Experiments spanning several +datasets demonstrate that our new metric achieves the highest correlation with +human judgments on both images and videos, outperforming existing +reference-based metrics like CIDEr and SPICE and reference-free metrics like +CLIP-Score. Finally, we test the system-level correlation of the proposed +metric when considering popular image captioning approaches, and assess the +impact of employing different cross-modal features. Our source code and trained +models are publicly available at: https://github.com/aimagelab/pacscore. + +
+
+ comment: CVPR 2023 (highlight paper) +
+
+
+
+
+ + ♻ ☆ Emotion-Conditioned Melody Harmonization with Hierarchical Variational + Autoencoder + + +
+ Existing melody harmonization models have made great progress in improving +the quality of generated harmonies, but most of them ignored the emotions +beneath the music. Meanwhile, the variability of harmonies generated by +previous methods is insufficient. To solve these problems, we propose a novel +LSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the +influence of emotional conditions on melody harmonization, while improving the +quality of generated harmonies and capturing the abundant variability of chord +progressions. Specifically, LHVAE incorporates latent variables and emotional +conditions at different levels (piece- and bar-level) to model the global and +local music properties. Additionally, we introduce an attention-based melody +context vector at each step to better learn the correspondence between melodies +and harmonies. Objective experimental results show that our proposed model +outperforms other LSTM-based models. Through subjective evaluation, we conclude +that only altering the types of chords hardly changes the overall emotion of +the music. The qualitative analysis demonstrates the ability of our model to +generate variable harmonies. + +
+
+ comment: Accepted by IEEE SMC 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 50 + +
+
+
+ + ☆ DialogStudio: Towards Richest and Most Diverse Unified Dataset + Collection for Conversational AI + + +
+ Despite advancements in conversational AI, language models encounter +challenges to handle diverse conversational tasks, and existing dialogue +dataset collections often lack diversity and comprehensiveness. To tackle these +issues, we introduce DialogStudio: the largest and most diverse collection of +dialogue datasets, unified under a consistent format while preserving their +original information. Our collection encompasses data from open-domain +dialogues, task-oriented dialogues, natural language understanding, +conversational recommendation, dialogue summarization, and knowledge-grounded +dialogues, making it an incredibly rich and diverse resource for dialogue +research and model training. To further enhance the utility of DialogStudio, we +identify the licenses for each dataset and design domain-aware prompts for +selected dialogues to facilitate instruction-aware fine-tuning. Furthermore, we +develop conversational AI models using the dataset collection, and our +experiments in both zero-shot and few-shot learning scenarios demonstrate the +superiority of DialogStudio. To improve transparency and support dataset and +task-based research, as well as language model pre-training, all datasets, +licenses, codes, and models associated with DialogStudio are made publicly +accessible at https://github.com/salesforce/DialogStudio + +
+
+
+
+
+ + ☆ Challenges and Applications of Large Language Models + + +
+ Large Language Models (LLMs) went from non-existent to ubiquitous in the +machine learning discourse within a few years. Due to the fast pace of the +field, it is difficult to identify the remaining challenges and already +fruitful application areas. In this paper, we aim to establish a systematic set +of open problems and application successes so that ML researchers can +comprehend the field's current state more quickly and become productive. + +
+
+ comment: 72 pages. v01. Work in progress. Feedback and comments are highly + appreciated! +
+
+
+
+
+ + ☆ LLMs as Workers in Human-Computational Algorithms? Replicating + Crowdsourcing Pipelines with LLMs + + +
+ LLMs have shown promise in replicating human-like behavior in crowdsourcing +tasks that were previously thought to be exclusive to human abilities. However, +current efforts focus mainly on simple atomic tasks. We explore whether LLMs +can replicate more complex crowdsourcing pipelines. We find that modern LLMs +can simulate some of crowdworkers' abilities in these "human computation +algorithms," but the level of success is variable and influenced by requesters' +understanding of LLM capabilities, the specific skills required for sub-tasks, +and the optimal interaction modality for performing these sub-tasks. We reflect +on human and LLMs' different sensitivities to instructions, stress the +importance of enabling human-facing safeguards for LLMs, and discuss the +potential of training humans and LLMs with complementary skill sets. Crucially, +we show that replicating crowdsourcing pipelines offers a valuable platform to +investigate (1) the relative strengths of LLMs on different tasks (by +cross-comparing their performances on sub-tasks) and (2) LLMs' potential in +complex tasks, where they can complete part of the tasks while leaving others +to humans. + +
+
+
+
+
+ + ☆ Exploring Transformer Extrapolation + + +
+ Length extrapolation has attracted considerable attention recently since it +allows transformers to be tested on longer sequences than those used in +training. Previous research has shown that this property can be attained by +using carefully designed Relative Positional Encodings (RPEs). While these +methods perform well on a variety of corpora, the conditions for length +extrapolation have yet to be investigated. This paper attempts to determine +what types of RPEs allow for length extrapolation through a thorough +mathematical and empirical analysis. We discover that a transformer is certain +to possess this property as long as the series that corresponds to the RPE's +exponential converges. Two practices are derived from the conditions and +examined in language modeling tasks on a variety of corpora. As a bonus from +the conditions, we derive a new Theoretical Receptive Field (TRF) to measure +the receptive field of RPEs without taking any training steps. Extensive +experiments are conducted on the Wikitext-103, Books, Github, and WikiBook +datasets to demonstrate the viability of our discovered conditions. We also +compare TRF to Empirical Receptive Field (ERF) across different models, showing +consistently matched trends on the aforementioned datasets. The code is +available at https://github.com/OpenNLPLab/Rpe. + +
+
+ comment: Zhen Qin and Yiran Zhong contribute equally to this paper; Yiran + Zhong is the corresponding author. The code is available at + https://github.com/OpenNLPLab/Rpe +
+
+
+
+
+ + ☆ Gradient Sparsification For Masked Fine-Tuning of Transformers IJCNN 2023 + + +
+ Fine-tuning pretrained self-supervised language models is widely adopted for +transfer learning to downstream tasks. Fine-tuning can be achieved by freezing +gradients of the pretrained network and only updating gradients of a newly +added classification layer, or by performing gradient updates on all +parameters. Gradual unfreezing makes a trade-off between the two by gradually +unfreezing gradients of whole layers during training. This has been an +effective strategy to trade-off between storage and training speed with +generalization performance. However, it is not clear whether gradually +unfreezing layers throughout training is optimal, compared to sparse variants +of gradual unfreezing which may improve fine-tuning performance. In this paper, +we propose to stochastically mask gradients to regularize pretrained language +models for improving overall fine-tuned performance. We introduce GradDrop and +variants thereof, a class of gradient sparsification methods that mask +gradients during the backward pass, acting as gradient noise. GradDrop is +sparse and stochastic unlike gradual freezing. Extensive experiments on the +multilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive +against methods that use additional translated data for intermediate +pretraining and outperforms standard fine-tuning and gradual unfreezing. A +post-analysis shows how GradDrop improves performance with languages it was not +trained on, such as under-resourced languages. + +
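+ A minimal sketch of the core masking step, assuming an unstructured Bernoulli
+mask applied to every parameter's gradient after the backward pass; the
+GradDrop variants in the paper may schedule, rescale, or structure the mask
+differently.
+
+```python
+import torch
+from torch import nn
+
+torch.manual_seed(0)
+
+model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))
+opt = torch.optim.SGD(model.parameters(), lr=0.1)
+loss_fn = nn.CrossEntropyLoss()
+drop_rate = 0.3                     # fraction of gradient entries masked
+
+x, y = torch.randn(8, 16), torch.randint(0, 2, (8,))
+
+for step in range(10):
+    opt.zero_grad()
+    loss_fn(model(x), y).backward()
+    # Stochastically mask gradients after the backward pass (gradient noise).
+    with torch.no_grad():
+        for p in model.parameters():
+            mask = (torch.rand_like(p.grad) > drop_rate).float()
+            p.grad.mul_(mask)
+    opt.step()
+```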
+
+ comment: Accepted to IJCNN 2023 +
+
+
+
+
+ + ☆ Android in the Wild: A Large-Scale Dataset for Android Device Control + + +
+ There is a growing interest in device-control systems that can interpret
+human natural language instructions and execute them on a digital device by
+directly controlling its user interface. We present a dataset for
+device-control research, Android in the Wild (AITW), which is orders of
+magnitude larger than current datasets. The dataset contains human
+demonstrations of device interactions, including the screens and actions, and
+corresponding natural language instructions. It consists of 715k episodes
+spanning 30k unique instructions, four versions of Android (v10-13), and eight
+device types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It
+contains multi-step tasks that require semantic understanding of language and
+visual context. This dataset poses a new challenge: actions available through
+the user interface must be inferred from their visual appearance. And, instead
+of simple UI element-based actions, the action space consists of precise
+gestures (e.g., horizontal scrolls to operate carousel widgets). We organize
+our dataset to encourage robustness analysis of device-control systems, i.e.,
+how well a system performs in the presence of new task descriptions, new
+applications, or new platform versions. We develop two agents and report
+performance across the dataset. The dataset is available at
+https://github.com/google-research/google-research/tree/master/android_in_the_wild.
+
+
+
+
+
+ + ☆ An Empirical Study on Fertility Proposals Using Multi-Grined Topic + Analysis Methods + + +
+ Fertility issues are closely related to population security, and with China's
+population entering negative growth for the first time in 60 years, changes to
+fertility policy are of great public concern. A proposal at the 2023 ``two
+sessions" suggested that the country should, through legislation, remove the
+marriage restriction from birth registration. The topic quickly became a
+trending issue on the Internet, and ``unbundling" birth registration from
+marriage has become a focus of social debate. In this paper, we adopt
+co-occurrence semantic analysis, topic analysis and sentiment analysis to
+conduct a multi-granularity semantic analysis of microblog comments. We find
+that the discussion of the proposal to ``remove marriage restrictions from
+birth registration" spans three dimensions: the individual, society and the
+state. It touches on social issues such as personal behaviour, social ethics
+and law, and national policy, and public sentiment is inclined to be negative
+on most topics. Based on this analysis, eight recommendations are made to
+inform governmental decision making and to offer a reference method for
+researching public opinion on political issues.
+
+
+ comment: 7 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ Generating Mathematical Derivations with Large Language Models + + +
+ The derivation of mathematical results in specialised fields using Large +Language Models (LLMs) is an emerging research direction that can help identify +models' limitations, and potentially support mathematical discovery. In this +paper, we leverage a symbolic engine to generate derivations of equations at +scale, and investigate the capabilities of LLMs when deriving goal equations +from premises. Specifically, we employ in-context learning for GPT and +fine-tune a range of T5 models to compare the robustness and generalisation of +pre-training strategies to specialised models. Empirical results show that +fine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and +out-of-distribution test sets in terms of absolute performance. However, an +in-depth analysis reveals that the fine-tuned models are more sensitive to +perturbations involving unseen symbols and (to a lesser extent) changes to +equation structure. In addition, we analyse 1.7K equations and over 200 +derivations to highlight common reasoning errors such as the inclusion of +incorrect, irrelevant, and redundant equations, along with the tendency to skip +derivation steps. Finally, we explore the suitability of existing metrics for +evaluating mathematical derivations finding evidence that, while they capture +general properties such as sensitivity to perturbations, they fail to highlight +fine-grained reasoning errors and essential differences between models. +Overall, this work demonstrates that training models on synthetic data can +improve their mathematical capabilities beyond larger architectures. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ GUIDO: A Hybrid Approach to Guideline Discovery & Ordering from Natural + Language Texts + + +
+ Extracting workflow nets from textual descriptions can be used to simplify +guidelines or formalize textual descriptions of formal processes like business +processes and algorithms. The task of manually extracting processes, however, +requires domain expertise and effort. While automatic process model extraction +is desirable, annotating texts with formalized process models is expensive. +Therefore, there are only a few machine-learning-based extraction approaches. +Rule-based approaches, in turn, require domain specificity to work well and can +rarely distinguish relevant and irrelevant information in textual descriptions. +In this paper, we present GUIDO, a hybrid approach to the process model +extraction task that first, classifies sentences regarding their relevance to +the process model, using a BERT-based sentence classifier, and second, extracts +a process model from the sentences classified as relevant, using dependency +parsing. The presented approach achieves significantly better results than a +pure rule-based approach. GUIDO achieves an average behavioral similarity score +of $0.93$. Still, in comparison to purely machine-learning-based approaches, +the annotation costs stay low. + +
+
+ comment: Preprint of the short paper presented at the 12th International + Conference on Data Science, Technology and Applications +
+
+
+
+
+ + ☆ Large Language Models can accomplish Business Process Management Tasks + + +
+ Business Process Management (BPM) aims to improve organizational activities +and their outcomes by managing the underlying processes. To achieve this, it is +often necessary to consider information from various sources, including +unstructured textual documents. Therefore, researchers have developed several +BPM-specific solutions that extract information from textual documents using +Natural Language Processing techniques. These solutions are specific to their +respective tasks and cannot accomplish multiple process-related problems as a +general-purpose instrument. However, in light of the recent emergence of Large +Language Models (LLMs) with remarkable reasoning capabilities, such a +general-purpose instrument with multiple applications now appears attainable. +In this paper, we illustrate how LLMs can accomplish text-related BPM tasks by +applying a specific LLM to three exemplary tasks: mining imperative process +models from textual descriptions, mining declarative process models from +textual descriptions, and assessing the suitability of process tasks from +textual descriptions for robotic process automation. We show that, without +extensive configuration or prompt engineering, LLMs perform comparably to or +better than existing solutions and discuss implications for future BPM research +as well as practical usage. + +
+
+ comment: Accepted at NLP4BPM workshop at BPM 2023 +
+
+
+
+
+ + ☆ Test-takers have a say: understanding the implications of the use of AI + in language tests + + +
+ Language tests measure a person's ability to use a language in terms of +listening, speaking, reading, or writing. Such tests play an integral role in +academic, professional, and immigration domains, with entities such as +educational institutions, professional accreditation bodies, and governments +using them to assess candidate language proficiency. Recent advances in +Artificial Intelligence (AI) and the discipline of Natural Language Processing +have prompted language test providers to explore AI's potential applicability +within language testing, leading to transformative activity patterns +surrounding language instruction and learning. However, with concerns over AI's +trustworthiness, it is imperative to understand the implications of integrating +AI into language testing. This knowledge will enable stakeholders to make +well-informed decisions, thus safeguarding community well-being and testing +integrity. To understand the concerns and effects of AI usage in language +tests, we conducted interviews and surveys with English test-takers. To the +best of our knowledge, this is the first empirical study aimed at identifying +the implications of AI adoption in language tests from a test-taker +perspective. Our study reveals test-taker perceptions and behavioral patterns. +Specifically, we identify that AI integration may enhance perceptions of +fairness, consistency, and availability. Conversely, it might incite mistrust +regarding reliability and interactivity aspects, subsequently influencing the +behaviors and well-being of test-takers. These insights provide a better +understanding of potential societal implications and assist stakeholders in +making informed decisions concerning AI usage in language testing. + +
+
+
+
+
+ + ☆ DAPrompt: Deterministic Assumption Prompt Learning for Event Causality + Identification + + +
+ Event Causality Identification (ECI) aims at determining whether there is a
+causal relation between two event mentions. Conventional prompt learning
+designs a prompt template that first predicts an answer word and then maps it
+to the final decision. Unlike conventional prompts, we argue that predicting an
+answer word may not be a necessary prerequisite for the ECI task. Instead, we
+can first make a deterministic assumption on the existence of a causal relation
+between two events and then evaluate its rationality to either accept or reject
+the assumption. The design motivation is to make the fullest use of the
+encyclopedia-like knowledge embedded in a pre-trained language model. In light
+of such considerations, we propose a deterministic assumption prompt learning
+model, called DAPrompt, for the ECI task. In particular, we design a simple
+deterministic assumption template, concatenated with the input event pair,
+which includes two masks as the predicted events' tokens. We use the
+probabilities of the predicted events to evaluate the assumption rationality
+for the final event causality decision. Experiments on the EventStoryLine
+corpus and Causal-TimeBank corpus validate our design objective in terms of
+significant performance improvements over the state-of-the-art algorithms.
+
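+
+ A hedged sketch of the assumption-style prompting described above: assume a
+causal relation, let a masked language model fill the two event slots, and read
+the rationality of the assumption off the probabilities it assigns to the
+actual event words. The template, the single-subtoken approximation, and the
+scoring rule are simplifications, not the exact DAPrompt formulation.
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+tok = AutoTokenizer.from_pretrained("roberta-base")
+mlm = AutoModelForMaskedLM.from_pretrained("roberta-base")
+
+def assumption_score(context: str, event1: str, event2: str) -> float:
+    # Deterministic assumption: "event1 causes event2", expressed with two masks.
+    prompt = f"{context} In this text, {tok.mask_token} causes {tok.mask_token}."
+    inputs = tok(prompt, return_tensors="pt")
+    with torch.no_grad():
+        logits = mlm(**inputs).logits
+    mask_pos = (inputs.input_ids[0] == tok.mask_token_id).nonzero(as_tuple=True)[0]
+    probs = logits[0, mask_pos].softmax(-1)
+    e1_id = tok(" " + event1, add_special_tokens=False).input_ids[0]  # first subtoken only
+    e2_id = tok(" " + event2, add_special_tokens=False).input_ids[0]
+    # Rationality ~ probability mass placed on the actual event words in the two slots.
+    return (probs[0, e1_id] * probs[1, e2_id]).item()
+```
+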
+
+
+
+
+ + ☆ On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large + Language Models + + +
+ Since late 2022, Large Language Models (LLMs) have become very prominent with +LLMs like ChatGPT and Bard receiving millions of users. Hundreds of new LLMs +are announced each week, many of which are deposited to Hugging Face, a +repository of machine learning models and datasets. To date, nearly 16,000 Text +Generation models have been uploaded to the site. Given the huge influx of +LLMs, it is of interest to know which LLM backbones, settings, training +methods, and families are popular or trending. However, there is no +comprehensive index of LLMs available. We take advantage of the relatively +systematic nomenclature of Hugging Face LLMs to perform hierarchical clustering +and identify communities amongst LLMs using n-grams and term frequency-inverse +document frequency. Our methods successfully identify families of LLMs and +accurately cluster LLMs into meaningful subgroups. We present a public web +application to navigate and explore Constellation, our atlas of 15,821 LLMs. +Constellation rapidly generates a variety of visualizations, namely +dendrograms, graphs, word clouds, and scatter plots. Constellation is available +at the following link: https://constellation.sites.stanford.edu/. + +
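+
+ The nomenclature-based grouping described above can be illustrated with a few
+lines of scikit-learn; the model names below are made up, and the n-gram range
+and cluster count are arbitrary illustrative choices.
+
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import AgglomerativeClustering
+
+names = [
+    "llama-7b-chat", "llama-13b-chat", "alpaca-7b-lora",
+    "falcon-40b-instruct", "falcon-7b", "vicuna-13b-v1.3",
+]
+# Character n-gram TF-IDF captures shared naming conventions (family, size, variant).
+X = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4)).fit_transform(names)
+labels = AgglomerativeClustering(n_clusters=3).fit_predict(X.toarray())
+for name, label in sorted(zip(names, labels), key=lambda t: t[1]):
+    print(label, name)
+```
+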
+
+ comment: 14 pages, 6 figures, 1 table +
+
+
+
+
+ + ☆ ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization + Using Floating-Point Formats + + +
+ In the complex domain of large language models (LLMs), striking a balance +between computational efficiency and maintaining model quality is a formidable +challenge. Navigating the inherent limitations of uniform quantization, +particularly when dealing with outliers, and motivated by the launch of +NVIDIA's H100 hardware, this study delves into the viability of floating-point +(FP) quantization, particularly focusing on FP8 and FP4, as a potential +solution. Our comprehensive investigation reveals that for LLMs, FP8 activation +consistently outshines its integer (INT8) equivalent, with the performance edge +becoming more noticeable in models possessing parameters beyond one billion. +For weight quantization, our findings indicate that FP4 exhibits comparable, if +not superior, performance to INT4, simplifying deployment on FP-supported +hardware like H100. To mitigate the overhead from precision alignment caused by +the disparity between weights and activations, we propose two scaling +constraints for weight quantization that negligibly impact the performance +compared to the standard W4A8 model. We additionally enhance our quantization +methods by integrating the Low Rank Compensation (LoRC) strategy, yielding +improvements especially in smaller models. The results of our investigation +emphasize the immense potential of FP quantization for LLMs, paving the way for +high-efficiency deployment in resource-limited settings. + +
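+
+ The INT-versus-FP comparison can be sketched with a simple round-trip
+experiment on a weight tensor; the 4-bit float grid below is a toy non-uniform
+grid chosen for illustration, not the exact FP4 format studied in the paper.
+
+```python
+import torch
+
+def quant_int4(w):
+    scale = w.abs().max() / 7.0                      # symmetric 4-bit integers in [-7, 7]
+    return torch.round(w / scale).clamp(-7, 7) * scale
+
+def quant_fp4_like(w):
+    # Toy FP-style grid: non-uniform levels, denser near zero (illustrative only).
+    grid = torch.tensor([0.0, 0.0625, 0.125, 0.25, 0.5, 1.0, 2.0, 4.0])
+    grid = torch.cat([-grid.flip(0), grid])
+    scale = w.abs().max() / grid.max()
+    idx = (w.unsqueeze(-1) / scale - grid).abs().argmin(-1)
+    return grid[idx] * scale
+
+w = torch.randn(4096, 128) * 0.02                    # stand-in for a weight matrix
+for name, q in [("INT4", quant_int4), ("FP4-like", quant_fp4_like)]:
+    print(name, "mean abs error:", (q(w) - w).abs().mean().item())
+```
+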
+
+
+
+
+ + ☆ Enhancing conversational quality in language learning chatbots: An + evaluation of GPT4 for ASR error correction + + +
+ The integration of natural language processing (NLP) technologies into +educational applications has shown promising results, particularly in the +language learning domain. Recently, many spoken open-domain chatbots have been +used as speaking partners, helping language learners improve their language +skills. However, one of the significant challenges is the high word-error-rate +(WER) when recognizing non-native/non-fluent speech, which interrupts +conversation flow and leads to disappointment for learners. This paper explores +the use of GPT4 for ASR error correction in conversational settings. In +addition to WER, we propose to use semantic textual similarity (STS) and next +response sensibility (NRS) metrics to evaluate the impact of error correction +models on the quality of the conversation. We find that transcriptions +corrected by GPT4 lead to higher conversation quality, despite an increase in +WER. GPT4 also outperforms standard error correction methods without the need +for in-domain training data. + +
+
+
+
+
+ + ☆ RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap ACL + + +
+ Taxonomies are an essential knowledge representation, yet most studies on +automatic taxonomy construction (ATC) resort to manual evaluation to score +proposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just +as important as taxonomy construction. We propose RaTE, an automatic label-free +taxonomy scoring procedure, which relies on a large pre-trained language model. +We apply our evaluation procedure to three state-of-the-art ATC algorithms with +which we built seven taxonomies from the Yelp domain, and show that 1) RaTE +correlates well with human judgments and 2) artificially degrading a taxonomy +leads to decreasing RaTE score. + +
+
+ comment: 15th International Conference on Computational Semantics (IWCS), + Association for Computational Linguistics (ACL) +
+
+
+
+
+ + ☆ CValues: Measuring the Values of Chinese Large Language Models from + Safety to Responsibility + + +
+ With the rapid evolution of large language models (LLMs), there is a growing
+concern that they may pose risks or have negative social impacts. Therefore,
+evaluation of human values alignment is becoming increasingly important.
+Previous work mainly focuses on assessing the performance of LLMs on certain
+knowledge and reasoning abilities, while neglecting alignment with human
+values, especially in a Chinese context. In this paper, we present CValues, the
+first Chinese human values evaluation benchmark to measure the alignment
+ability of LLMs in terms of both safety and responsibility criteria. To this
+end, we manually collected adversarial safety prompts across 10 scenarios and
+responsibility prompts induced by professional experts from 8 domains. To
+provide a comprehensive values evaluation of Chinese LLMs, we not only conduct
+human evaluation for reliable comparison, but also construct multi-choice
+prompts for automatic evaluation. Our findings suggest that while most Chinese
+LLMs perform well in terms of safety, there is considerable room for
+improvement in terms of responsibility. Moreover, both automatic and human
+evaluation are important for assessing human values alignment in different
+aspects. The benchmark and code are available on ModelScope and GitHub.
+
+
+ comment: Work in progress
+
+
+
+
+ + ☆ Efficient Guided Generation for LLMs + + +
+ In this article we describe an efficient approach to guiding language model +text generation with regular expressions and context-free grammars. Our +approach adds little to no overhead to the token sequence generation process, +and makes guided generation feasible in practice. An implementation is provided +in the open source Python library Outlines. + +
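+
+ The underlying idea, independent of the library's actual implementation, is
+to mask out every next token that would violate the constraint before choosing
+the continuation. The sketch below uses a toy digits-only constraint in place
+of a full regular expression or grammar.
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+lm = AutoModelForCausalLM.from_pretrained("gpt2")
+
+# Precompute which vocabulary entries satisfy the constraint (digits only).
+allowed = [i for i in range(len(tok)) if tok.decode([i]).strip().isdigit()]
+mask = torch.full((len(tok),), float("-inf"))
+mask[allowed] = 0.0
+
+ids = tok("The answer is", return_tensors="pt").input_ids
+for _ in range(5):
+    logits = lm(ids).logits[0, -1]
+    next_id = (logits + mask).argmax()           # only constraint-satisfying tokens survive
+    ids = torch.cat([ids, next_id.view(1, 1)], dim=-1)
+print(tok.decode(ids[0]))
+```
+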
+
+
+
+
+ + ☆ Efficiency Pentathlon: A Standardized Arena for Efficiency Evaluation + + +
+ Rising computational demands of modern natural language processing (NLP)
+systems have increased the barrier to entry for cutting-edge research while
+posing serious environmental concerns. Yet, progress on model efficiency has
+been impeded by practical challenges in model evaluation and comparison. For
+example, hardware is challenging to control due to disparate levels of
+accessibility across different institutions. Moreover, improvements in metrics
+such as FLOPs often fail to translate to progress in real-world applications.
+In response, we introduce Pentathlon, a benchmark for holistic and realistic
+evaluation of model efficiency. Pentathlon focuses on inference, which accounts
+for a majority of the compute in a model's lifecycle. It offers a strictly
+controlled hardware platform and is designed to mirror real-world application
+scenarios. It incorporates a suite of metrics that target different aspects of
+efficiency, including latency, throughput, memory overhead, and energy
+consumption. Pentathlon also comes with a software library that can be
+seamlessly integrated into any codebase to enable evaluation. As a standardized
+and centralized evaluation platform, Pentathlon can drastically reduce the
+workload of making fair and reproducible efficiency comparisons. While
+initially focused on natural language processing (NLP) models, Pentathlon is
+designed to allow flexible extension to other fields. We envision that
+Pentathlon will stimulate algorithmic innovations in building efficient models,
+and foster an increased awareness of the social and environmental implications
+of developing future-generation NLP models.
+
+
+
+
+
+ + ☆ (Ab)using Images and Sounds for Indirect Instruction Injection in + Multi-Modal LLMs + + +
+ We demonstrate how images and sounds can be used for indirect prompt and +instruction injection in multi-modal LLMs. An attacker generates an adversarial +perturbation corresponding to the prompt and blends it into an image or audio +recording. When the user asks the (unmodified, benign) model about the +perturbed image or audio, the perturbation steers the model to output the +attacker-chosen text and/or make the subsequent dialog follow the attacker's +instruction. We illustrate this attack with several proof-of-concept examples +targeting LLaVa and PandaGPT. + +
+
+
+
+
+ + ☆ SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot + Neural Sparse Retrieval SIGIR 2023 + + +
+ Traditionally, sparse retrieval systems that rely on lexical representations,
+such as BM25, have dominated information retrieval tasks. With the onset of
+pre-trained transformer models such as BERT, neural sparse retrieval has led to
+a new paradigm within retrieval. Despite this success, there has been limited
+software supporting different sparse retrievers running in a unified, common
+environment. This hinders practitioners from fairly comparing different sparse
+models and obtaining realistic evaluation results. Another missing piece is
+that the majority of prior work evaluates sparse retrieval models on in-domain
+retrieval, i.e. on a single dataset: MS MARCO. However, practical retrieval
+systems require models that can generalize well to unseen out-of-domain, i.e.
+zero-shot, retrieval tasks. In this work, we provide SPRINT, a unified Python
+toolkit based on Pyserini and Lucene, supporting a common interface for
+evaluating neural sparse retrieval. The toolkit currently includes five
+built-in models: uniCOIL, DeepImpact, SPARTA, TILDEv2 and SPLADEv2. Users can
+also easily add customized models by defining their term weighting method.
+Using our toolkit, we establish strong and reproducible zero-shot sparse
+retrieval baselines across the well-acknowledged benchmark, BEIR. Our results
+demonstrate that SPLADEv2 achieves the best average score of 0.470 nDCG@10 on
+BEIR amongst all neural sparse retrievers. In this work, we further uncover the
+reasons behind its performance gain. We show that SPLADEv2 produces sparse
+representations in which the majority of tokens lie outside of the original
+query and document, which is often crucial for its performance gains and a
+limitation of its other sparse counterparts. We provide our SPRINT toolkit,
+models, and the data used in our experiments publicly at
+https://github.com/thakur-nandan/sprint.
+
+
+ comment: Accepted at SIGIR 2023 (Resource Track) +
+
+
+
+
+ + ☆ FinGPT: Democratizing Internet-scale Data for Financial Large Language + Models + + +
+ Large language models (LLMs) have demonstrated remarkable proficiency in
+understanding and generating human-like texts, which may potentially
+revolutionize the finance industry. However, existing LLMs often fall short in
+the financial field, which is mainly attributed to the disparities between
+general text data and financial text data. Unfortunately, only a limited number
+of financial text datasets are available, and they are quite small in size, and
+BloombergGPT, the first financial LLM (FinLLM), is closed-source (only the
+training logs were released). In light of this, we aim to democratize
+Internet-scale financial data for LLMs, which is an open challenge due to
+diverse data sources, low signal-to-noise ratio, and high time-validity. To
+address the challenges, we introduce an open-sourced and data-centric
+framework, \textit{Financial Generative Pre-trained Transformer (FinGPT)}, that
+automates the collection and curation of real-time financial data from >34
+diverse sources on the Internet, providing researchers and practitioners with
+accessible and transparent resources to develop their FinLLMs. Additionally, we
+propose a simple yet effective strategy for fine-tuning FinLLMs using the
+inherent feedback from the market, dubbed Reinforcement Learning with Stock
+Prices (RLSP). We also adopt the Low-rank Adaptation (LoRA, QLoRA) method that
+enables users to customize their own FinLLMs from open-source general-purpose
+LLMs at a low cost. Finally, we showcase several FinGPT applications, including
+a robo-advisor, sentiment analysis for algorithmic trading, and low-code
+development. FinGPT aims to democratize FinLLMs, stimulate innovation, and
+unlock new opportunities in open finance. The code is available at
+https://github.com/AI4Finance-Foundation/FinGPT and
+https://github.com/AI4Finance-Foundation/FinNLP
+
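+
+ Low-cost customization via LoRA typically looks like the sketch below with
+the peft library; the base checkpoint and target module names are placeholders
+that depend on the chosen architecture, and QLoRA would additionally load the
+base model in 4-bit precision.
+
+```python
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder base LLM
+config = LoraConfig(
+    r=8, lora_alpha=16, lora_dropout=0.05,
+    target_modules=["q_proj", "v_proj"],   # depends on the base architecture
+    task_type="CAUSAL_LM",
+)
+model = get_peft_model(base, config)
+model.print_trainable_parameters()         # only the small adapter matrices are trained
+```
+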
+
+ comment: 43 pages, 9 tables, and 3 figures +
+
+
+
+
+ + ☆ What can we learn from Data Leakage and Unlearning for Law? ICML'23 + + +
+ Large Language Models (LLMs) have a privacy concern because they memorize +training data (including personally identifiable information (PII) like emails +and phone numbers) and leak it during inference. A company can train an LLM on +its domain-customized data which can potentially also include their users' PII. +In order to comply with privacy laws such as the "right to be forgotten", the +data points of users that are most vulnerable to extraction could be deleted. +We find that once the most vulnerable points are deleted, a new set of points +become vulnerable to extraction. So far, little attention has been given to +understanding memorization for fine-tuned models. In this work, we also show +that not only do fine-tuned models leak their training data but they also leak +the pre-training data (and PII) memorized during the pre-training phase. The +property of new data points becoming vulnerable to extraction after unlearning +and leakage of pre-training data through fine-tuned models can pose significant +privacy and legal concerns for companies that use LLMs to offer services. We +hope this work will start an interdisciplinary discussion within AI and law +communities regarding the need for policies to tackle these issues. + +
+
+ comment: 5 pages, 8 figures, accepted to the first GenLaw workshop at ICML'23, + Hawai'i +
+
+
+
+
+ + ☆ Findings of Factify 2: Multimodal Fake News Detection AAAI 2023 + + +
+ With social media usage growing exponentially in the past few years, fake +news has also become extremely prevalent. The detrimental impact of fake news +emphasizes the need for research focused on automating the detection of false +information and verifying its accuracy. In this work, we present the outcome of +the Factify 2 shared task, which provides a multi-modal fact verification and +satire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data +calls for a comparison based approach to the task by pairing social media +claims with supporting documents, with both text and image, divided into 5 +classes based on multi-modal relations. In the second iteration of this task we +had over 60 participants and 9 final test-set submissions. The best +performances came from the use of DeBERTa for text and Swinv2 and CLIP for +image. The highest F1 score averaged for all five classes was 81.82%. + +
+
+ comment: Defactify2 @AAAI 2023 +
+
+
+
+
+ + ☆ Can Instruction Fine-Tuned Language Models Identify Social Bias through + Prompting? + + +
+ As the breadth and depth of language model applications continue to expand +rapidly, it is increasingly important to build efficient frameworks for +measuring and mitigating the learned or inherited social biases of these +models. In this paper, we present our work on evaluating instruction fine-tuned +language models' ability to identify bias through zero-shot prompting, +including Chain-of-Thought (CoT) prompts. Across LLaMA and its two instruction +fine-tuned versions, Alpaca 7B performs best on the bias identification task +with an accuracy of 56.7%. We also demonstrate that scaling up LLM size and +data diversity could lead to further performance gain. This is a +work-in-progress presenting the first component of our bias mitigation +framework. We will keep updating this work as we get more results. + +
+
+
+
+
+ + ☆ Improving Pre-trained Language Models' Generalization + + +
+ The reusability of state-of-the-art Pre-trained Language Models (PLMs) is +often limited by their generalization problem, where their performance +drastically decreases when evaluated on examples that differ from the training +dataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation +arises from PLMs' reliance on spurious correlations, which work well for +frequent example types but not for general examples. To address this issue, we +propose a training approach called Mask-tuning, which integrates Masked +Language Modeling (MLM) training objectives into the fine-tuning process to +enhance PLMs' generalization. Comprehensive experiments demonstrate that +Mask-tuning surpasses current state-of-the-art techniques and enhances PLMs' +generalization on OOD datasets while improving their performance on +in-distribution datasets. The findings suggest that Mask-tuning improves the +reusability of PLMs on unseen data, making them more practical and effective +for real-world applications. + +
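+
+ Conceptually, the approach adds an auxiliary masked language modeling term to
+the fine-tuning loss over a shared encoder. The sketch below shows one way to
+compute such a joint objective; the 15% masking rate and the loss weight are
+common defaults, not necessarily the paper's exact settings.
+
+```python
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+classifier = nn.Linear(mlm.config.hidden_size, 2)      # downstream task head
+
+def mask_tuning_loss(texts, labels, mlm_weight=0.5):
+    enc = tok(texts, return_tensors="pt", padding=True, truncation=True)
+    # Randomly mask 15% of non-special tokens for the auxiliary MLM loss.
+    mlm_labels = enc.input_ids.clone()
+    maskable = ~torch.isin(enc.input_ids, torch.tensor(tok.all_special_ids))
+    masked = maskable & (torch.rand_like(enc.input_ids, dtype=torch.float) < 0.15)
+    mlm_labels[~masked] = -100                          # ignore unmasked positions
+    inputs = enc.input_ids.masked_fill(masked, tok.mask_token_id)
+
+    out = mlm(input_ids=inputs, attention_mask=enc.attention_mask,
+              labels=mlm_labels, output_hidden_states=True)
+    cls_repr = out.hidden_states[-1][:, 0]              # [CLS] representation
+    clf_loss = nn.functional.cross_entropy(classifier(cls_repr), torch.tensor(labels))
+    return clf_loss + mlm_weight * out.loss             # joint objective
+```
+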
+
+
+
+
+ + ☆ Integrating a Heterogeneous Graph with Entity-aware Self-attention using + Relative Position Labels for Reading Comprehension Model + + +
+ Despite the significant progress made by transformer models in machine +reading comprehension tasks, they still face limitations in handling complex +reasoning tasks due to the absence of explicit knowledge in the input sequence. +This paper proposes a novel attention pattern to overcome this limitation, +which integrates reasoning knowledge derived from a heterogeneous graph into +the transformer architecture using a graph-enhanced self-attention mechanism. +The proposed attention pattern comprises three key elements: global-local +attention for word tokens, graph attention for entity tokens that exhibit +strong attention towards tokens connected in the graph as opposed to those +unconnected, and the consideration of the type of relationship between each +entity token and word token. This results in optimized attention between the +two if a relationship exists. The pattern is coupled with special relative +position labels, allowing it to integrate with LUKE's entity-aware +self-attention mechanism. The experimental findings corroborate that our model +outperforms both the cutting-edge LUKE-Graph and the baseline LUKE model on the +ReCoRD dataset that focuses on commonsense reasoning. + +
+
+ comment: submitted for Knowledge-Based Systems Journal +
+
+
+
+
+ + ☆ Thrust: Adaptively Propels Large Language Models with External Knowledge + + +
+ Although large-scale pre-trained language models (PTLMs) are shown to encode +rich knowledge in their model parameters, the inherent knowledge in PTLMs can +be opaque or static, making external knowledge necessary. However, the existing +information retrieval techniques could be costly and may even introduce noisy +and sometimes misleading knowledge. To address these challenges, we propose the +instance-level adaptive propulsion of external knowledge (IAPEK), where we only +conduct the retrieval when necessary. To achieve this goal, we propose +measuring whether a PTLM contains enough knowledge to solve an instance with a +novel metric, Thrust, which leverages the representation distribution of a +small number of seen instances. Extensive experiments demonstrate that thrust +is a good measurement of PTLM models' instance-level knowledgeability. +Moreover, we can achieve significantly higher cost-efficiency with the Thrust +score as the retrieval indicator than the naive usage of external knowledge on +88% of the evaluated tasks with 26% average performance improvement. Such +findings shed light on the real-world practice of knowledge-enhanced LMs with a +limited knowledge-seeking budget due to computation latency or costs. + +
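+
+ The gating idea can be sketched as follows: cluster the representations of a
+few seen instances, score a new query by its distance to the nearest cluster,
+and call retrieval only when the score falls below a threshold. The scoring
+function here is a simplified stand-in for the paper's Thrust metric.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def fit_clusters(seen_embeddings, k=8):
+    return KMeans(n_clusters=k, n_init=10).fit(seen_embeddings)
+
+def knowledgeability(query_emb, kmeans):
+    d = np.linalg.norm(kmeans.cluster_centers_ - query_emb, axis=1)
+    return 1.0 / (d.min() + 1e-6)          # closer to seen instances -> higher score
+
+def answer(query_emb, kmeans, threshold, answer_fn, retrieve_fn):
+    if knowledgeability(query_emb, kmeans) >= threshold:
+        return answer_fn(query_emb)                        # the PTLM alone suffices
+    return answer_fn(query_emb, retrieve_fn(query_emb))    # augment with external knowledge
+```
+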
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ PharmacyGPT: The AI Pharmacist + + +
+ In this study, we introduce PharmacyGPT, a novel framework to assess the +capabilities of large language models (LLMs) such as ChatGPT and GPT-4 in +emulating the role of clinical pharmacists. Our methodology encompasses the +utilization of LLMs to generate comprehensible patient clusters, formulate +medication plans, and forecast patient outcomes. We conduct our investigation +using real data acquired from the intensive care unit (ICU) at the University +of North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable +insights into the potential applications and limitations of LLMs in the field +of clinical pharmacy, with implications for both patient care and the +development of future AI-driven healthcare solutions. By evaluating the +performance of PharmacyGPT, we aim to contribute to the ongoing discourse +surrounding the integration of artificial intelligence in healthcare settings, +ultimately promoting the responsible and efficacious use of such technologies. + +
+
+
+
+
+ + ☆ IncDSI: Incrementally Updatable Document Retrieval + + +
+ Differentiable Search Index is a recently proposed paradigm for document +retrieval, that encodes information about a corpus of documents within the +parameters of a neural network and directly maps queries to corresponding +documents. These models have achieved state-of-the-art performances for +document retrieval across many benchmarks. These kinds of models have a +significant limitation: it is not easy to add new documents after a model is +trained. We propose IncDSI, a method to add documents in real time (about +20-50ms per document), without retraining the model on the entire dataset (or +even parts thereof). Instead we formulate the addition of documents as a +constrained optimization problem that makes minimal changes to the network +parameters. Although orders of magnitude faster, our approach is competitive +with re-training the model on the whole dataset and enables the development of +document retrieval systems that can be updated with new information in +real-time. Our code for IncDSI is available at +https://github.com/varshakishore/IncDSI. + +
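+
+ In spirit, adding a document reduces to a small constrained optimization: fit
+a new document vector so that it wins its query by a margin while the rest of
+the index stays untouched. The margin, learning rate, and hinge formulation
+below are illustrative choices, not the paper's exact objective.
+
+```python
+import torch
+
+def add_document(doc_matrix, query_emb, margin=0.1, steps=100, lr=0.1):
+    """doc_matrix: (num_docs, dim) frozen document vectors; query_emb: (dim,)."""
+    new_doc = query_emb.clone().requires_grad_(True)       # warm-start from the query
+    opt = torch.optim.Adam([new_doc], lr=lr)
+    for _ in range(steps):
+        scores_old = doc_matrix @ query_emb                # existing documents, unchanged
+        score_new = new_doc @ query_emb
+        # Hinge loss: the new document should beat every old one by a margin.
+        loss = torch.relu(margin + scores_old - score_new).sum()
+        opt.zero_grad(); loss.backward(); opt.step()
+        if loss.item() == 0.0:
+            break
+    return torch.cat([doc_matrix, new_doc.detach().unsqueeze(0)])
+```
+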
+
+
+
+
+ + ☆ Mood Classification of Bangla Songs Based on Lyrics + + +
+ Music can evoke various emotions, and with the advancement of technology, it
+has become more accessible to people. Bangla music, which portrays different
+human emotions, lacks sufficient research. The authors of this article aim to
+analyze Bangla songs and classify their moods based on the lyrics. To achieve
+this, the research compiles a dataset of 4000 Bangla song lyrics with their
+genres, and uses Natural Language Processing and the BERT algorithm to analyze
+the data. Among the 4000 songs, 1513 represent the sad mood, 1362 the romantic
+mood, 886 happiness, and the remaining 239 are classified as relaxation. By
+embedding the lyrics of the songs, the authors have classified the songs into
+four moods: Happy, Sad, Romantic, and Relaxed. This research is crucial as it
+enables a multi-class classification of songs' moods, making the music more
+relatable to people's emotions. The article presents the automated
+classification of the four moods accurately derived from the song lyrics.
+
+
+ comment: Presented at the International Conference on Inventive Communication
+ and Computational Technologies 2023
+
+
+
+
+ + ♻ ☆ Llama 2: Open Foundation and Fine-Tuned Chat Models + + +
+ In this work, we develop and release Llama 2, a collection of pretrained and +fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 +billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for +dialogue use cases. Our models outperform open-source chat models on most +benchmarks we tested, and based on our human evaluations for helpfulness and +safety, may be a suitable substitute for closed-source models. We provide a +detailed description of our approach to fine-tuning and safety improvements of +Llama 2-Chat in order to enable the community to build on our work and +contribute to the responsible development of LLMs. + +
+
+
+
+
+ + ♻ ☆ ThoughtSource: A central hub for large language model reasoning data + + +
+ Large language models (LLMs) such as GPT-4 have recently demonstrated +impressive results across a wide range of tasks. LLMs are still limited, +however, in that they frequently fail at complex reasoning, their reasoning +processes are opaque, they are prone to 'hallucinate' facts, and there are +concerns about their underlying biases. Letting models verbalize reasoning +steps as natural language, a technique known as chain-of-thought prompting, has +recently been proposed as a way to address some of these issues. Here we +present ThoughtSource, a meta-dataset and software library for chain-of-thought +(CoT) reasoning. The goal of ThoughtSource is to improve future artificial +intelligence systems by facilitating qualitative understanding of CoTs, +enabling empirical evaluations, and providing training data. This first release +of ThoughtSource integrates six scientific/medical, three general-domain and +five math word question answering datasets. + +
+
+ comment: Revision: added datasets, minor restructuring +
+
+
+
+
+ + ♻ ☆ A comparative analysis of SRGAN models + + +
+ In this study, we evaluate the performance of multiple state-of-the-art SRGAN
+(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN
+and EDSR, on a benchmark dataset of real-world images which undergo degradation
+using a pipeline. Our results show that some models seem to significantly
+increase the resolution of the input images while preserving their visual
+quality; this is assessed using the Tesseract OCR engine. We observe that the
+EDSR-BASE model from Hugging Face outperforms the remaining candidate models in
+terms of both quantitative metrics and subjective visual quality assessments
+with the least compute overhead. Specifically, EDSR generates images with
+higher peak signal-to-noise ratio (PSNR) and structural similarity index (SSIM)
+values and is seen to return high-quality OCR results with the Tesseract OCR
+engine. These findings suggest that EDSR is a robust and effective approach for
+single-image super-resolution and may be particularly well-suited for
+applications where high-quality visual fidelity and optimized compute are
+critical.
+
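+
+ The quantitative side of such a comparison is typically computed as below
+(scikit-image metrics; the file names are placeholders).
+
+```python
+from skimage import io, img_as_float
+from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+
+ref = img_as_float(io.imread("ground_truth.png"))       # placeholder paths
+sr = img_as_float(io.imread("super_resolved.png"))
+
+psnr = peak_signal_noise_ratio(ref, sr, data_range=1.0)
+ssim = structural_similarity(ref, sr, channel_axis=-1, data_range=1.0)
+print(f"PSNR: {psnr:.2f} dB, SSIM: {ssim:.4f}")
+```
+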
+
+ comment: 9 pages, 6 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ ChatGPT Outperforms Crowd-Workers for Text-Annotation Tasks + + +
+ Many NLP applications require manual data annotations for a variety of tasks, +notably to train classifiers or evaluate the performance of unsupervised +models. Depending on the size and degree of complexity, the tasks may be +conducted by crowd-workers on platforms such as MTurk as well as trained +annotators, such as research assistants. Using a sample of 2,382 tweets, we +demonstrate that ChatGPT outperforms crowd-workers for several annotation +tasks, including relevance, stance, topics, and frames detection. Specifically, +the zero-shot accuracy of ChatGPT exceeds that of crowd-workers for four out of +five tasks, while ChatGPT's intercoder agreement exceeds that of both +crowd-workers and trained annotators for all tasks. Moreover, the +per-annotation cost of ChatGPT is less than $0.003 -- about twenty times +cheaper than MTurk. These results show the potential of large language models +to drastically increase the efficiency of text classification. + +
+
+ comment: Gilardi, Fabrizio, Meysam Alizadeh, and Ma\"el Kubli. 2023. "ChatGPT + Outperforms Crowd Workers for Text-Annotation Tasks". Proceedings of the + National Academy of Sciences 120(30): e2305016120 +
+
+
+
+
+ + ♻ ☆ Revisiting Softmax for Uncertainty Approximation in Text Classification + + +
+ Uncertainty approximation in text classification is an important area with +applications in domain adaptation and interpretability. One of the most widely +used uncertainty approximation methods is Monte Carlo (MC) Dropout, which is +computationally expensive as it requires multiple forward passes through the +model. A cheaper alternative is to simply use the softmax based on a single +forward pass without dropout to estimate model uncertainty. However, prior work +has indicated that these predictions tend to be overconfident. In this paper, +we perform a thorough empirical analysis of these methods on five datasets with +two base neural architectures in order to identify the trade-offs between the +two. We compare both softmax and an efficient version of MC Dropout on their +uncertainty approximations and downstream text classification performance, +while weighing their runtime (cost) against performance (benefit). We find +that, while MC dropout produces the best uncertainty approximations, using a +simple softmax leads to competitive and in some cases better uncertainty +estimation for text classification at a much lower computational cost, +suggesting that softmax can in fact be a sufficient uncertainty estimate when +computational resources are a concern. + +
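+
+ The two estimators being compared can be written down in a few lines for any
+classifier with dropout layers; the number of MC samples below is a typical
+choice, not a prescription from the paper.
+
+```python
+import torch
+
+def softmax_entropy(model, x):
+    model.eval()                              # single deterministic forward pass
+    with torch.no_grad():
+        p = model(x).softmax(-1)
+    return -(p * p.clamp_min(1e-12).log()).sum(-1)
+
+def mc_dropout_entropy(model, x, n_samples=20):
+    model.train()                             # keep dropout active at inference time
+    with torch.no_grad():
+        p = torch.stack([model(x).softmax(-1) for _ in range(n_samples)]).mean(0)
+    return -(p * p.clamp_min(1e-12).log()).sum(-1)
+```
+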
+
+
+
+
+ + ♻ ☆ LongNet: Scaling Transformers to 1,000,000,000 Tokens + + +
+ Scaling sequence length has become a critical demand in the era of large
+language models. However, existing methods struggle with either computational
+complexity or model expressivity, rendering the maximum sequence length
+restricted. To address this issue, we introduce LongNet, a Transformer variant
+that can scale sequence length to more than 1 billion tokens, without
+sacrificing the performance on shorter sequences. Specifically, we propose
+dilated attention, which expands the attentive field exponentially as the
+distance grows. LongNet has significant advantages: 1) it has a linear
+computation complexity and a logarithm dependency between any two tokens in a
+sequence; 2) it can serve as a distributed trainer for extremely long
+sequences; 3) its dilated attention is a drop-in replacement for standard
+attention, which can be seamlessly integrated with existing Transformer-based
+optimization. Experimental results demonstrate that LongNet yields strong
+performance on both long-sequence modeling and general language tasks. Our work
+opens up new possibilities for modeling very long sequences, e.g., treating a
+whole corpus or even the entire Internet as a sequence.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation + Evaluation + + +
+ Research in Image Generation has recently made significant progress, +particularly boosted by the introduction of Vision-Language models which are +able to produce high-quality visual content based on textual inputs. Despite +ongoing advancements in terms of generation quality and realism, no methodical +frameworks have been defined yet to quantitatively measure the quality of the +generated content and the adherence with the prompted requests: so far, only +human-based evaluations have been adopted for quality satisfaction and for +comparing different generative methods. We introduce a novel automated method +for Visual Concept Evaluation (ViCE), i.e. to assess consistency between a +generated/edited image and the corresponding prompt/instructions, with a +process inspired by the human cognitive behaviour. ViCE combines the strengths +of Large Language Models (LLMs) and Visual Question Answering (VQA) into a +unified pipeline, aiming to replicate the human cognitive process in quality +assessment. This method outlines visual concepts, formulates image-specific +verification questions, utilizes the Q&A system to investigate the image, and +scores the combined outcome. Although this brave new hypothesis of mimicking +humans in the image evaluation process is in its preliminary assessment stage, +results are promising and open the door to a new form of automatic evaluation +which could have significant impact as the image generation or the image target +editing tasks become more and more sophisticated. + +
+
+ comment: Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track) +
+
+
+
+
+ + ♻ ☆ Can In-context Learners Learn a Reasoning Concept from Demonstrations? ACL 2023 + + +
+ Language models exhibit an emergent ability to learn a new task from a small +number of input-output demonstrations. However, recent work shows that +in-context learners largely rely on their pre-trained knowledge, such as the +sentiment of the labels, instead of learning new associations from the input. +We argue that the commonly-used few-shot evaluation using a random selection of +in-context demonstrations can not disentangle models' reliance on such biases, +as most of the randomly-selected demonstrations do not present relations +informative for prediction beyond exposing the task's input-output +distribution. + Therefore, to evaluate models' in-context learning ability independent of +models' memory, we introduce a Concept-sharing few-shot learning method +choosing the demonstrations that share an underlying concept with the predicted +sample. We extract a set of such concepts from available human explanations and +measure how much models can benefit from presenting these concepts in few-shot +demonstrations. + We find that most of the recent in-context learners can not consistently +benefit from the demonstrated concepts, irrespective of the model size. +However, we note that T0 models are more sensitive to exhibited concepts, +benefiting from concept-sharing demonstrations in 7 out of 8 evaluation +scenarios. + +
+
+ comment: Awarded Best Paper at ACL 2023 Natural Language Reasoning and + Structured Explanations (NLRSE) workshop +
+
+
+
+
+ + ♻ ☆ Retentive Network: A Successor to Transformer for Large Language Models + + +
+ In this work, we propose Retentive Network (RetNet) as a foundation +architecture for large language models, simultaneously achieving training +parallelism, low-cost inference, and good performance. We theoretically derive +the connection between recurrence and attention. Then we propose the retention +mechanism for sequence modeling, which supports three computation paradigms, +i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel +representation allows for training parallelism. The recurrent representation +enables low-cost $O(1)$ inference, which improves decoding throughput, latency, +and GPU memory without sacrificing performance. The chunkwise recurrent +representation facilitates efficient long-sequence modeling with linear +complexity, where each chunk is encoded parallelly while recurrently +summarizing the chunks. Experimental results on language modeling show that +RetNet achieves favorable scaling results, parallel training, low-cost +deployment, and efficient inference. The intriguing properties make RetNet a +strong successor to Transformer for large language models. Code will be +available at https://aka.ms/retnet. + +
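+
+ A rough sketch of the recurrent form of retention as commonly presented: a
+decaying state accumulates key-value outer products, so each new token costs
+O(1) regardless of sequence length. The single head, fixed decay value, and
+missing normalization below are simplifications of the full mechanism.
+
+```python
+import torch
+
+def recurrent_retention(q, k, v, gamma=0.9):
+    """q, k, v: (seq_len, d). Returns outputs of shape (seq_len, d)."""
+    d = q.shape[-1]
+    state = torch.zeros(d, d)
+    outs = []
+    for t in range(q.shape[0]):
+        state = gamma * state + k[t].unsqueeze(1) * v[t].unsqueeze(0)  # decayed outer-product update
+        outs.append(q[t] @ state)                                      # O(1) readout per token
+    return torch.stack(outs)
+```
+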
+
+
+
+
+ + ♻ ☆ Lego-MT: Learning Detachable Models for Massively Multilingual Machine + Translation ACL 2023 + + +
+ Multilingual neural machine translation (MNMT) aims to build a unified model +for many language directions. Existing monolithic models for MNMT encounter two +challenges: parameter interference among languages and inefficient inference +for large models. In this paper, we revisit the classic multi-way structures +and develop a detachable model by assigning each language (or group of +languages) to an individual branch that supports plug-and-play training and +inference. To address the needs of learning representations for all languages +in a unified space, we propose a novel efficient training recipe, upon which we +build an effective detachable model, Lego-MT. For a fair comparison, we collect +data from OPUS and build a translation benchmark covering 433 languages and +1.3B parallel data. Experiments show that Lego-MT with 1.2B parameters brings +an average gain of 3.2 spBLEU. It even outperforms M2M-100 with 12B parameters. +The proposed training recipe brings a 28.2$\times$ speedup over the +conventional multi-way training method.\footnote{ +\url{https://github.com/CONE-MT/Lego-MT}.} + +
+
+ comment: ACL 2023 Findings +
+
+
+
+
+ + ♻ ☆ Understand Legal Documents with Contextualized Large Language Models SemEval 2023 + + +
+ The growth of pending legal cases in populous countries, such as India, has +become a major issue. Developing effective techniques to process and understand +legal documents is extremely useful in resolving this problem. In this paper, +we present our systems for SemEval-2023 Task 6: understanding legal texts (Modi +et al., 2023). Specifically, we first develop the Legal-BERT-HSLN model that +considers the comprehensive context information in both intra- and +inter-sentence levels to predict rhetorical roles (subtask A) and then train a +Legal-LUKE model, which is legal-contextualized and entity-aware, to recognize +legal entities (subtask B). Our evaluations demonstrate that our designed +models are more accurate than baselines, e.g., with an up to 15.0% better F1 +score in subtask B. We achieved notable performance in the task leaderboard, +e.g., 0.834 micro F1 score, and ranked No.5 out of 27 teams in subtask A. + +
+
+ comment: SemEval 2023 +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Speech Emotion Recognition + + +
+ Contrastive learning based cross-modality pretraining methods have recently +exhibited impressive success in diverse fields. In this paper, we propose +GEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio +pretraining (CLAP) method for speech emotion recognition. Specifically, a novel +emotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised +pre-trained models. Second, considering the importance of gender attribute in +speech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and +multi-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to +integrate the emotion and gender information of speech signals, forming more +reasonable objectives. Extensive experiments on IEMOCAP show that our proposed +two GEmo-CLAP models consistently outperform the baseline Emo-CLAP with +different pre-trained models, while also achieving the best recognition +performance compared with recent state-of-the-art methods. Noticeably, the +proposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\% and WAR of +82.06\%. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Pseudo Outlier Exposure for Out-of-Distribution Detection using + Pretrained Transformers + + +
+ For real-world language applications, detecting an out-of-distribution (OOD) +sample is helpful to alert users or reject such unreliable samples. However, +modern over-parameterized language models often produce overconfident +predictions for both in-distribution (ID) and OOD samples. In particular, +language models suffer from OOD samples with a similar semantic representation +to ID samples since these OOD samples lie near the ID manifold. A rejection +network can be trained with ID and diverse outlier samples to detect test OOD +samples, but explicitly collecting auxiliary OOD datasets brings an additional +burden for data collection. In this paper, we propose a simple but effective +method called Pseudo Outlier Exposure (POE) that constructs a surrogate OOD +dataset by sequentially masking tokens related to ID classes. The surrogate OOD +sample introduced by POE shows a similar representation to ID data, which is +most effective in training a rejection network. Our method does not require any +external OOD data and can be easily implemented within off-the-shelf +Transformers. A comprehensive comparison with state-of-the-art algorithms +demonstrates POE's competitiveness on several text classification benchmarks. + +
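+
+ One simple way to realize the "mask tokens related to ID classes" idea is to
+rank tokens by how strongly they indicate in-distribution content (here via
+TF-IDF fit on ID texts) and replace the top-ranked ones with the mask token.
+This is an illustrative approximation, not the exact POE construction.
+
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+def make_surrogate_ood(texts, mask_token="[MASK]", top_k=3):
+    vec = TfidfVectorizer()
+    X = vec.fit_transform(texts)
+    vocab = vec.get_feature_names_out()
+    surrogate = []
+    for i, text in enumerate(texts):
+        row = X[i].toarray().ravel()
+        top = {vocab[j] for j in row.argsort()[::-1][:top_k] if row[j] > 0}
+        surrogate.append(" ".join(mask_token if w.lower() in top else w
+                                  for w in text.split()))
+    return surrogate
+
+print(make_surrogate_ood(["the movie was wonderful and the acting was moving"]))
+```
+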
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Distilling Large Vision-Language Model with Out-of-Distribution + Generalizability ICCV + + +
+ Large vision-language models have achieved outstanding performance, but their +size and computational requirements make their deployment on +resource-constrained devices and time-sensitive tasks impractical. Model +distillation, the process of creating smaller, faster models that maintain the +performance of larger models, is a promising direction towards the solution. +This paper investigates the distillation of visual representations in large +teacher vision-language models into lightweight student models using a small- +or mid-scale dataset. Notably, this study focuses on open-vocabulary +out-of-distribution (OOD) generalization, a challenging problem that has been +overlooked in previous model distillation literature. We propose two principles +from vision and language modality perspectives to enhance student's OOD +generalization: (1) by better imitating teacher's visual representation space, +and carefully promoting better coherence in vision-language alignment with the +teacher; (2) by enriching the teacher's language representations with +informative and finegrained semantic attributes to effectively distinguish +between different labels. We propose several metrics and conduct extensive +experiments to investigate their techniques. The results demonstrate +significant improvements in zero-shot and few-shot student performance on +open-vocabulary out-of-distribution classification, highlighting the +effectiveness of our proposed approaches. Code released at +https://github.com/xuanlinli17/large_vlm_distillation_ood + +
+
+ comment: Published at International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ ChatGPT is Good but Bing Chat is Better for Vietnamese Students + + +
+ This study examines the efficacy of two SOTA large language models (LLMs),
+namely ChatGPT and Microsoft Bing Chat (BingChat), in catering to the needs of
+Vietnamese students. Although ChatGPT exhibits proficiency in multiple
+disciplines, Bing Chat emerges as the more advantageous option. We conduct a
+comparative analysis of their academic achievements in various disciplines,
+encompassing mathematics, literature, English language, physics, chemistry,
+biology, history, geography, and civic education. The results of our study
+suggest that BingChat demonstrates superior performance compared to ChatGPT
+across a wide range of subjects, with the exception of literature, where
+ChatGPT exhibits better performance. Additionally, BingChat utilizes the more
+advanced GPT-4 technology in contrast to ChatGPT, which is built upon GPT-3.5.
+This allows BingChat to improve the comprehension, reasoning and generation of
+creative and informative text. Moreover, the fact that BingChat is accessible
+in Vietnam and its integration of hyperlinks and citations within responses
+serve to reinforce its superiority. In our analysis, it is evident that while
+ChatGPT exhibits praiseworthy qualities, BingChat presents a better-adapted
+solution for Vietnamese students.
+
+
+ comment: 13 pages; 6 figures +
+
+
+
+
+ + ♻ ☆ Execution-based Code Generation using Deep Reinforcement Learning + + +
+ The utilization of programming language (PL) models, pre-trained on +large-scale code corpora, as a means of automating software engineering +processes has demonstrated considerable potential in streamlining various code +generation tasks such as code completion, code translation, and program +synthesis. However, current approaches mainly rely on supervised fine-tuning +objectives borrowed from text generation, neglecting unique sequence-level +characteristics of code, including but not limited to compilability as well as +syntactic and functional correctness. To address this limitation, we propose +PPOCoder, a new framework for code generation that synergistically combines +pre-trained PL models with Proximal Policy Optimization (PPO) which is a widely +used deep reinforcement learning technique. By utilizing non-differentiable +feedback from code execution and structure alignment, PPOCoder seamlessly +integrates external code-specific knowledge into the model optimization +process. It's important to note that PPOCoder is a task-agnostic and +model-agnostic framework that can be used across different code generation +tasks and PLs. Extensive experiments on three code generation tasks demonstrate +the effectiveness of our proposed approach compared to SOTA methods, achieving +significant improvements in compilation success rates and functional +correctness across different PLs. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR), 2023 +
+
+
+
+
+ + ♻ ☆ Fairness in AI and Its Long-Term Implications on Society + + +
+ Successful deployment of artificial intelligence (AI) in various settings has +led to numerous positive outcomes for individuals and society. However, AI +systems have also been shown to harm parts of the population due to biased +predictions. AI fairness focuses on mitigating such biases to ensure AI +decision making is not discriminatory towards certain groups. We take a closer +look at AI fairness and analyze how lack of AI fairness can lead to deepening +of biases over time and act as a social stressor. More specifically, we discuss +how biased models can lead to more negative real-world outcomes for certain +groups, which may then become more prevalent by deploying new AI models trained +on increasingly biased data, resulting in a feedback loop. If the issues +persist, they could be reinforced by interactions with other risks and have +severe implications on society in the form of social unrest. We examine current +strategies for improving AI fairness, assess their limitations in terms of +real-world deployment, and explore potential paths forward to ensure we reap +AI's benefits without causing society's collapse. + +
+
+ comment: Stanford Existential Risks Conference 2023 +
+
+
+
+
+ + ♻ ☆ ChatGPT for Robotics: Design Principles and Model Abilities + + +
+ This paper presents an experimental study regarding the use of OpenAI's +ChatGPT for robotics applications. We outline a strategy that combines design +principles for prompt engineering and the creation of a high-level function +library which allows ChatGPT to adapt to different robotics tasks, simulators, +and form factors. We focus our evaluations on the effectiveness of different +prompt engineering techniques and dialog strategies towards the execution of +various types of robotics tasks. We explore ChatGPT's ability to use free-form +dialog, parse XML tags, and to synthesize code, in addition to the use of +task-specific prompting functions and closed-loop reasoning through dialogues. +Our study encompasses a range of tasks within the robotics domain, from basic +logical, geometrical, and mathematical reasoning all the way to complex domains +such as aerial navigation, manipulation, and embodied agents. We show that +ChatGPT can be effective at solving several of such tasks, while allowing users +to interact with it primarily via natural language instructions. In addition to +these studies, we introduce an open-sourced research tool called PromptCraft, +which contains a platform where researchers can collaboratively upload and vote +on examples of good prompting schemes for robotics applications, as well as a +sample robotics simulator with ChatGPT integration, making it easier for users +to get started with using ChatGPT for robotics. + +
+
+
+
+
+ + ♻ ☆ Improving Text Matching in E-Commerce Search with A Rationalizable, + Intervenable and Fast Entity-Based Relevance Model + + +
+ Discovering the intended items of user queries from a massive repository of +items is one of the main goals of an e-commerce search system. Relevance +prediction is essential to the search system since it helps improve +performance. When online serving a relevance model, the model is required to +perform fast and accurate inference. Currently, the widely used models such as +Bi-encoder and Cross-encoder have their limitations in accuracy or inference +speed respectively. In this work, we propose a novel model called the +Entity-Based Relevance Model (EBRM). We identify the entities contained in an +item and decompose the QI (query-item) relevance problem into multiple QE +(query-entity) relevance problems; we then aggregate their results to form the +QI prediction using a soft logic formulation. The decomposition allows us to +use a Cross-encoder QE relevance module for high accuracy as well as cache QE +predictions for fast online inference. Utilizing soft logic makes the +prediction procedure interpretable and intervenable. We also show that +pretraining the QE module with auto-generated QE data from user logs can +further improve the overall performance. The proposed method is evaluated on +labeled data from e-commerce websites. Empirical results show that it achieves +promising improvements with computation efficiency. + +
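+
+ The decomposition lends itself to a simple sketch: score each query-entity
+pair with a cross-encoder, cache those scores for reuse, and aggregate them
+into a query-item score with a soft-logic rule (a noisy-OR here). The qe_model
+and extract_entities callables are stand-ins for the paper's components.
+
+```python
+from functools import lru_cache
+
+def make_qi_scorer(qe_model, extract_entities):
+    @lru_cache(maxsize=100_000)                 # QE results are reusable across items
+    def qe_score(query, entity):
+        return qe_model(query, entity)          # probability that the entity matches the query
+
+    def qi_score(query, item_text):
+        not_relevant = 1.0
+        for entity in extract_entities(item_text):
+            not_relevant *= 1.0 - qe_score(query, entity)   # soft OR over entities
+        return 1.0 - not_relevant
+    return qi_score
+```
+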
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 135 + +
+
+
+ + ☆ DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity + Human-centric Rendering ICCV2023 + + +
+ Realistic human-centric rendering plays a key role in both computer vision +and computer graphics. Rapid progress has been made in the algorithm aspect +over the years, yet existing human-centric rendering datasets and benchmarks +are rather impoverished in terms of diversity, which are crucial for rendering +effect. Researchers are usually constrained to explore and evaluate a small set +of rendering problems on current datasets, while real-world applications +require methods to be robust across different scenarios. In this work, we +present DNA-Rendering, a large-scale, high-fidelity repository of human +performance data for neural actor rendering. DNA-Rendering presents several +alluring attributes. First, our dataset contains over 1500 human subjects, 5000 +motion sequences, and 67.5M frames' data volume. Second, we provide rich assets +for each subject -- 2D/3D human body keypoints, foreground masks, SMPLX models, +cloth/accessory materials, multi-view images, and videos. These assets boost +the current method's accuracy on downstream rendering tasks. Third, we +construct a professional multi-view system to capture data, which contains 60 +synchronous cameras with max 4096 x 3000 resolution, 15 fps speed, and stern +camera calibration steps, ensuring high-quality resources for task training and +evaluation. Along with the dataset, we provide a large-scale and quantitative +benchmark in full-scale, with multiple tasks to evaluate the existing progress +of novel view synthesis, novel pose animation synthesis, and novel identity +rendering methods. In this manuscript, we describe our DNA-Rendering effort as +a revealing of new observations, challenges, and future directions to +human-centric rendering. The dataset, code, and benchmarks will be publicly +available at https://dna-rendering.github.io/ + +
+
+ comment: This paper is accepted by ICCV2023. Project page: + https://dna-rendering.github.io/ +
+
+
+
+
+ + ☆ Adversarial Latent Autoencoder with Self-Attention for Structural Image + Synthesis + + +
+ Generative Engineering Design approaches driven by Deep Generative Models +(DGM) have been proposed to facilitate industrial engineering processes. In +such processes, designs often come in the form of images, such as blueprints, +engineering drawings, and CAD models depending on the level of detail. DGMs +have been successfully employed for synthesis of natural images, e.g., +displaying animals, human faces and landscapes. However, industrial design +images are fundamentally different from natural scenes in that they contain +rich structural patterns and long-range dependencies, which are challenging for +convolution-based DGMs to generate. Moreover, DGM-driven generation process is +typically triggered based on random noisy inputs, which outputs unpredictable +samples and thus cannot perform an efficient industrial design exploration. We +tackle these challenges by proposing a novel model Self-Attention Adversarial +Latent Autoencoder (SA-ALAE), which allows generating feasible design images of +complex engineering parts. With SA-ALAE, users can not only explore novel +variants of an existing design, but also control the generation process by +operating in latent space. The potential of SA-ALAE is shown by generating +engineering blueprints in a real automotive design task. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Drone navigation and license place detection for vehicle location in + indoor spaces + + +
+ Millions of vehicles are transported every year, tightly parked in vessels or +boats. To reduce the risks of associated safety issues like fires, knowing the +location of vehicles is essential, since different vehicles may need different +mitigation measures, e.g. electric cars. This work is aimed at creating a +solution based on a nano-drone that navigates across rows of parked vehicles +and detects their license plates. We do so via a wall-following algorithm, and +a CNN trained to detect license plates. All computations are done in real-time +on the drone, which just sends position and detected images that allow the +creation of a 2D map with the position of the plates. Our solution is capable +of reading all plates across eight test cases (with several rows of plates, +different drone speeds, or low light) by aggregation of measurements across +several drone journeys. + +
+
+ comment: Published at VIII International Workshop on Artificial Intelligence + and Pattern Recognition, IWAIPR +
+
+
+
+
+ + ☆ Robust Driving Policy Learning with Guided Meta Reinforcement Learning SC 2023 + + +
+ Although deep reinforcement learning (DRL) has shown promising results for +autonomous navigation in interactive traffic scenarios, existing work typically +adopts a fixed behavior policy to control social vehicles in the training +environment. This may cause the learned driving policy to overfit the +environment, making it difficult to interact well with vehicles with different, +unseen behaviors. In this work, we introduce an efficient method to train +diverse driving policies for social vehicles as a single meta-policy. By +randomizing the interaction-based reward functions of social vehicles, we can +generate diverse objectives and efficiently train the meta-policy through +guiding policies that achieve specific objectives. We further propose a +training strategy to enhance the robustness of the ego vehicle's driving policy +using the environment where social vehicles are controlled by the learned +meta-policy. Our method successfully learns an ego driving policy that +generalizes well to unseen situations with out-of-distribution (OOD) social +agents' behaviors in a challenging uncontrolled T-intersection scenario. + +
+
+ comment: ITSC 2023 +
+
+
+
+
+ + ☆ FABRIC: Personalizing Diffusion Models with Iterative Feedback + + +
+ In an era where visual content generation is increasingly driven by machine +learning, the integration of human feedback into generative models presents +significant opportunities for enhancing user experience and output quality. +This study explores strategies for incorporating iterative human feedback into +the generative process of diffusion-based text-to-image models. We propose +FABRIC, a training-free approach applicable to a wide range of popular +diffusion models, which exploits the self-attention layer present in the most +widely used architectures to condition the diffusion process on a set of +feedback images. To ensure a rigorous assessment of our approach, we introduce +a comprehensive evaluation methodology, offering a robust mechanism to quantify +the performance of generative visual models that integrate human feedback. We +show that generation results improve over multiple rounds of iterative feedback +through exhaustive analysis, implicitly optimizing arbitrary user preferences. +The potential applications of these findings extend to fields such as +personalized content creation and customization. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Leveraging Visemes for Better Visual Speech Representation and Lip + Reading + + +
+ Lip reading is a challenging task that has many potential applications in speech recognition, human-computer interaction, and security systems. However, existing lip reading systems often suffer from low accuracy due to the limitations of video features. In this paper, we propose a novel approach that leverages visemes, which are groups of phonetically similar lip shapes, to extract more discriminative and robust video features for lip reading. We evaluate our approach on various tasks, including word-level and sentence-level lip reading, and audiovisual speech recognition using the Arman-AV dataset, a large-scale Persian corpus. Our experimental results show that our viseme-based approach consistently outperforms the state-of-the-art methods in all these tasks. The proposed method reduces the lip-reading word error rate (WER) by 9.1% relative to the best previous method.
+
+

+
+
+
+
+ + ☆ An Improved NeuMIP with Better Accuracy + + +
+ Neural reflectance models are capable of accurately reproducing the +spatially-varying appearance of many real-world materials at different scales. +However, existing methods have difficulties handling highly glossy materials. +To address this problem, we introduce a new neural reflectance model which, +compared with existing methods, better preserves not only specular highlights +but also fine-grained details. To this end, we enhance the neural network +performance by encoding input data to frequency space, inspired by NeRF, to +better preserve the details. Furthermore, we introduce a gradient-based loss +and employ it in multiple stages, adaptive to the progress of the learning +phase. Lastly, we utilize an optional extension to the decoder network using +the Inception module for more accurate yet costly performance. We demonstrate +the effectiveness of our method using a variety of synthetic and real examples. + +
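As context for the frequency-space encoding mentioned above, which the abstract says is inspired by NeRF, here is a minimal sketch of the standard NeRF-style frequency (positional) encoding; the number of bands is chosen arbitrarily for illustration and is not taken from the paper.

```python
import torch

def frequency_encode(x, num_bands=6):
    """NeRF-style frequency encoding: map each input coordinate to
    [sin(2^k * pi * x), cos(2^k * pi * x)] for k = 0..num_bands-1.
    Feeding coordinates to a network in this form helps it preserve
    fine detail, which is the idea borrowed from NeRF above."""
    feats = [x]
    for k in range(num_bands):
        feats.append(torch.sin((2.0 ** k) * torch.pi * x))
        feats.append(torch.cos((2.0 ** k) * torch.pi * x))
    return torch.cat(feats, dim=-1)

# a (u, v) texture coordinate becomes a 2 + 2*2*6 = 26-dimensional feature
encoded = frequency_encode(torch.rand(1024, 2))
```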
+
+
+
+
+ + ☆ General vs. Long-Tailed Age Estimation: An Approach to Kill Two Birds + with One Stone + + +
+ Facial age estimation has received a lot of attention for its diverse application scenarios. Most existing studies treat each sample equally and aim to reduce the average estimation error for the entire dataset, which can be summarized as General Age Estimation. However, due to the long-tailed distribution prevalent in the dataset, treating all samples equally will inevitably bias the model toward the head classes (usually adults, which account for the majority of samples). Driven by this, some works suggest that each class should be treated equally to improve performance in tail classes (with a minority of samples), which can be summarized as Long-tailed Age Estimation. However, Long-tailed Age Estimation usually faces a performance trade-off, i.e., achieving improvement in tail classes by sacrificing the head classes. In this paper, our goal is to design a unified framework to perform well on both tasks, killing two birds with one stone. To this end, we propose a simple, effective, and flexible training paradigm named GLAE, which is two-fold. Our GLAE provides a surprising improvement on Morph II, reaching the lowest MAE and CMAE of 1.14 and 1.27 years, respectively. Compared to the previous best method, MAE dropped by up to 34%, which is an unprecedented improvement, and for the first time MAE is close to one year. Extensive experiments on other age benchmark datasets, including CACD, MIVIA, and Chalearn LAP 2015, also indicate that GLAE outperforms the state-of-the-art approaches significantly.
+
+

+
+
+
+
+ + ☆ Two Approaches to Supervised Image Segmentation + + +
+ Though performed almost effortlessly by humans, segmenting 2D gray-scale or color images in terms of their constituent regions of interest (e.g., background, objects or portions of objects) constitutes one of the greatest challenges in science and technology as a consequence of the involved dimensionality reduction (3D to 2D), noise, reflections, shades, and occlusions, among many other possible effects. While a large number of interesting approaches have been suggested over the last decades, it was mainly with the more recent development of deep learning that more effective and general solutions have been obtained, currently constituting the basic comparison reference for this type of operation. Also developed recently, a multiset-based methodology has been described that is capable of encouraging performance combining spatial accuracy, stability, and robustness while requiring minimal computational resources (hardware and/or training and recognition time). The interesting features of the latter methodology mostly follow from the enhanced selectivity and sensitivity, as well as the good robustness to data perturbations and outliers, allowed by the coincidence similarity index on which the multiset approach to supervised image segmentation is based. After describing the deep learning and multiset approaches, the present work develops two comparison experiments between them, primarily aimed at illustrating their respective main interesting features when applied to the adopted specific type of data and parameter configurations. While the deep learning approach confirmed its potential for performing image segmentation, the alternative multiset methodology allowed for encouraging accuracy while requiring little computational resources.
+
+

+
+ comment: 37 pages, 18 figures +
+
+
+
+
+ + ☆ Boundary-Refined Prototype Generation: A General End-to-End Paradigm for + Semi-Supervised Semantic Segmentation + + +
+ Prototype-based classification is a classical method in machine learning, and +recently it has achieved remarkable success in semi-supervised semantic +segmentation. However, the current approach isolates the prototype +initialization process from the main training framework, which appears to be +unnecessary. Furthermore, while the direct use of K-Means algorithm for +prototype generation has considered rich intra-class variance, it may not be +the optimal solution for the classification task. To tackle these problems, we +propose a novel boundary-refined prototype generation (BRPG) method, which is +incorporated into the whole training framework. Specifically, our approach +samples and clusters high- and low-confidence features separately based on a +confidence threshold, aiming to generate prototypes closer to the class +boundaries. Moreover, an adaptive prototype optimization strategy is introduced +to make prototype augmentation for categories with scattered feature +distributions. Extensive experiments on the PASCAL VOC 2012 and Cityscapes +datasets demonstrate the superiority and scalability of the proposed method, +outperforming the current state-of-the-art approaches. The code is available at +xxxxxxxxxxxxxx. + +
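A rough sketch of the confidence-split clustering idea described above; the threshold, cluster count, and use of scikit-learn's KMeans are illustrative assumptions, not the paper's exact BRPG procedure.

```python
import numpy as np
from sklearn.cluster import KMeans

def boundary_refined_prototypes(features, confidences, tau=0.8, k=4):
    """Cluster high- and low-confidence features of one class separately.

    features:    (N, D) per-pixel/per-region features of a single class
    confidences: (N,) predicted confidence for those features
    Low-confidence features tend to lie near class boundaries, so clustering
    them separately yields extra prototypes closer to the boundaries."""
    protos = []
    for mask in (confidences >= tau, confidences < tau):
        subset = features[mask]
        if len(subset) >= k:
            protos.append(KMeans(n_clusters=k, n_init=10).fit(subset).cluster_centers_)
    if not protos:
        return np.empty((0, features.shape[1]))
    return np.concatenate(protos, axis=0)

protos = boundary_refined_prototypes(np.random.rand(500, 32), np.random.rand(500))
```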
+
+ comment: 53 pages, 7 figures +
+
+
+
+
+ + ☆ Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D + Brain MRI Synthesis MICCAI 2023 + + +
+ Cross-modality medical image synthesis is a critical topic and has the +potential to facilitate numerous applications in the medical imaging field. +Despite recent successes in deep-learning-based generative models, most current +medical image synthesis methods rely on generative adversarial networks and +suffer from notorious mode collapse and unstable training. Moreover, the 2D +backbone-driven approaches would easily result in volumetric inconsistency, +while 3D backbones are challenging and impractical due to the tremendous memory +cost and training difficulty. In this paper, we introduce a new paradigm for +volumetric medical data synthesis by leveraging 2D backbones and present a +diffusion-based framework, Make-A-Volume, for cross-modality 3D medical image +synthesis. To learn the cross-modality slice-wise mapping, we employ a latent +diffusion model and learn a low-dimensional latent space, resulting in high +computational efficiency. To enable the 3D image synthesis and mitigate +volumetric inconsistency, we further insert a series of volumetric layers in +the 2D slice-mapping model and fine-tune them with paired 3D data. This +paradigm extends the 2D image diffusion model to a volumetric version with a +slightly increasing number of parameters and computation, offering a principled +solution for generic cross-modality 3D medical image synthesis. We showcase the +effectiveness of our Make-A-Volume framework on an in-house SWI-MRA brain MRI +dataset and a public T1-T2 brain MRI dataset. Experimental results demonstrate +that our framework achieves superior synthesis results with volumetric +consistency. + +
+
+ comment: Accepted by International Conference on Medical Image Computing and + Computer Assisted Intervention (MICCAI 2023). 10 pages, 4 figures +
+
+
+
+
+ + ☆ Unsupervised Accuracy Estimation of Deep Visual Models using + Domain-Adaptive Adversarial Perturbation without Source Samples ICCV 2023 + + +
+ Deploying deep visual models can lead to performance drops due to the +discrepancies between source and target distributions. Several approaches +leverage labeled source data to estimate target domain accuracy, but accessing +labeled source data is often prohibitively difficult due to data +confidentiality or resource limitations on serving devices. Our work proposes a +new framework to estimate model accuracy on unlabeled target data without +access to source data. We investigate the feasibility of using pseudo-labels +for accuracy estimation and evolve this idea into adopting recent advances in +source-free domain adaptation algorithms. Our approach measures the +disagreement rate between the source hypothesis and the target pseudo-labeling +function, adapted from the source hypothesis. We mitigate the impact of +erroneous pseudo-labels that may arise due to a high ideal joint hypothesis +risk by employing adaptive adversarial perturbation on the input of the target +model. Our proposed source-free framework effectively addresses the challenging +distribution shift scenarios and outperforms existing methods requiring source +data and labels for training. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Divert More Attention to Vision-Language Object Tracking + + +
+ Multimodal vision-language (VL) learning has noticeably pushed the tendency toward generic intelligence owing to emerging large foundation models. However, tracking, as a fundamental vision problem, has surprisingly benefited little from the recent flourishing of VL learning. We argue that the reasons are two-fold: the lack of large-scale vision-language annotated videos and the ineffective vision-language interaction learning of current works. These shortcomings motivate us to design a more effective vision-language representation for tracking, while constructing a large database with language annotations for model learning. Particularly, in this paper, we first propose a general attribute annotation strategy to decorate videos in six popular tracking benchmarks, which contributes a large-scale vision-language tracking database with more than 23,000 videos. We then introduce a novel framework to improve tracking by learning a unified-adaptive VL representation, where the cores are the proposed asymmetric architecture search and modality mixer (ModaMixer). To further improve the VL representation, we introduce a contrastive loss to align different modalities. To thoroughly demonstrate the effectiveness of our method, we integrate the proposed framework into three tracking methods with different designs, i.e., the CNN-based SiamCAR, the Transformer-based OSTrack, and the hybrid structure TransT. The experiments demonstrate that our framework can significantly improve all baselines on six benchmarks. Besides empirical results, we theoretically analyze our approach to show its rationality. By revealing the potential of VL representation, we expect the community to divert more attention to VL tracking and hope to open more possibilities for future tracking with diversified multimodal messages.
+
+

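The contrastive loss used to align modalities is not spelled out in the abstract; a common symmetric InfoNCE formulation that serves the same purpose (the paper's actual loss may differ) looks like this:

```python
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(vis_emb, lang_emb, temperature=0.07):
    """Symmetric InfoNCE loss pulling matched vision/language embeddings
    together and pushing mismatched pairs apart.

    vis_emb, lang_emb: (B, D) embeddings of B matched video/text pairs."""
    v = F.normalize(vis_emb, dim=-1)
    t = F.normalize(lang_emb, dim=-1)
    logits = v @ t.T / temperature            # (B, B) scaled cosine similarities
    targets = torch.arange(v.size(0), device=v.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.T, targets))

loss = contrastive_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))
```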
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ Class Attention to Regions of Lesion for Imbalanced Medical Image + Recognition + + +
+ Automated medical image classification is the key component in intelligent +diagnosis systems. However, most medical image datasets contain plenty of +samples of common diseases and just a handful of rare ones, leading to major +class imbalances. Currently, it is an open problem in intelligent diagnosis to +effectively learn from imbalanced training data. In this paper, we propose a +simple yet effective framework, named \textbf{C}lass \textbf{A}ttention to +\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by +embedding attention into the training process of \textbf{C}onvolutional +\textbf{N}eural \textbf{N}etworks (CNNs). The proposed attention module helps +CNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn +their characteristics more effectively. In addition, this attention module +works only during the training phase and does not change the architecture of +the original network, so it can be directly combined with any existing CNN +architecture. The CARE framework needs bounding boxes to represent the lesion +regions of rare diseases. To alleviate the need for manual annotation, we +further developed variants of CARE by leveraging the traditional saliency +methods or a pretrained segmentation model for bounding box generation. Results +show that the CARE variants with automated bounding box generation are +comparable to the original CARE framework with \textit{manual} bounding box +annotations. A series of experiments on an imbalanced skin image dataset and a +pneumonia dataset indicates that our method can effectively help the network +focus on the lesion regions of rare diseases and remarkably improves the +classification performance of rare diseases. + +
+
+ comment: Accepted by Neurocomputing on July 2023. 37 pages +
+
+
+
+
+ + ☆ Towards Fair Face Verification: An In-depth Analysis of Demographic + Biases + + +
+ Deep learning-based person identification and verification systems have +remarkably improved in terms of accuracy in recent years; however, such +systems, including widely popular cloud-based solutions, have been found to +exhibit significant biases related to race, age, and gender, a problem that +requires in-depth exploration and solutions. This paper presents an in-depth +analysis, with a particular emphasis on the intersectionality of these +demographic factors. Intersectional bias refers to the performance +discrepancies w.r.t. the different combinations of race, age, and gender +groups, an area relatively unexplored in current literature. Furthermore, the +reliance of most state-of-the-art approaches on accuracy as the principal +evaluation metric often masks significant demographic disparities in +performance. To counter this crucial limitation, we incorporate five additional +metrics in our quantitative analysis, including disparate impact and +mistreatment metrics, which are typically ignored by the relevant +fairness-aware approaches. Results on the Racial Faces in-the-Wild (RFW) +benchmark indicate pervasive biases in face recognition systems, extending +beyond race, with different demographic factors yielding significantly +disparate outcomes. In particular, Africans demonstrate an 11.25% lower True +Positive Rate (TPR) compared to Caucasians, while only a 3.51% accuracy drop is +observed. Even more concerning, the intersections of multiple protected groups, +such as African females over 60 years old, demonstrate a +39.89% disparate +mistreatment rate compared to the highest Caucasians rate. By shedding light on +these biases and their implications, this paper aims to stimulate further +research towards developing fairer, more equitable face recognition and +verification systems. + +
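For readers who want to reproduce this style of analysis, here is a minimal sketch of per-group true positive rate and disparate impact on binary verification decisions; the group labels, toy data, and restriction to these two metrics are assumptions for illustration, not the paper's full evaluation protocol.

```python
import numpy as np

def group_metrics(y_true, y_pred, groups):
    """Per-group true positive rate and overall disparate impact for a binary
    verification task (1 = same identity). A TPR gap across groups is the
    kind of disparity reported in the analysis above."""
    out = {}
    for g in np.unique(groups):
        m = groups == g
        positives = (y_true == 1) & m
        tpr = (y_pred[positives] == 1).mean() if positives.any() else float("nan")
        acceptance = (y_pred[m] == 1).mean()
        out[g] = {"tpr": tpr, "acceptance_rate": acceptance}
    rates = [v["acceptance_rate"] for v in out.values()]
    disparate_impact = min(rates) / max(rates)
    return out, disparate_impact

y_true = np.array([1, 1, 0, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1, 1])
groups = np.array(["A", "A", "A", "B", "B", "B"])
print(group_metrics(y_true, y_pred, groups))
```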
+
+
+
+
+ + ☆ MODA: Mapping-Once Audio-driven Portrait Animation with Dual Attentions ICCV 2023 + + +
+ Audio-driven portrait animation aims to synthesize portrait videos that are +conditioned by given audio. Animating high-fidelity and multimodal video +portraits has a variety of applications. Previous methods have attempted to +capture different motion modes and generate high-fidelity portrait videos by +training different models or sampling signals from given videos. However, +lacking correlation learning between lip-sync and other movements (e.g., head +pose/eye blinking) usually leads to unnatural results. In this paper, we +propose a unified system for multi-person, diverse, and high-fidelity talking +portrait generation. Our method contains three stages, i.e., 1) Mapping-Once +network with Dual Attentions (MODA) generates talking representation from given +audio. In MODA, we design a dual-attention module to encode accurate mouth +movements and diverse modalities. 2) Facial composer network generates dense +and detailed face landmarks, and 3) temporal-guided renderer syntheses stable +videos. Extensive evaluations demonstrate that the proposed system produces +more natural and realistic video portraits compared to previous methods. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ TbExplain: A Text-based Explanation Method for Scene Classification + Models with the Statistical Prediction Correction + + +
+ The field of Explainable Artificial Intelligence (XAI) aims to improve the +interpretability of black-box machine learning models. Building a heatmap based +on the importance value of input features is a popular method for explaining +the underlying functions of such models in producing their predictions. +Heatmaps are almost understandable to humans, yet they are not without flaws. +Non-expert users, for example, may not fully understand the logic of heatmaps +(the logic in which relevant pixels to the model's prediction are highlighted +with different intensities or colors). Additionally, objects and regions of the +input image that are relevant to the model prediction are frequently not +entirely differentiated by heatmaps. In this paper, we propose a framework +called TbExplain that employs XAI techniques and a pre-trained object detector +to present text-based explanations of scene classification models. Moreover, +TbExplain incorporates a novel method to correct predictions and textually +explain them based on the statistics of objects in the input image when the +initial prediction is unreliable. To assess the trustworthiness and validity of +the text-based explanations, we conducted a qualitative experiment, and the +findings indicated that these explanations are sufficiently reliable. +Furthermore, our quantitative and qualitative experiments on TbExplain with +scene classification datasets reveal an improvement in classification accuracy +over ResNet variants. + +
+
+
+
+
+ + ☆ As large as it gets: Learning infinitely large Filters via Neural + Implicit Functions in the Fourier Domain + + +
+ Motivated by the recent trend towards the usage of larger receptive fields +for more context-aware neural networks in vision applications, we aim to +investigate how large these receptive fields really need to be. To facilitate +such study, several challenges need to be addressed, most importantly: (i) We +need to provide an effective way for models to learn large filters (potentially +as large as the input data) without increasing their memory consumption during +training or inference, (ii) the study of filter sizes has to be decoupled from +other effects such as the network width or number of learnable parameters, and +(iii) the employed convolution operation should be a plug-and-play module that +can replace any conventional convolution in a Convolutional Neural Network +(CNN) and allow for an efficient implementation in current frameworks. To +facilitate such models, we propose to learn not spatial but frequency +representations of filter weights as neural implicit functions, such that even +infinitely large filters can be parameterized by only a few learnable weights. +The resulting neural implicit frequency CNNs are the first models to achieve +results on par with the state-of-the-art on large image classification +benchmarks while executing convolutions solely in the frequency domain and can +be employed within any CNN architecture. They allow us to provide an extensive +analysis of the learned receptive fields. Interestingly, our analysis shows +that, although the proposed networks could learn very large convolution +kernels, the learned filters practically translate into well-localized and +relatively small convolution kernels in the spatial domain. + +
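A minimal sketch of the general idea of parameterizing filter weights in the frequency domain with a neural implicit function and applying them by pointwise multiplication after an FFT; this is a simplified stand-in for the paper's neural implicit frequency CNN, not its exact architecture or training setup.

```python
import torch
import torch.nn as nn

class ImplicitFrequencyFilter(nn.Module):
    """A filter as large as the input, parameterized by a tiny MLP over
    normalized frequency coordinates and applied as a pointwise
    multiplication in the Fourier domain."""

    def __init__(self, channels, hidden=32):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2, hidden), nn.ReLU(),
            nn.Linear(hidden, channels),
        )

    def forward(self, x):                      # x: (B, C, H, W)
        B, C, H, W = x.shape
        fy = torch.fft.fftfreq(H, device=x.device)
        fx = torch.fft.rfftfreq(W, device=x.device)
        grid = torch.stack(torch.meshgrid(fy, fx, indexing="ij"), dim=-1)
        filt = self.mlp(grid).permute(2, 0, 1)  # (C, H, W//2+1) real-valued gains
        X = torch.fft.rfft2(x)
        return torch.fft.irfft2(X * filt, s=(H, W))

# toy usage: the learnable weights are only those of the small MLP
y = ImplicitFrequencyFilter(3)(torch.randn(1, 3, 32, 32))
```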
+
+
+
+
+ + ☆ TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical + Phase Recognition + + +
+ To enable context-aware computer assistance in the operating room of the +future, cognitive systems need to understand automatically which surgical phase +is being performed by the medical team. The primary source of information for +surgical phase recognition is typically video, which presents two challenges: +extracting meaningful features from the video stream and effectively modeling +temporal information in the sequence of visual features. For temporal modeling, +attention mechanisms have gained popularity due to their ability to capture +long-range dependencies. In this paper, we explore design choices for attention +in existing temporal models for surgical phase recognition and propose a novel +approach that does not resort to local attention or regularization of attention +weights: TUNeS is an efficient and simple temporal model that incorporates +self-attention at the coarsest stage of a U-Net-like structure. In addition, we +propose to train the feature extractor, a standard CNN, together with an LSTM +on preferably long video segments, i.e., with long temporal context. In our +experiments, all temporal models performed better on top of feature extractors +that were trained with longer temporal context. On top of these contextualized +features, TUNeS achieves state-of-the-art results on Cholec80. + +
+
+
+
+
+ + ☆ Impact of Disentanglement on Pruning Neural Networks SC + + +
+ Deploying deep learning neural networks on edge devices, to accomplish task +specific objectives in the real-world, requires a reduction in their memory +footprint, power consumption, and latency. This can be realized via efficient +model compression. Disentangled latent representations produced by variational +autoencoder (VAE) networks are a promising approach for achieving model +compression because they mainly retain task-specific information, discarding +useless information for the task at hand. We make use of the Beta-VAE framework +combined with a standard criterion for pruning to investigate the impact of +forcing the network to learn disentangled representations on the pruning +process for the task of classification. In particular, we perform experiments +on MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose +a path forward for future works. + +
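For reference, the Beta-VAE objective that the study builds on is the standard reconstruction term plus a beta-weighted KL divergence; a minimal sketch follows, where the beta value and the choice of reconstruction loss are illustrative rather than taken from the paper.

```python
import torch
import torch.nn.functional as F

def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
    """Beta-VAE objective: reconstruction term plus beta-weighted KL divergence.
    A larger beta pressures the latent code toward disentangled factors,
    which is the property the pruning study above relies on."""
    recon = F.mse_loss(x_recon, x, reduction="sum") / x.size(0)
    # KL( q(z|x) || N(0, I) ) for a diagonal Gaussian posterior
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
    return recon + beta * kl

# toy usage with a 10-dimensional latent space
x, x_recon = torch.rand(8, 784), torch.rand(8, 784)
mu, logvar = torch.zeros(8, 10), torch.zeros(8, 10)
print(beta_vae_loss(x, x_recon, mu, logvar))
```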
+
+ comment: Presented in ISCS23 +
+
+
+
+
+ + ☆ TinyTrain: Deep Neural Network Training at the Extreme Edge + + +
+ On-device training is essential for user personalisation and privacy. With +the pervasiveness of IoT devices and microcontroller units (MCU), this task +becomes more challenging due to the constrained memory and compute resources, +and the limited availability of labelled user data. Nonetheless, prior works +neglect the data scarcity issue, require excessively long training time (e.g. a +few hours), or induce substantial accuracy loss ($\geq$10\%). We propose +TinyTrain, an on-device training approach that drastically reduces training +time by selectively updating parts of the model and explicitly coping with data +scarcity. TinyTrain introduces a task-adaptive sparse-update method that +dynamically selects the layer/channel based on a multi-objective criterion that +jointly captures user data, the memory, and the compute capabilities of the +target device, leading to high accuracy on unseen tasks with reduced +computation and memory footprint. TinyTrain outperforms vanilla fine-tuning of +the entire network by 3.6-5.0\% in accuracy, while reducing the backward-pass +memory and computation cost by up to 2,286$\times$ and 7.68$\times$, +respectively. Targeting broadly used real-world edge devices, TinyTrain +achieves 9.5$\times$ faster and 3.5$\times$ more energy-efficient training over +status-quo approaches, and 2.8$\times$ smaller memory footprint than SOTA +approaches, while remaining within the 1 MB memory envelope of MCU-grade +platforms. + +
+
+
+
+
+ + ☆ Lazy Visual Localization via Motion Averaging + + +
+ Visual (re)localization is critical for various applications in computer +vision and robotics. Its goal is to estimate the 6 degrees of freedom (DoF) +camera pose for each query image, based on a set of posed database images. +Currently, all leading solutions are structure-based that either explicitly +construct 3D metric maps from the database with structure-from-motion, or +implicitly encode the 3D information with scene coordinate regression models. +On the contrary, visual localization without reconstructing the scene in 3D +offers clear benefits. It makes deployment more convenient by reducing database +pre-processing time, releasing storage requirements, and remaining unaffected +by imperfect reconstruction, etc. In this technical report, we demonstrate that +it is possible to achieve high localization accuracy without reconstructing the +scene from the database. The key to achieving this owes to a tailored motion +averaging over database-query pairs. Experiments show that our visual +localization proposal, LazyLoc, achieves comparable performance against +state-of-the-art structure-based methods. Furthermore, we showcase the +versatility of LazyLoc, which can be easily extended to handle complex +configurations such as multi-query co-localization and camera rigs. + +
+
+
+
+
+ + ☆ U-CE: Uncertainty-aware Cross-Entropy for Semantic Segmentation + + +
+ Deep neural networks have shown exceptional performance in various tasks, but +their lack of robustness, reliability, and tendency to be overconfident pose +challenges for their deployment in safety-critical applications like autonomous +driving. In this regard, quantifying the uncertainty inherent to a model's +prediction is a promising endeavour to address these shortcomings. In this +work, we present a novel Uncertainty-aware Cross-Entropy loss (U-CE) that +incorporates dynamic predictive uncertainties into the training process by +pixel-wise weighting of the well-known cross-entropy loss (CE). Through +extensive experimentation, we demonstrate the superiority of U-CE over regular +CE training on two benchmark datasets, Cityscapes and ACDC, using two common +backbone architectures, ResNet-18 and ResNet-101. With U-CE, we manage to train +models that not only improve their segmentation performance but also provide +meaningful uncertainties after training. Consequently, we contribute to the +development of more robust and reliable segmentation models, ultimately +advancing the state-of-the-art in safety-critical applications and beyond. + +
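A minimal sketch of pixel-wise uncertainty weighting of the cross-entropy loss in the spirit of U-CE; the weighting function and the source of the uncertainty map are assumptions here, and the paper's exact formulation may differ.

```python
import torch
import torch.nn.functional as F

def uncertainty_weighted_ce(logits, target, uncertainty, eps=1e-6):
    """Pixel-wise cross-entropy scaled by a per-pixel uncertainty map.

    logits:      (B, C, H, W) raw class scores
    target:      (B, H, W) integer class labels
    uncertainty: (B, H, W) per-pixel uncertainty, e.g. predictive entropy from
                 Monte-Carlo dropout, assumed normalized to [0, 1]"""
    ce = F.cross_entropy(logits, target, reduction="none")   # (B, H, W)
    weights = 1.0 + uncertainty                               # up-weight uncertain pixels
    return (weights * ce).sum() / (weights.sum() + eps)

# toy usage: 19 classes, 64x64 predictions
logits = torch.randn(2, 19, 64, 64, requires_grad=True)
target = torch.randint(0, 19, (2, 64, 64))
unc = torch.rand(2, 64, 64)
uncertainty_weighted_ce(logits, target, unc).backward()
```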
+
+ comment: 10 pages, 3 figures, 7 tables, 1 algorithm +
+
+
+
+
+ + ☆ ProtoCaps: A Fast and Non-Iterative Capsule Network Routing Method + + +
+ Capsule Networks have emerged as a powerful class of deep learning +architectures, known for robust performance with relatively few parameters +compared to Convolutional Neural Networks (CNNs). However, their inherent +efficiency is often overshadowed by their slow, iterative routing mechanisms +which establish connections between Capsule layers, posing computational +challenges resulting in an inability to scale. In this paper, we introduce a +novel, non-iterative routing mechanism, inspired by trainable prototype +clustering. This innovative approach aims to mitigate computational complexity, +while retaining, if not enhancing, performance efficacy. Furthermore, we +harness a shared Capsule subspace, negating the need to project each +lower-level Capsule to each higher-level Capsule, thereby significantly +reducing memory requisites during training. Our approach demonstrates superior +results compared to the current best non-iterative Capsule Network and tests on +the Imagewoof dataset, which is too computationally demanding to handle +efficiently by iterative approaches. Our findings underscore the potential of +our proposed methodology in enhancing the operational efficiency and +performance of Capsule Networks, paving the way for their application in +increasingly complex computational scenarios. + +
+
+ comment: 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point + Clouds of Deformable Objects + + +
+ This paper focuses on motion prediction for point cloud sequences in the +challenging case of deformable 3D objects, such as human body motion. First, we +investigate the challenges caused by deformable shapes and complex motions +present in this type of representation, with the ultimate goal of understanding +the technical limitations of state-of-the-art models. From this understanding, +we propose an improved architecture for point cloud prediction of deformable 3D +objects. Specifically, to handle deformable shapes, we propose a graph-based +approach that learns and exploits the spatial structure of point clouds to +extract more representative features. Then we propose a module able to combine +the learned features in an adaptative manner according to the point cloud +movements. The proposed adaptative module controls the composition of local and +global motions for each point, enabling the network to model complex motions in +deformable 3D objects more effectively. We tested the proposed method on the +following datasets: MNIST moving digits, the Mixamo human bodies motions, JPEG +and CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate that +our method outperforms the current baseline methods given its improved ability +to model complex movements as well as preserve point cloud shape. Furthermore, +we demonstrate the generalizability of the proposed framework for dynamic +feature learning, by testing the framework for action recognition on the +MSRAction3D dataset and achieving results on-par with state-of-the-art methods + +
+
+
+
+
+ + ☆ Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to + Harness Spurious Features + + +
+ To avoid failures on out-of-distribution data, recent works have sought to +extract features that have a stable or invariant relationship with the label +across domains, discarding the "spurious" or unstable features whose +relationship with the label changes across domains. However, unstable features +often carry complementary information about the label that could boost +performance if used correctly in the test domain. Our main contribution is to +show that it is possible to learn how to use these unstable features in the +test domain without labels. In particular, we prove that pseudo-labels based on +stable features provide sufficient guidance for doing so, provided that stable +and unstable features are conditionally independent given the label. Based on +this theoretical insight, we propose Stable Feature Boosting (SFB), an +algorithm for: (i) learning a predictor that separates stable and +conditionally-independent unstable features; and (ii) using the stable-feature +predictions to adapt the unstable-feature predictions in the test domain. +Theoretically, we prove that SFB can learn an asymptotically-optimal predictor +without test-domain labels. Empirically, we demonstrate the effectiveness of +SFB on real and synthetic data. + +
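A simplified sketch of the adaptation step in the spirit of SFB, where pseudo-labels from the stable-feature predictor supervise the unstable-feature head on unlabeled test data; the optimizer, schedule, and the way the two predictors are ultimately combined are assumptions, and the actual algorithm and its theoretical conditions involve more than this.

```python
import torch
import torch.nn.functional as F

def adapt_unstable_head(stable_logits, unstable_head, unstable_feats,
                        steps=50, lr=1e-3):
    """Fit the unstable-feature head on test-domain data using pseudo-labels
    produced by the stable-feature predictor (no test labels required)."""
    pseudo = stable_logits.argmax(dim=1).detach()
    opt = torch.optim.Adam(unstable_head.parameters(), lr=lr)
    for _ in range(steps):
        loss = F.cross_entropy(unstable_head(unstable_feats), pseudo)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return unstable_head

# toy usage: 3 classes, 16-dim unstable features, 32 unlabeled test samples
head = torch.nn.Linear(16, 3)
head = adapt_unstable_head(torch.randn(32, 3), head, torch.randn(32, 16))
```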
+
+
+
+
+ + ☆ DISA: DIfferentiable Similarity Approximation for Universal Multimodal + Registration MICCAI 2023 + + +
+ Multimodal image registration is a challenging but essential step for numerous image-guided procedures. Most registration algorithms rely on the computation of complex, frequently non-differentiable similarity metrics to deal with the appearance discrepancy of anatomical structures between imaging modalities. Recent machine learning based approaches are limited to specific anatomy-modality combinations and do not generalize to new settings. We propose a generic framework for creating expressive cross-modal descriptors that enable fast deformable global registration. We achieve this by approximating existing metrics with a dot-product in the feature space of a small convolutional neural network (CNN), which is inherently differentiable and can be trained without registered data. Our method is several orders of magnitude faster than local patch-based metrics and can be directly applied in clinical settings by replacing the similarity measure with the proposed one. Experiments on three different datasets demonstrate that our approach generalizes well beyond the training data, yielding a broad capture range even on unseen anatomies and modality pairs, without the need for specialized retraining. We make our training code and data publicly available.
+
+

+
+ comment: This preprint was submitted to MICCAI 2023. The Version of Record of + this contribution will be published in Springer LNCS +
+
+
+
+
+ + ☆ Measuring and Modeling Uncertainty Degree for Monocular Depth Estimation + + +
+ Effectively measuring and modeling the reliability of a trained model is essential to the real-world deployment of monocular depth estimation (MDE) models. However, the intrinsic ill-posedness and ordinal-sensitive nature of MDE pose major challenges to estimating the uncertainty of trained models. On the one hand, current uncertainty modeling methods may increase memory consumption and are usually time-consuming. On the other hand, measuring the uncertainty based on model accuracy can also be problematic, as uncertainty reliability and prediction accuracy are not well decoupled. In this paper, we propose to model the uncertainty of MDE models from the perspective of the inherent probability distributions originating from the depth probability volume and its extensions, and to assess it more fairly with more comprehensive metrics. By simply introducing additional training regularization terms, our model, with a surprisingly simple formulation and without requiring extra modules or multiple inferences, can provide uncertainty estimations with state-of-the-art reliability, and can be further improved when combined with ensemble or sampling methods. A series of experiments demonstrate the effectiveness of our methods.
+
+

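One simple way to read uncertainty out of a depth probability volume, as referenced above, is via its per-pixel variance or entropy; the sketch below illustrates this and does not reproduce the paper's exact formulation or regularization terms.

```python
import torch

def depth_stats(prob_volume, depth_bins, eps=1e-8):
    """Per-pixel expectation, variance and entropy of a depth probability volume.

    prob_volume: (B, D, H, W) softmax weights over D depth hypotheses
    depth_bins:  (D,) metric depth value of each hypothesis
    Variance and entropy act as simple per-pixel uncertainty measures."""
    d = depth_bins.view(1, -1, 1, 1)
    mean = (prob_volume * d).sum(dim=1)
    var = (prob_volume * (d - mean.unsqueeze(1)) ** 2).sum(dim=1)
    entropy = -(prob_volume * (prob_volume + eps).log()).sum(dim=1)
    return mean, var, entropy

p = torch.softmax(torch.randn(1, 64, 32, 32), dim=1)
bins = torch.linspace(0.5, 10.0, 64)
mean, var, entropy = depth_stats(p, bins)
```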
+
+
+
+
+ + ☆ Embedded Heterogeneous Attention Transformer for Cross-lingual Image + Captioning + + +
+ Cross-lingual image captioning is confronted with both cross-lingual and cross-modal challenges for multimedia analysis. The crucial issue in this task is to model the global and local matching between the image and different languages. Existing cross-modal embedding methods based on the Transformer architecture overlook the local matching between image regions and monolingual words, let alone across a variety of different languages. Due to the heterogeneous nature of the cross-modal and cross-lingual task, we utilize a heterogeneous network to establish cross-domain relationships and local correspondences between the image and different languages. In this paper, we propose an Embedded Heterogeneous Attention Transformer (EHAT) to build cross-domain reasoning paths for cross-lingual image captioning and integrate them into the Transformer. The proposed EHAT consists of a Masked Heterogeneous Cross-attention (MHCA), a Heterogeneous Attention Reasoning Network (HARN) and a Heterogeneous Co-attention (HCA). HARN, the core network, models and infers cross-domain relationships anchored by visual bounding-box representation features, connecting the word features of the two languages and learning the heterogeneous maps. MHCA and HCA implement cross-domain integration in the encoder through specialized heterogeneous attention and enable a single model to generate captions in two languages. We test on the MSCOCO dataset, generating English and Chinese captions, two widely used languages with a clear difference between their language families. Our experiments show that our method even outperforms advanced monolingual methods.
+
+

+
+
+
+
+ + ☆ Implicit Identity Representation Conditioned Memory Compensation Network + for Talking Head video Generation ICCV2023 + + +
+ Talking head video generation aims to animate a human face in a still image with dynamic poses and expressions using motion information derived from a target-driving video, while maintaining the person's identity in the source image. However, dramatic and complex motions in the driving video cause ambiguous generation, because the still source image cannot provide sufficient appearance information for occluded regions or delicate expression variations, which produces severe artifacts and significantly degrades the generation quality. To tackle this problem, we propose to learn a global facial representation space, and design a novel implicit identity representation conditioned memory compensation network, coined as MCNet, for high-fidelity talking head generation. Specifically, we devise a network module to learn a unified spatial facial meta-memory bank from all training samples, which can provide rich facial structure and appearance priors to compensate warped source facial features for the generation. Furthermore, we propose an effective query mechanism based on implicit identity representations learned from the discrete keypoints of the source image. It can greatly facilitate the retrieval of more correlated information from the memory bank for the compensation. Extensive experiments demonstrate that MCNet can learn representative and complementary facial memory, and can clearly outperform previous state-of-the-art talking head generation methods on VoxCeleb1 and CelebV datasets. Please check our project page: https://github.com/harlanhong/ICCV2023-MCNET.
+
+

+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Learning from Abstract Images: on the Importance of Occlusion in a + Minimalist Encoding of Human Poses + + +
+ Existing 2D-to-3D pose lifting networks suffer from poor performance in cross-dataset benchmarks. Although the use of 2D keypoints joined by "stick-figure" limbs has shown promise as an intermediate step, stick-figures do not account for occlusion information that is often inherent in an image. In this paper, we propose a novel representation using opaque 3D limbs that preserves occlusion information while implicitly encoding joint locations. Crucially, when training on data with accurate three-dimensional keypoints and without part-maps, this representation allows training on abstract synthetic images, with occlusion, from as many synthetic viewpoints as desired. The result is a pose defined by limb angles rather than joint positions (because poses are, in the real world, independent of cameras), allowing us to predict poses that are completely independent of camera viewpoint. The result provides not only an improvement in same-dataset benchmarks, but a "quantum leap" in cross-dataset benchmarks.
+
+

+
+ comment: 10 pages +
+
+
+
+
+ + ☆ 3Deformer: A Common Framework for Image-Guided Mesh Deformation + + +
+ We propose 3Deformer, a general-purpose framework for interactive 3D shape +editing. Given a source 3D mesh with semantic materials, and a user-specified +semantic image, 3Deformer can accurately edit the source mesh following the +shape guidance of the semantic image, while preserving the source topology as +rigid as possible. Recent studies of 3D shape editing mostly focus on learning +neural networks to predict 3D shapes, which requires high-cost 3D training +datasets and is limited to handling objects involved in the datasets. Unlike +these studies, our 3Deformer is a non-training and common framework, which only +requires supervision of readily-available semantic images, and is compatible +with editing various objects unlimited by datasets. In 3Deformer, the source +mesh is deformed utilizing the differentiable renderer technique, according to +the correspondences between semantic images and mesh materials. However, +guiding complex 3D shapes with a simple 2D image incurs extra challenges, that +is, the deform accuracy, surface smoothness, geometric rigidity, and global +synchronization of the edited mesh should be guaranteed. To address these +challenges, we propose a hierarchical optimization architecture to balance the +global and local shape features, and propose further various strategies and +losses to improve properties of accuracy, smoothness, rigidity, and so on. +Extensive experiments show that our 3Deformer is able to produce impressive +results and reaches the state-of-the-art level. + +
+
+
+
+
+ + ☆ A reinforcement learning approach for VQA validation: an application to + diabetic macular edema grading + + +
+ Recent advances in machine learning models have greatly increased the performance of automated methods in medical image analysis. However, the internal functioning of such models is largely hidden, which hinders their integration in clinical practice. Explainability and trust are viewed as important aspects of modern methods and are necessary for their widespread adoption in clinical communities. As such, validation of machine learning models represents an important aspect, and yet most methods are only validated in a limited way. In this work, we focus on providing a richer and more appropriate validation approach for highly powerful Visual Question Answering (VQA) algorithms. To better understand the performance of these methods, which answer arbitrary questions related to images, this work focuses on an automatic visual Turing test (VTT). That is, we propose an automatic adaptive questioning method that aims to expose the reasoning behavior of a VQA algorithm. Specifically, we introduce a reinforcement learning (RL) agent that observes the history of previously asked questions and uses it to select the next question to pose. We demonstrate our approach in the context of evaluating algorithms that automatically answer questions related to diabetic macular edema (DME) grading. The experiments show that such an agent behaves similarly to a clinician, asking questions that are relevant to key clinical concepts.
+
+

+
+ comment: 16 pages (+ 23 pages supplementary material) +
+
+
+
+
+ + ☆ A3D: Adaptive, Accurate, and Autonomous Navigation for Edge-Assisted + Drones + + +
+ Accurate navigation is of paramount importance to ensure flight safety and +efficiency for autonomous drones. Recent research starts to use Deep Neural +Networks to enhance drone navigation given their remarkable predictive +capability for visual perception. However, existing solutions either run DNN +inference tasks on drones in situ, impeded by the limited onboard resource, or +offload the computation to external servers which may incur large network +latency. Few works consider jointly optimizing the offloading decisions along +with image transmission configurations and adapting them on the fly. In this +paper, we propose A3D, an edge server assisted drone navigation framework that +can dynamically adjust task execution location, input resolution, and image +compression ratio in order to achieve low inference latency, high prediction +accuracy, and long flight distances. Specifically, we first augment +state-of-the-art convolutional neural networks for drone navigation and define +a novel metric called Quality of Navigation as our optimization objective which +can effectively capture the above goals. We then design a deep reinforcement +learning based neural scheduler at the drone side for which an information +encoder is devised to reshape the state features and thus improve its learning +ability. To further support simultaneous multi-drone serving, we extend the +edge server design by developing a network-aware resource allocation algorithm, +which allows provisioning containerized resources aligned with drones' demand. +We finally implement a proof-of-concept prototype with realistic devices and +validate its performance in a real-world campus scene, as well as a simulation +environment for thorough evaluation upon AirSim. Extensive experimental results +show that A3D can reduce end-to-end latency by 28.06% and extend the flight +distance by up to 27.28% compared with non-adaptive solutions. + +
+
+ comment: Accepted by IEEE/ACM Transactions on Networking +
+
+
+
+
+ + ☆ BSDM: Background Suppression Diffusion Model for Hyperspectral Anomaly + Detection + + +
+ Hyperspectral anomaly detection (HAD) is widely used in Earth observation and +deep space exploration. A major challenge for HAD is the complex background of +the input hyperspectral images (HSIs), resulting in anomalies confused in the +background. On the other hand, the lack of labeled samples for HSIs leads to +poor generalization of existing HAD methods. This paper starts the first +attempt to study a new and generalizable background learning problem without +labeled samples. We present a novel solution BSDM (background suppression +diffusion model) for HAD, which can simultaneously learn latent background +distributions and generalize to different datasets for suppressing complex +background. It is featured in three aspects: (1) For the complex background of +HSIs, we design pseudo background noise and learn the potential background +distribution in it with a diffusion model (DM). (2) For the generalizability +problem, we apply a statistical offset module so that the BSDM adapts to +datasets of different domains without labeling samples. (3) For achieving +background suppression, we innovatively improve the inference process of DM by +feeding the original HSIs into the denoising network, which removes the +background as noise. Our work paves a new background suppression way for HAD +that can improve HAD performance without the prerequisite of manually labeled +data. Assessments and generalization experiments of four HAD methods on several +real HSI datasets demonstrate the above three unique properties of the proposed +method. The code is available at https://github.com/majitao-xd/BSDM-HAD. + +
+
+
+
+
+ + ☆ Blind Image Quality Assessment Using Multi-Stream Architecture with + Spatial and Channel Attention + + +
+ BIQA (Blind Image Quality Assessment) is an important field of study that +evaluates images automatically. Although significant progress has been made, +blind image quality assessment remains a difficult task since images vary in +content and distortions. Most algorithms generate quality without emphasizing +the important region of interest. In order to solve this, a multi-stream +spatial and channel attention-based algorithm is being proposed. This algorithm +generates more accurate predictions with a high correlation to human perceptual +assessment by combining hybrid features from two different backbones, followed +by spatial and channel attention to provide high weights to the region of +interest. Four legacy image quality assessment datasets are used to validate +the effectiveness of our proposed approach. Authentic and synthetic distortion +image databases are used to demonstrate the effectiveness of the proposed +method, and we show that it has excellent generalization properties with a +particular focus on the perceptual foreground information. + +
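A generic stand-in for the spatial and channel attention blocks mentioned above, using a standard SE-style channel gate followed by a simple spatial gate; the paper's actual multi-stream design, backbones, and fusion are not shown.

```python
import torch
import torch.nn as nn

class ChannelSpatialAttention(nn.Module):
    """SE-style channel attention followed by a simple spatial attention map,
    highlighting channels and locations relevant to perceived quality."""

    def __init__(self, channels, reduction=8):
        super().__init__()
        self.channel = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, channels // reduction, 1), nn.ReLU(),
            nn.Conv2d(channels // reduction, channels, 1), nn.Sigmoid(),
        )
        self.spatial = nn.Sequential(nn.Conv2d(2, 1, 7, padding=3), nn.Sigmoid())

    def forward(self, x):
        x = x * self.channel(x)                                   # re-weight channels
        pooled = torch.cat([x.mean(1, keepdim=True),
                            x.max(1, keepdim=True).values], dim=1)
        return x * self.spatial(pooled)                           # re-weight locations

out = ChannelSpatialAttention(64)(torch.randn(1, 64, 28, 28))
```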
+
+
+
+
+ + ☆ Hierarchical Spatio-Temporal Representation Learning for Gait + Recognition ICCV2023 + + +
+ Gait recognition is a biometric technique that identifies individuals by +their unique walking styles, which is suitable for unconstrained environments +and has a wide range of applications. While current methods focus on exploiting +body part-based representations, they often neglect the hierarchical +dependencies between local motion patterns. In this paper, we propose a +hierarchical spatio-temporal representation learning (HSTL) framework for +extracting gait features from coarse to fine. Our framework starts with a +hierarchical clustering analysis to recover multi-level body structures from +the whole body to local details. Next, an adaptive region-based motion +extractor (ARME) is designed to learn region-independent motion features. The +proposed HSTL then stacks multiple ARMEs in a top-down manner, with each ARME +corresponding to a specific partition level of the hierarchy. An adaptive +spatio-temporal pooling (ASTP) module is used to capture gait features at +different levels of detail to perform hierarchical feature mapping. Finally, a +frame-level temporal aggregation (FTA) module is employed to reduce redundant +information in gait sequences through multi-scale temporal downsampling. +Extensive experiments on CASIA-B, OUMVLP, GREW, and Gait3D datasets demonstrate +that our method outperforms the state-of-the-art while maintaining a reasonable +balance between model accuracy and complexity. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Cryo-forum: A framework for orientation recovery with uncertainty + measure with the application in cryo-EM image analysis + + +
+ In single-particle cryo-electron microscopy (cryo-EM), the efficient +determination of orientation parameters for 2D projection images poses a +significant challenge yet is crucial for reconstructing 3D structures. This +task is complicated by the high noise levels present in the cryo-EM datasets, +which often include outliers, necessitating several time-consuming 2D clean-up +processes. Recently, solutions based on deep learning have emerged, offering a +more streamlined approach to the traditionally laborious task of orientation +estimation. These solutions often employ amortized inference, eliminating the +need to estimate parameters individually for each image. However, these methods +frequently overlook the presence of outliers and may not adequately concentrate +on the components used within the network. This paper introduces a novel +approach that uses a 10-dimensional feature vector to represent the orientation +and applies a Quadratically-Constrained Quadratic Program to derive the +predicted orientation as a unit quaternion, supplemented by an uncertainty +metric. Furthermore, we propose a unique loss function that considers the +pairwise distances between orientations, thereby enhancing the accuracy of our +method. Finally, we also comprehensively evaluate the design choices involved +in constructing the encoder network, a topic that has not received sufficient +attention in the literature. Our numerical analysis demonstrates that our +methodology effectively recovers orientations from 2D cryo-EM images in an +end-to-end manner. Importantly, the inclusion of uncertainty quantification +allows for direct clean-up of the dataset at the 3D level. Lastly, we package +our proposed methods into a user-friendly software suite named cryo-forum, +designed for easy accessibility by the developers. + +
+
+ comment: 27 pages, 9 figures +
+
+
+
+
+ + ☆ Compressive Image Scanning Microscope SC + + +
+ We present a novel approach to implement compressive sensing in laser +scanning microscopes (LSM), specifically in image scanning microscopy (ISM), +using a single-photon avalanche diode (SPAD) array detector. Our method +addresses two significant limitations in applying compressive sensing to LSM: +the time to compute the sampling matrix and the quality of reconstructed +images. We employ a fixed sampling strategy, skipping alternate rows and +columns during data acquisition, which reduces the number of points scanned by +a factor of four and eliminates the need to compute different sampling +matrices. By exploiting the parallel images generated by the SPAD array, we +improve the quality of the reconstructed compressive-ISM images compared to +standard compressive confocal LSM images. Our results demonstrate the +effectiveness of our approach in producing higher-quality images with reduced +data acquisition time and potential benefits in reducing photobleaching. + +
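The fixed sampling strategy described above, skipping alternate rows and columns for a four-fold reduction in scanned points, can be written down directly; the reconstruction step that fills in the missing pixels is omitted in this sketch.

```python
import numpy as np

def fixed_scan_mask(height, width):
    """Fixed compressive-scanning mask: keep every other row and every other
    column, so only one pixel in four is actually scanned. The remaining
    pixels are later filled in by the reconstruction step (not shown)."""
    mask = np.zeros((height, width), dtype=bool)
    mask[::2, ::2] = True
    return mask

mask = fixed_scan_mask(512, 512)
print(mask.sum() / mask.size)   # 0.25 -> 4x fewer scan points
```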
+
+ comment: Presented in ISCS23 +
+
+
+
+
+ + ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of +representation learning in neural networks (NNs). Most research in this area +focuses on the learning dynamics of NNs for regression tasks, while little for +classification. This study empirically investigates the latter and expands the +understanding of frequency shortcuts. First, we perform experiments on +synthetic datasets, designed to have a bias in different frequency bands. Our +results demonstrate that NNs tend to find simple solutions for classification, +and what they learn first during training depends on the most distinctive +frequency characteristics, which can be either low- or high-frequencies. +Second, we confirm this phenomenon on natural images. We propose a metric to +measure class-wise frequency characteristics and a method to identify frequency +shortcuts. The results show that frequency shortcuts can be texture-based or +shape-based, depending on what best simplifies the objective. Third, we +validate the transferability of frequency shortcuts on out-of-distribution +(OOD) test sets. Our results suggest that frequency shortcuts can be +transferred across datasets and cannot be fully avoided by larger model +capacity and data augmentation. We recommend that future research should focus +on effective training schemes mitigating frequency shortcut learning. + +
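A rough stand-in for a class-wise frequency characteristic: average the Fourier amplitude spectrum of each class's images and compare it to the dataset-wide average. This is an illustration only, not the paper's proposed metric or its shortcut-identification method.

```python
import numpy as np

def classwise_spectra(images, labels):
    """Mean amplitude spectrum per class, normalized by the overall mean.

    images: (N, H, W) grayscale images, labels: (N,) integer class ids.
    Frequencies with ratios well above 1 are over-represented in that class
    and are candidates for frequency shortcuts."""
    spectra = np.abs(np.fft.fftshift(np.fft.fft2(images), axes=(-2, -1)))
    overall = spectra.mean(axis=0)
    per_class = {}
    for c in np.unique(labels):
        per_class[c] = spectra[labels == c].mean(axis=0) / (overall + 1e-8)
    return per_class

imgs = np.random.rand(100, 32, 32)
labs = np.random.randint(0, 10, size=100)
ratios = classwise_spectra(imgs, labs)
```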
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ☆ Online Continual Learning for Robust Indoor Object Recognition IROS 2023 + + +
+ Vision systems mounted on home robots need to interact with unseen classes in changing environments. Robots have limited computational resources, labelled data and storage capability. These requirements pose some unique challenges: models should adapt without forgetting past knowledge in a data- and parameter-efficient way. We characterize the problem as few-shot (FS) online continual learning (OCL), where robotic agents learn from a non-repeated stream of few-shot data, updating only a few model parameters. Additionally, such models experience variable conditions at test time, where objects may appear in different poses (e.g., horizontal or vertical) and environments (e.g., day or night). To improve the robustness of CL agents, we propose RobOCLe, which: 1) constructs an enriched feature space by computing high-order statistical moments from the embedded features of samples; and 2) computes the similarity between the high-order statistics of samples in the enriched feature space and predicts their class labels. We evaluate the robustness of CL models to train/test augmentations in various cases. We show that different moments allow RobOCLe to capture different properties of deformations, providing higher robustness with no decrease in inference speed.
+
+

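A small sketch of an enriched feature space built from high-order statistical moments of embeddings, as described above; which moments are used and how similarity is ultimately computed are assumptions here.

```python
import torch

def embedding_moments(features, eps=1e-6):
    """Enrich a set of embeddings with its first four statistical moments.

    features: (N, D) embeddings of the samples of one class/episode.
    Returns a (4*D,) vector [mean, var, skewness, kurtosis] that a
    similarity-based classifier can compare across classes."""
    mean = features.mean(dim=0)
    centered = features - mean
    var = centered.pow(2).mean(dim=0)
    std = (var + eps).sqrt()
    skew = (centered / std).pow(3).mean(dim=0)
    kurt = (centered / std).pow(4).mean(dim=0)
    return torch.cat([mean, var, skew, kurt])

moments = embedding_moments(torch.randn(20, 128))   # -> shape (512,)
```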
+
+ comment: IROS 2023 +
+
+
+
+
+ + ☆ Multi-modal Learning based Prediction for Disease + + +
+ Non-alcoholic fatty liver disease (NAFLD) is the most common cause of
+chronic liver disease, and accurate prediction can help prevent advanced
+fibrosis and cirrhosis. However, a liver biopsy, the gold standard for NAFLD
+diagnosis, is invasive, expensive, and prone to sampling errors. Therefore,
+non-invasive studies are extremely promising, yet they are still in their
+infancy due to the lack of comprehensive research data and intelligent
+methods for multi-modal data. This paper proposes a NAFLD diagnosis system
+(DeepFLDDiag) combining a comprehensive clinical dataset (FLDData) and a
+multi-modal learning based NAFLD prediction method (DeepFLD). The dataset
+includes over 6,000 participants' physical examinations, laboratory and
+imaging studies, extensive questionnaires, and facial images for a subset of
+participants, and is comprehensive and valuable for clinical studies. From
+the dataset, we quantitatively analyze and select the clinical metadata that
+contribute most to NAFLD prediction. Furthermore, the proposed DeepFLD, a
+deep neural network model designed to predict NAFLD from multi-modal input,
+including metadata and facial images, outperforms the approach that uses only
+metadata. Satisfactory performance is also verified on other unseen datasets.
+Encouragingly, DeepFLD can achieve competitive results using only facial
+images as input rather than metadata, paving the way for a more robust and
+simpler non-invasive NAFLD diagnosis.
+
+
+
+
+
+ + ☆ A Siamese-based Verification System for Open-set Architecture + Attribution of Synthetic Images + + +
+ Despite the wide variety of methods developed for synthetic image
+attribution, most of them can only attribute images generated by models or
+architectures included in the training set and do not work with unknown
+architectures, hindering their applicability in real-world scenarios. In this
+paper, we propose a verification framework that relies on a Siamese Network
+to address the problem of open-set attribution of synthetic images to the
+architecture that generated them. We consider two different settings. In the
+first setting, the system determines whether two images have been produced by
+the same generative architecture or not. In the second setting, the system
+verifies a claim about the architecture used to generate a synthetic image,
+utilizing one or multiple reference images generated by the claimed
+architecture. The main strength of the proposed system is its ability to
+operate in both closed and open-set scenarios, so that the input images,
+whether query or reference images, may or may not belong to the architectures
+considered during training. Experimental evaluations encompassing various
+generative architectures such as GANs, diffusion models, and transformers,
+focusing on synthetic face image generation, confirm the excellent
+performance of our method in both closed and open-set settings, as well as
+its strong generalization capabilities.
+
+
+
+
+
+ + ☆ Hierarchical Semantic Perceptual Listener Head Video Generation: A + High-performance Pipeline ACM MM 2023 + + +
+ In dyadic speaker-listener interactions, the listener's head reactions,
+together with the speaker's head movements, constitute an important
+non-verbal semantic expression. The listener head generation task aims to
+synthesize responsive listener head videos based on the speaker's audio and
+reference images of the listener. Compared to talking-head generation, it is
+more challenging to capture the correlation cues from the speaker's audio and
+visual information. Following the ViCo baseline scheme, we propose a
+high-performance solution by enhancing the hierarchical semantic extraction
+capability of the audio encoder module and improving the decoder, renderer
+and post-processing modules. Our solution achieves first place on the
+official leaderboard for the listening head generation track. This paper is a
+technical report of the ViCo@2023 Conversational Head Generation Challenge at
+the ACM Multimedia 2023 conference.
+
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ Deep unrolling Shrinkage Network for Dynamic MR imaging + + +
+ Deep unrolling networks that utilize sparsity priors have achieved great +success in dynamic magnetic resonance (MR) imaging. The convolutional neural +network (CNN) is usually utilized to extract the transformed domain, and then +the soft thresholding (ST) operator is applied to the CNN-transformed data to +enforce the sparsity priors. However, the ST operator is usually constrained to +be the same across all channels of the CNN-transformed data. In this paper, we +propose a novel operator, called soft thresholding with channel attention +(AST), that learns the threshold for each channel. In particular, we put +forward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the +alternating direction method of multipliers (ADMM) for optimizing the +transformed $l_1$ norm dynamic MR reconstruction model. Experimental results on +an open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net +outperforms the state-of-the-art methods. The source code is available at +\url{https://github.com/yhao-z/DUS-Net}. + +
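+ For context, the sketch below shows soft thresholding with one learnable
+threshold per channel, in the spirit of the operator described above; it is a
+simplified illustration (module and parameter names are ours), not the
+released DUS-Net code.
+```python
+import torch
+import torch.nn as nn
+
+class ChannelSoftThreshold(nn.Module):
+    """Soft thresholding with a learnable threshold per channel, rather
+    than a single threshold shared across all channels."""
+    def __init__(self, num_channels):
+        super().__init__()
+        # softplus keeps the effective threshold positive
+        self.raw_tau = nn.Parameter(torch.zeros(num_channels))
+
+    def forward(self, x):
+        # x: (batch, channels, H, W) CNN-transformed coefficients
+        tau = nn.functional.softplus(self.raw_tau).view(1, -1, 1, 1)
+        return torch.sign(x) * torch.clamp(x.abs() - tau, min=0.0)
+
+coeffs = torch.randn(2, 8, 32, 32)
+sparse = ChannelSoftThreshold(num_channels=8)(coeffs)
+```
+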
+
+ comment: 5 pages, 3 figures, 2 tables
+
+
+
+
+
+ + ☆ LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network + + +
+ Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent
+blur is a challenging task. Existing blur map-based deblurring methods have
+demonstrated promising results. In this paper, we propose, to the best of our
+knowledge, the first framework to introduce the contrastive language-image
+pre-training framework (CLIP) to achieve accurate blur map estimation from DP
+pairs in an unsupervised manner. To this end, we first carefully design text
+prompts to enable CLIP to understand blur-related geometric prior knowledge
+from the DP pair. Then, we propose a format for feeding the stereo DP pair to
+CLIP without any fine-tuning, even though CLIP is pre-trained on monocular
+images. Given the estimated blur map, we introduce a blur-prior attention
+block, a blur-weighting loss and a blur-aware loss to recover the
+all-in-focus image. Our method achieves state-of-the-art performance in
+extensive experiments.
+
+
+
+
+
+ + ☆ GenKL: An Iterative Framework for Resolving Label Ambiguity and Label + Non-conformity in Web Images Via a New Generalized KL Divergence + + +
+ Web image datasets curated online inherently contain ambiguous +in-distribution (ID) instances and out-of-distribution (OOD) instances, which +we collectively call non-conforming (NC) instances. In many recent approaches +for mitigating the negative effects of NC instances, the core implicit +assumption is that the NC instances can be found via entropy maximization. For +"entropy" to be well-defined, we are interpreting the output prediction vector +of an instance as the parameter vector of a multinomial random variable, with +respect to some trained model with a softmax output layer. Hence, entropy +maximization is based on the idealized assumption that NC instances have +predictions that are "almost" uniformly distributed. However, in real-world web +image datasets, there are numerous NC instances whose predictions are far from +being uniformly distributed. To tackle the limitation of entropy maximization, +we propose $(\alpha, \beta)$-generalized KL divergence, +$\mathcal{D}_{\text{KL}}^{\alpha, \beta}(p\|q)$, which can be used to identify +significantly more NC instances. Theoretical properties of +$\mathcal{D}_{\text{KL}}^{\alpha, \beta}(p\|q)$ are proven, and we also show +empirically that a simple use of $\mathcal{D}_{\text{KL}}^{\alpha, +\beta}(p\|q)$ outperforms all baselines on the NC instance identification task. +Building upon $(\alpha,\beta)$-generalized KL divergence, we also introduce a +new iterative training framework, GenKL, that identifies and relabels NC +instances. When evaluated on three web image datasets, Clothing1M, +Food101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art +classification accuracies: $81.34\%$, $85.73\%$ and $78.99\%$/$92.54\%$ +(top-1/top-5), respectively. + +
+
+ comment: Published (with open access) at International Journal of Computer + Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at: + https://github.com/codetopaper/GenKL +
+
+
+
+
+ + ☆ Fix your downsampling ASAP! Be natively more robust via Aliasing and + Spectral Artifact free Pooling + + +
+ Convolutional neural networks encode images through a sequence of
+convolutions, normalizations and non-linearities as well as downsampling
+operations into potentially strong semantic embeddings. Yet, previous work
+showed that even slight mistakes during sampling, leading to aliasing, can be
+directly attributed to the networks' lack of robustness. To address such
+issues and facilitate simpler and faster adversarial training, [12] recently
+proposed FLC pooling, a method for provably alias-free downsampling - in
+theory. In this work, we conduct a further analysis through the lens of
+signal processing and find that such current pooling methods, which address
+aliasing in the frequency domain, are still prone to spectral leakage
+artifacts. Hence, we propose aliasing and spectral artifact-free pooling,
+short ASAP. While only introducing a few modifications to FLC pooling,
+networks using ASAP as their downsampling method exhibit higher native
+robustness against common corruptions, a property that FLC pooling lacked.
+ASAP also increases native robustness against adversarial attacks on high-
+and low-resolution data while maintaining similar clean accuracy or even
+outperforming the baseline.
+
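+ As background only (the precise FLC and ASAP formulations are in the
+respective papers), downsampling in the frequency domain can be sketched as
+keeping the central, low-frequency part of the Fourier spectrum; the function
+below is an illustrative simplification without the anti-leakage measures
+ASAP adds.
+```python
+import torch
+
+def fourier_lowpass_downsample(x):
+    """Downsample a (B, C, H, W) feature map by 2x by cropping the
+    low-frequency centre of its Fourier spectrum."""
+    b, c, h, w = x.shape
+    spec = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
+    ch, cw = h // 2, w // 2
+    crop = spec[..., ch - h // 4:ch + h // 4, cw - w // 4:cw + w // 4]
+    out = torch.fft.ifft2(torch.fft.ifftshift(crop, dim=(-2, -1)))
+    return out.real / 4.0                 # rough rescaling for the smaller grid
+
+feat = torch.randn(1, 16, 32, 32)
+print(fourier_lowpass_downsample(feat).shape)   # torch.Size([1, 16, 16, 16])
+```
+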
+
+
+
+
+ + ☆ From West to East: Who can understand the music of the others better? + + +
+ Recent developments in MIR have led to several benchmark deep learning
+models whose embeddings can be used for a variety of downstream tasks. At the
+same time, the vast majority of these models have been trained on Western
+pop/rock music and related styles. This leads to research questions on
+whether these models can be used to learn representations for different music
+cultures and styles, or whether we can build similar music audio embedding
+models trained on data from different cultures or styles. To that end, we
+leverage transfer learning methods to derive insights about the similarities
+between the different music cultures to which the data belong. We use two
+Western music datasets, two traditional/folk datasets coming from eastern
+Mediterranean cultures, and two datasets belonging to Indian art music. Three
+deep audio embedding models are trained and transferred across domains,
+including two CNN-based and a Transformer-based architecture, to perform
+auto-tagging for each target-domain dataset. Experimental results show that
+competitive performance is achieved in all domains via transfer learning,
+while the best source dataset varies for each music culture. The
+implementation and the trained models are both provided in a public
+repository.
+
+
+
+
+
+ + ☆ DiffDP: Radiotherapy Dose Prediction via a Diffusion Model MICCAI 2023 + + +
+ Currently, deep learning (DL) has achieved the automatic prediction of dose
+distribution in radiotherapy planning, enhancing its efficiency and quality.
+However, existing methods suffer from the over-smoothing problem due to their
+commonly used L_1 or L_2 losses with posterior average calculations. To
+alleviate this limitation, we innovatively introduce a diffusion-based dose
+prediction (DiffDP) model for predicting the radiotherapy dose distribution
+of cancer patients. Specifically, the DiffDP model contains a forward process
+and a reverse process. In the forward process, DiffDP gradually transforms
+dose distribution maps into Gaussian noise by adding small amounts of noise
+and trains a noise predictor to predict the noise added at each timestep. In
+the reverse process, it removes the noise from the original Gaussian noise in
+multiple steps with the well-trained noise predictor and finally outputs the
+predicted dose distribution map. To ensure the accuracy of the prediction, we
+further design a structure encoder to extract anatomical information from
+patient anatomy images and enable the noise predictor to be aware of the dose
+constraints within several essential organs, i.e., the planning target volume
+and organs at risk. Extensive experiments on an in-house dataset with 130
+rectum cancer patients demonstrate the superiority of the proposed method.
+
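+ For readers unfamiliar with the forward process mentioned above, it follows
+the standard denoising-diffusion recipe; the sketch below is generic (not
+DiffDP's code) and shows how a dose map would be noised and what the noise
+predictor is trained to output.
+```python
+import torch
+
+def forward_diffuse(x0, t, alphas_cumprod):
+    """Standard DDPM forward step:
+    x_t = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps."""
+    eps = torch.randn_like(x0)
+    a_bar = alphas_cumprod[t].view(-1, 1, 1, 1)
+    x_t = a_bar.sqrt() * x0 + (1.0 - a_bar).sqrt() * eps
+    return x_t, eps                       # eps is the noise predictor's target
+
+T = 1000
+betas = torch.linspace(1e-4, 0.02, T)
+alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
+dose_maps = torch.rand(4, 1, 64, 64)      # placeholder dose distributions
+t = torch.randint(0, T, (4,))
+x_t, eps = forward_diffuse(dose_maps, t, alphas_cumprod)
+# training would minimise e.g. MSE(noise_predictor(x_t, t, anatomy), eps)
+```
+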
+
+ comment: to be published in MICCAI 2023 +
+
+
+
+
+ + ☆ Density-invariant Features for Distant Point Cloud Registration ICCV + + +
+ Registration of distant outdoor LiDAR point clouds is crucial to extending
+the 3D vision of collaborative autonomous vehicles, and yet is challenging
+due to the small overlapping area and the huge disparity between observed
+point densities. In this paper, we propose a Group-wise Contrastive Learning
+(GCL) scheme to extract density-invariant geometric features to register
+distant outdoor LiDAR point clouds. We show through theoretical analysis and
+experiments that contrastive positives should be independent and identically
+distributed (i.i.d.) in order to train density-invariant feature extractors.
+Based on this conclusion, we propose a simple yet effective training scheme
+that forces the features of multiple point clouds at the same spatial
+location (referred to as positive groups) to be similar, which naturally
+avoids the sampling bias introduced by using a pair of point clouds and thus
+conforms to the i.i.d. principle. The resulting fully-convolutional feature
+extractor is more powerful and density-invariant than state-of-the-art
+methods, improving the registration recall of distant scenarios on the KITTI
+and nuScenes benchmarks by 40.9% and 26.9%, respectively. The code will be
+open-sourced.
+
+
+ comment: In Proceedings of the IEEE/CVF International Conference on Computer + Vision (ICCV), 2023 +
+
+
+
+
+ + ☆ DVPT: Dynamic Visual Prompt Tuning of Large Pre-trained Models for + Medical Image Analysis + + +
+ Limited labeled data makes it hard to train models from scratch in the
+medical domain, and an important paradigm is pre-training and then
+fine-tuning. Large pre-trained models contain rich representations, which can
+be adapted to downstream medical tasks. However, existing methods either tune
+all the parameters or only the task-specific layers of the pre-trained
+models, ignoring the input variations of medical images, and thus they are
+not efficient or effective. In this work, we aim to study parameter-efficient
+fine-tuning (PEFT) for medical image analysis, and propose a dynamic visual
+prompt tuning method, named DVPT. It can extract knowledge beneficial to
+downstream tasks from large models with only a few trainable parameters.
+Firstly, the frozen features are transformed by a lightweight bottleneck
+layer to learn the domain-specific distribution of downstream medical tasks;
+then a few learnable visual prompts are used as dynamic queries that conduct
+cross-attention with the transformed features, attempting to acquire
+sample-specific knowledge suitable for each sample. Finally, the features are
+projected back to the original feature dimension and aggregated with the
+frozen features. This DVPT module can be shared between different Transformer
+layers, further reducing the trainable parameters. To validate DVPT, we
+conduct extensive experiments with different pre-trained models on medical
+classification and segmentation tasks. We find that such a PEFT method can
+not only efficiently adapt the pre-trained models to the medical domain, but
+also brings data efficiency with partially labeled data. For example, with
+0.5% extra trainable parameters, our method not only outperforms
+state-of-the-art PEFT methods, but even surpasses full fine-tuning by more
+than 2.20% in Kappa score on the medical classification task. It can save up
+to 60% of labeled data and 99% of the storage cost of ViT-B/16.
+
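+ A rough sketch of the kind of module described above: a few learnable
+prompt tokens act as queries and cross-attend to frozen backbone features
+passed through a bottleneck. Dimensions, names and the aggregation step are
+illustrative assumptions, not the authors' implementation.
+```python
+import torch
+import torch.nn as nn
+
+class PromptCrossAttention(nn.Module):
+    """Learnable visual prompts as queries; frozen features as keys/values."""
+    def __init__(self, dim=768, num_prompts=8, num_heads=8):
+        super().__init__()
+        self.prompts = nn.Parameter(torch.randn(num_prompts, dim) * 0.02)
+        self.bottleneck = nn.Sequential(nn.Linear(dim, dim // 4), nn.GELU(),
+                                        nn.Linear(dim // 4, dim))
+        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+
+    def forward(self, frozen_feats):
+        # frozen_feats: (batch, tokens, dim) from a frozen pre-trained backbone
+        feats = self.bottleneck(frozen_feats)
+        queries = self.prompts.unsqueeze(0).expand(feats.size(0), -1, -1)
+        attended, _ = self.attn(queries, feats, feats)
+        # aggregate prompt outputs with the frozen features (residual style)
+        return frozen_feats + attended.mean(dim=1, keepdim=True)
+
+tokens = torch.randn(2, 197, 768)          # e.g. ViT-B/16 patch tokens
+out = PromptCrossAttention()(tokens)
+```
+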
+
+
+
+
+ + ☆ Text2Layer: Layered Image Generation using Latent Diffusion Model + + +
+ Layer compositing is one of the most popular image editing workflows among +both amateurs and professionals. Motivated by the success of diffusion models, +we explore layer compositing from a layered image generation perspective. +Instead of generating an image, we propose to generate background, foreground, +layer mask, and the composed image simultaneously. To achieve layered image +generation, we train an autoencoder that is able to reconstruct layered images +and train diffusion models on the latent representation. One benefit of the +proposed problem is to enable better compositing workflows in addition to the +high-quality image output. Another benefit is producing higher-quality layer +masks compared to masks produced by a separate step of image segmentation. +Experimental results show that the proposed method is able to generate +high-quality layered images and initiates a benchmark for future work. + +
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Source-Free Domain Adaptation for Medical Image Segmentation via + Prototype-Anchored Feature Alignment and Contrastive Learning MICCAI23 + + +
+ Unsupervised domain adaptation (UDA) has increasingly gained interest for
+its capacity to transfer the knowledge learned from a labeled source domain
+to an unlabeled target domain. However, typical UDA methods require
+concurrent access to both the source and target domain data, which largely
+limits their application in medical scenarios where source data are often
+unavailable due to privacy concerns. To tackle the absence of source data, we
+present a novel two-stage source-free domain adaptation (SFDA) framework for
+medical image segmentation, where only a well-trained source segmentation
+model and unlabeled target data are available during domain adaptation.
+Specifically, in the prototype-anchored feature alignment stage, we first
+utilize the weights of the pre-trained pixel-wise classifier as source
+prototypes, which preserve the information of source features. Then, we
+introduce bi-directional transport to align the target features with the
+class prototypes by minimizing its expected cost. On top of that, a
+contrastive learning stage is further devised to utilize those pixels with
+unreliable predictions for a more compact target feature distribution.
+Extensive experiments on a cross-modality medical segmentation task
+demonstrate the superiority of our method in large domain discrepancy
+settings compared with state-of-the-art SFDA approaches and even some UDA
+methods. Code is available at
+https://github.com/CSCYQJ/MICCAI23-ProtoContra-SFDA.
+
+
+ comment: Accepted by MICCAI23 +
+
+
+
+
+ + ☆ Towards Building More Robust Models with Frequency Bias ICCV23 + + +
+ The vulnerability of deep neural networks to adversarial samples has been a +major impediment to their broad applications, despite their success in various +fields. Recently, some works suggested that adversarially-trained models +emphasize the importance of low-frequency information to achieve higher +robustness. While several attempts have been made to leverage this frequency +characteristic, they have all faced the issue that applying low-pass filters +directly to input images leads to irreversible loss of discriminative +information and poor generalizability to datasets with distinct frequency +features. This paper presents a plug-and-play module called the Frequency +Preference Control Module that adaptively reconfigures the low- and +high-frequency components of intermediate feature representations, providing +better utilization of frequency in robust learning. Empirical studies show that +our proposed module can be easily incorporated into any adversarial training +framework, further improving model robustness across different architectures +and datasets. Additionally, experiments were conducted to examine how the +frequency bias of robust models impacts the adversarial training process and +its final robustness, revealing interesting insights. + +
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ☆ Longitudinal Data and a Semantic Similarity Reward for Chest X-Ray + Report Generation + + +
+ Chest X-Ray (CXR) report generation is a promising approach to improving
+the efficiency of CXR interpretation. However, a significant increase in
+diagnostic accuracy is required before that can be realised. Motivated by
+this, we propose a framework that is more in line with a radiologist's
+workflow by considering longitudinal data. Here, the decoder is additionally
+conditioned on the report from the subject's previous imaging study via a
+prompt. We also propose a new reward for reinforcement learning based on
+CXR-BERT, which computes the similarity between reports. We conduct
+experiments on the MIMIC-CXR dataset. The results indicate that longitudinal
+data improves CXR report generation. CXR-BERT is also shown to be a promising
+alternative to the current state-of-the-art reward based on RadGraph. This
+investigation indicates that longitudinal CXR report generation can offer a
+substantial increase in diagnostic accuracy. Our Hugging Face model is
+available at: https://huggingface.co/aehrc/cxrmate and code is available at:
+https://github.com/aehrc/cxrmate.
+
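+ As an illustration of the reward's shape (not the exact CXR-BERT
+formulation), a semantic-similarity reward can be computed as the cosine
+similarity between embeddings of the generated and reference reports; the
+embedding function below is a placeholder standing in for a text encoder such
+as CXR-BERT.
+```python
+import torch
+import torch.nn.functional as F
+
+def similarity_reward(generated_emb, reference_emb):
+    """RL reward: cosine similarity between report embeddings."""
+    return F.cosine_similarity(generated_emb, reference_emb, dim=-1)
+
+def embed_report(text):
+    """Placeholder embedding; a real system would use a report encoder."""
+    torch.manual_seed(abs(hash(text)) % (2 ** 31))
+    return torch.randn(768)
+
+reward = similarity_reward(embed_report("no acute cardiopulmonary process"),
+                           embed_report("lungs are clear, no acute findings"))
+```
+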
+
+
+
+
+ + ☆ Generative Prompt Model for Weakly Supervised Object Localization + + +
+ Weakly supervised object localization (WSOL) remains challenging when
+learning object localization models from image category labels. Conventional
+methods that discriminatively train activation models ignore representative
+yet less discriminative object parts. In this study, we propose a generative
+prompt model (GenPromp), defining the first generative pipeline to localize
+less discriminative object parts by formulating WSOL as a conditional image
+denoising procedure. During training, GenPromp converts image category labels
+to learnable prompt embeddings, which are fed to a generative model to
+conditionally recover the input image with noise and learn representative
+embeddings. During inference, GenPromp combines the representative embeddings
+with discriminative embeddings (queried from an off-the-shelf vision-language
+model) for both representative and discriminative capacity. The combined
+embeddings are finally used to generate multi-scale high-quality attention
+maps, which facilitate localizing the full object extent. Experiments on
+CUB-200-2011 and ILSVRC show that GenPromp respectively outperforms the best
+discriminative models by 5.2% and 5.6% (Top-1 Loc), setting a solid baseline
+for WSOL with the generative model. Code is available at
+https://github.com/callsys/GenPromp.
+
+
+
+
+
+ + ☆ Space Engage: Collaborative Space Supervision for Contrastive-based + Semi-Supervised Semantic Segmentation ICCV 2023 + + +
+ Semi-Supervised Semantic Segmentation (S4) aims to train a segmentation
+model with limited labeled images and a substantial volume of unlabeled
+images. To improve the robustness of representations, powerful methods
+introduce a pixel-wise contrastive learning approach in latent space (i.e.,
+representation space) that aggregates the representations to their prototypes
+in a fully supervised manner. However, previous contrastive-based S4 methods
+merely rely on the supervision from the model's output (logits) in logit
+space during unlabeled training. In contrast, we utilize the outputs in both
+logit space and representation space to obtain supervision in a collaborative
+way. The supervision from the two spaces plays two roles: 1) it reduces the
+risk of over-fitting to incorrect semantic information in logits with the
+help of representations; 2) it enhances the knowledge exchange between the
+two spaces. Furthermore, unlike previous approaches, we use the similarity
+between representations and prototypes as a new indicator to tilt training
+toward under-performing representations and achieve a more efficient
+contrastive learning process. Results on two public benchmarks demonstrate
+the competitive performance of our method compared with state-of-the-art
+methods.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Towards Robust Scene Text Image Super-resolution via Explicit Location + Enhancement IJCAI2023 + + +
+ Scene text image super-resolution (STISR), aiming to improve image quality +while boosting downstream scene text recognition accuracy, has recently +achieved great success. However, most existing methods treat the foreground +(character regions) and background (non-character regions) equally in the +forward process, and neglect the disturbance from the complex background, thus +limiting the performance. To address these issues, in this paper, we propose a +novel method LEMMA that explicitly models character regions to produce +high-level text-specific guidance for super-resolution. To model the location +of characters effectively, we propose the location enhancement module to +extract character region features based on the attention map sequence. Besides, +we propose the multi-modal alignment module to perform bidirectional +visual-semantic alignment to generate high-quality prior guidance, which is +then incorporated into the super-resolution branch in an adaptive manner using +the proposed adaptive fusion module. Experiments on TextZoom and four scene +text recognition benchmarks demonstrate the superiority of our method over +other state-of-the-art methods. Code is available at +https://github.com/csguoh/LEMMA. + +
+
+ comment: Accepted as IJCAI2023 paper +
+
+
+
+
+ + ☆ Watch out Venomous Snake Species: A Solution to SnakeCLEF2023 + + +
+ The SnakeCLEF2023 competition aims at the development of advanced
+algorithms for snake species identification through the analysis of images
+and accompanying metadata. This paper presents a method leveraging both
+images and metadata. Modern CNN models and strong data augmentation are
+utilized to learn better image representations. To address the challenge of
+the long-tailed distribution, the seesaw loss is utilized in our method. We
+also design a light model to calculate prior probabilities using metadata
+features extracted by CLIP in the post-processing stage. Besides, we attach
+more importance to venomous species by assigning venomous species labels to
+some examples that the model is uncertain about. Our method achieves a score
+of 91.31% on the final metric, which combines F1 with other metrics, on the
+private leaderboard, ranking 1st among the participants. The code is
+available at https://github.com/xiaoxsparraw/CLEF2023.
+
+
+ comment: This work was the winner solution of the SnakeCLEF2023 challenge +
+
+
+
+
+ + ☆ Improved Distribution Matching for Dataset Condensation CVPR2023 + + +
+ Dataset Condensation aims to condense a large dataset into a smaller one +while maintaining its ability to train a well-performing model, thus reducing +the storage cost and training effort in deep learning applications. However, +conventional dataset condensation methods are optimization-oriented and +condense the dataset by performing gradient or parameter matching during model +optimization, which is computationally intensive even on small datasets and +models. In this paper, we propose a novel dataset condensation method based on +distribution matching, which is more efficient and promising. Specifically, we +identify two important shortcomings of naive distribution matching (i.e., +imbalanced feature numbers and unvalidated embeddings for distance computation) +and address them with three novel techniques (i.e., partitioning and expansion +augmentation, efficient and enriched model sampling, and class-aware +distribution regularization). Our simple yet effective method outperforms most +previous optimization-oriented methods with much fewer computational resources, +thereby scaling data condensation to larger datasets and models. Extensive +experiments demonstrate the effectiveness of our method. Codes are available at +https://github.com/uitrbn/IDM + +
+
+ comment: CVPR2023 +
+
+
+
+
+ + ☆ ClickSeg: 3D Instance Segmentation with Click-Level Weak Annotations + + +
+ 3D instance segmentation methods often require fully-annotated dense labels
+for training, which are costly to obtain. In this paper, we present ClickSeg,
+a novel click-level weakly supervised 3D instance segmentation method that
+requires merely one annotated point per instance. Such a problem is very
+challenging due to the extremely limited labels, and it has rarely been
+addressed before. We first develop a baseline weakly-supervised training
+method, which generates pseudo labels for unlabeled data by the model itself.
+To utilize the property of the click-level annotation setting, we further
+propose a new training framework. Instead of directly using the model's
+inference procedure, i.e., mean-shift clustering, to generate the pseudo
+labels, we propose to use k-means with fixed initial seeds: the annotated
+points. New similarity metrics are further designed for clustering.
+Experiments on the ScanNetV2 and S3DIS datasets show that the proposed
+ClickSeg surpasses the previous best weakly supervised instance segmentation
+result by a large margin (e.g., +9.4% mAP on ScanNetV2). Using merely 0.02%
+of the supervision signals, ClickSeg achieves $\sim$90% of the accuracy of
+the fully-supervised counterpart. Meanwhile, it also achieves
+state-of-the-art semantic segmentation results among weakly supervised
+methods that use the same annotation settings.
+
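+ The clustering idea above is easy to picture: k-means is initialised with
+the clicked (annotated) points as fixed seeds instead of random centroids.
+The sketch below uses plain Euclidean features for illustration; the paper's
+similarity metrics and features are more elaborate.
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(0)
+points = rng.normal(size=(1000, 6))        # per-point features of a scene
+clicked = points[[10, 400, 900]]           # one annotated point per instance
+
+# the clicked points act as fixed initial seeds; n_init=1 keeps them as given
+kmeans = KMeans(n_clusters=len(clicked), init=clicked, n_init=1).fit(points)
+pseudo_labels = kmeans.labels_             # click-derived pseudo instance labels
+```
+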
+
+
+
+
+ + ☆ NTIRE 2023 Quality Assessment of Video Enhancement Challenge + + +
+ This paper reports on the NTIRE 2023 Quality Assessment of Video
+Enhancement Challenge, held in conjunction with the New Trends in Image
+Restoration and Enhancement Workshop (NTIRE) at CVPR 2023. The challenge
+addresses a major problem in the field of video processing, namely video
+quality assessment (VQA) for enhanced videos. The challenge uses the VQA
+Dataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211
+enhanced videos, including 600 videos with color, brightness, and contrast
+enhancements, 310 videos with deblurring, and 301 deshaked videos. The
+challenge attracted a total of 167 registered participants. 61 participating
+teams submitted their prediction results during the development phase, with a
+total of 3168 submissions. A total of 176 submissions were submitted by 37
+participating teams during the final testing phase. Finally, 19 participating
+teams submitted their models and fact sheets, and detailed the methods they
+used. Some methods achieved better results than the baseline methods, and the
+winning methods demonstrated superior prediction performance.
+
+
+
+
+
+ + ☆ Uncertainty-Driven Multi-Scale Feature Fusion Network for Real-time + Image Deraining + + +
+ Visual-based measurement systems are frequently affected by rainy weather due +to the degradation caused by rain streaks in captured images, and existing +imaging devices struggle to address this issue in real-time. While most efforts +leverage deep networks for image deraining and have made progress, their large +parameter sizes hinder deployment on resource-constrained devices. +Additionally, these data-driven models often produce deterministic results, +without considering their inherent epistemic uncertainty, which can lead to +undesired reconstruction errors. Well-calibrated uncertainty can help alleviate +prediction errors and assist measurement devices in mitigating risks and +improving usability. Therefore, we propose an Uncertainty-Driven Multi-Scale +Feature Fusion Network (UMFFNet) that learns the probability mapping +distribution between paired images to estimate uncertainty. Specifically, we +introduce an uncertainty feature fusion block (UFFB) that utilizes uncertainty +information to dynamically enhance acquired features and focus on blurry +regions obscured by rain streaks, reducing prediction errors. In addition, to +further boost the performance of UMFFNet, we fused feature information from +multiple scales to guide the network for efficient collaborative rain removal. +Extensive experiments demonstrate that UMFFNet achieves significant performance +improvements with few parameters, surpassing state-of-the-art image deraining +methods. + +
+
+
+
+
+ + ☆ SAMConvex: Fast Discrete Optimization for CT Registration using + Self-supervised Anatomical Embedding and Correlation Pyramid + + +
+ Estimating the displacement vector field via a cost volume computed in the
+feature space has shown great success in image registration, but it suffers
+from excessive computational burdens. Moreover, existing feature descriptors
+only extract local features that are incapable of representing the global
+semantic information, which is especially important for solving large
+transformations. To address these issues, we propose SAMConvex, a fast
+coarse-to-fine discrete optimization method for CT registration that includes
+a decoupled convex optimization procedure to obtain deformation fields based
+on a self-supervised anatomical embedding (SAM) feature extractor that
+captures both local and global information. To be specific, SAMConvex
+extracts per-voxel features and builds 6D correlation volumes based on SAM
+features, and iteratively updates a flow field by performing lookups on the
+correlation volumes with a coarse-to-fine scheme. SAMConvex outperforms the
+state-of-the-art learning-based methods and optimization-based methods over
+two inter-patient registration datasets (Abdomen CT and HeadNeck CT) and one
+intra-patient registration dataset (Lung CT). Moreover, as an
+optimization-based method, SAMConvex only takes $\sim2$s ($\sim5$s with
+instance optimization) for one pair of images.
+
+
+
+
+
+ + ☆ AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks ICCV 2023 + + +
+ To deliver the artistic expression of the target style, recent studies +exploit the attention mechanism owing to its ability to map the local patches +of the style image to the corresponding patches of the content image. However, +because of the low semantic correspondence between arbitrary content and +artworks, the attention module repeatedly abuses specific local patches from +the style image, resulting in disharmonious and evident repetitive artifacts. +To overcome this limitation and accomplish impeccable artistic style transfer, +we focus on enhancing the attention mechanism and capturing the rhythm of +patterns that organize the style. In this paper, we introduce a novel metric, +namely pattern repeatability, that quantifies the repetition of patterns in the +style image. Based on the pattern repeatability, we propose Aesthetic +Pattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot +of local and global style expressions. In addition, we propose a novel +self-supervisory task to encourage the attention mechanism to learn precise and +meaningful semantic correspondence. Lastly, we introduce the patch-wise style +loss to transfer the elaborate rhythm of local patterns. Through qualitative +and quantitative evaluations, we verify the reliability of the proposed pattern +repeatability that aligns with human perception, and demonstrate the +superiority of the proposed framework. + +
+
+ comment: Accepted by ICCV 2023. Code is available at
+ https://github.com/Kibeom-Hong/AesPA-Net
+
+
+
+
+
+ + ☆ Multi-Grained Multimodal Interaction Network for Entity Linking KDD 2023 + + +
+ The multimodal entity linking (MEL) task, which aims at resolving ambiguous
+mentions to a multimodal knowledge graph, has attracted wide attention in
+recent years. Although large efforts have been made to explore the
+complementary effect among multiple modalities, existing methods may fail to
+fully absorb the comprehensive expression of abbreviated textual context and
+implicit visual indication. Even worse, the inevitable noisy data may cause
+inconsistency between different modalities during the learning process, which
+severely degenerates the performance. To address the above issues, in this
+paper, we propose a novel Multi-GraIned Multimodal InteraCtion Network
+$\textbf{(MIMIC)}$ framework for solving the MEL task. Specifically, the
+unified inputs of mentions and entities are first encoded by textual/visual
+encoders separately, to extract global descriptive features and local
+detailed features. Then, to derive the similarity matching score for each
+mention-entity pair, we devise three interaction units to comprehensively
+explore the intra-modal interaction and inter-modal fusion among the features
+of entities and mentions. In particular, three modules, namely the Text-based
+Global-Local interaction Unit (TGLU), Vision-based DuaL interaction Unit
+(VDLU) and Cross-Modal Fusion-based interaction Unit (CMFU), are designed to
+capture and integrate the fine-grained representation lying in abbreviated
+text and implicit visual cues. Afterwards, we introduce a unit-consistency
+objective function via contrastive learning to avoid inconsistency and model
+degradation. Experimental results on three public benchmark datasets
+demonstrate that our solution outperforms various state-of-the-art baselines,
+and ablation studies verify the effectiveness of the designed modules.
+
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ☆ Semantic-Aware Dual Contrastive Learning for Multi-label Image + Classification ECAI 23 + + +
+ Extracting image semantics effectively and assigning corresponding labels
+to multiple objects or attributes for natural images is challenging due to
+the complex scene contents and confusing label dependencies. Recent works
+have focused on modeling label relationships with graphs and understanding
+object regions using class activation maps (CAM). However, these methods
+ignore the complex intra- and inter-category relationships among specific
+semantic features, and CAM is prone to generating noisy information. To this
+end, we propose a novel semantic-aware dual contrastive learning framework
+that incorporates sample-to-sample contrastive learning (SSCL) as well as
+prototype-to-sample contrastive learning (PSCL). Specifically, we leverage
+semantic-aware representation learning to extract category-related local
+discriminative features and construct category prototypes. Then, based on
+SSCL, label-level visual representations of the same category are aggregated
+together, and features belonging to distinct categories are separated.
+Meanwhile, we construct a novel PSCL module to narrow the distance between
+positive samples and category prototypes and push negative samples away from
+the corresponding category prototypes. Finally, the discriminative
+label-level features related to the image content are accurately captured by
+the joint training of the above three parts. Experiments on five challenging
+large-scale public datasets demonstrate that our proposed method is effective
+and outperforms the state-of-the-art methods. Code and supplementary
+materials are released on https://github.com/yu-gi-oh-leilei/SADCL.
+
+
+ comment: 8 pages, 6 figures, accepted by ECAI 23 +
+
+
+
+
+ + ☆ Towards Saner Deep Image Registration ICCV 2023 + + +
+ With recent advances in computing hardware and surges of deep-learning +architectures, learning-based deep image registration methods have surpassed +their traditional counterparts, in terms of metric performance and inference +time. However, these methods focus on improving performance measurements such +as Dice, resulting in less attention given to model behaviors that are equally +desirable for registrations, especially for medical imaging. This paper +investigates these behaviors for popular learning-based deep registrations +under a sanity-checking microscope. We find that most existing registrations +suffer from low inverse consistency and nondiscrimination of identical pairs +due to overly optimized image similarities. To rectify these behaviors, we +propose a novel regularization-based sanity-enforcer method that imposes two +sanity checks on the deep model to reduce its inverse consistency errors and +increase its discriminative power simultaneously. Moreover, we derive a set of +theoretical guarantees for our sanity-checked image registration method, with +experimental results supporting our theoretical findings and their +effectiveness in increasing the sanity of models without sacrificing any +performance. Our code and models are available at +\url{https://github.com/tuffr5/Saner-deep-registration}. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ GlobalMapper: Arbitrary-Shaped Urban Layout Generation ICCV 2023 + + +
+ Modeling and designing urban building layouts is of significant interest in
+computer vision, computer graphics, and urban applications. A building layout
+consists of a set of buildings in city blocks defined by a network of roads.
+We observe that building layouts are discrete structures, consisting of
+multiple rows of buildings of various shapes, and are amenable to
+skeletonization for mapping arbitrary city block shapes to a canonical form.
+Hence, we propose a fully automatic approach to building layout generation
+using graph attention networks. Our method generates realistic urban layouts
+given arbitrary road networks, and enables conditional generation based on
+learned priors. Our results, including a user study, demonstrate superior
+performance compared to prior layout generation networks, and support
+arbitrary city blocks and varying building shapes, as demonstrated by
+generating layouts for 28 large cities.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Eye Disease Classification Using Deep Learning Techniques + + +
+ The eye is the essential sense organ for vision. Because certain eye
+disorders can result in vision loss, it is essential to diagnose and treat
+eye diseases early on. By identifying common eye illnesses and performing an
+eye check, eye care providers can safeguard patients against vision loss or
+blindness. Convolutional neural networks (CNN) and transfer learning were
+employed in this study to discriminate between a normal eye and one with
+diabetic retinopathy, cataract, or glaucoma disease. Using transfer learning
+for multi-class classification, a high accuracy of 94% was achieved, while
+the traditional CNN achieved a rate of 84%.
+
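+ A minimal sketch of such a transfer-learning setup (the backbone choice and
+hyper-parameters here are illustrative assumptions, not taken from the
+paper): a pre-trained network is frozen and only a new four-class head is
+trained.
+```python
+import torch.nn as nn
+from torchvision import models
+
+# four classes: normal, diabetic retinopathy, cataract, glaucoma
+model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+for p in model.parameters():
+    p.requires_grad = False                    # freeze the pre-trained backbone
+model.fc = nn.Linear(model.fc.in_features, 4)  # new trainable classification head
+```
+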
+
+
+
+
+ + ☆ Mining Conditional Part Semantics with Occluded Extrapolation for + Human-Object Interaction Detection + + +
+ Human-Object Interaction Detection is a crucial aspect of human-centric scene +understanding, with important applications in various domains. Despite recent +progress in this field, recognizing subtle and detailed interactions remains +challenging. Existing methods try to use human-related clues to alleviate the +difficulty, but rely heavily on external annotations or knowledge, limiting +their practical applicability in real-world scenarios. In this work, we propose +a novel Part Semantic Network (PSN) to solve this problem. The core of PSN is a +Conditional Part Attention (CPA) mechanism, where human features are taken as +keys and values, and the object feature is used as query for the computation in +a cross-attention mechanism. In this way, our model learns to automatically +focus on the most informative human parts conditioned on the involved object, +generating more semantically meaningful features for interaction recognition. +Additionally, we propose an Occluded Part Extrapolation (OPE) strategy to +facilitate interaction recognition under occluded scenarios, which teaches the +model to extrapolate detailed features from partially occluded ones. Our method +consistently outperforms prior approaches on the V-COCO and HICO-DET datasets, +without external data or extra annotations. Additional ablation studies +validate the effectiveness of each component of our proposed method. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Novel Batch Active Learning Approach and Its Application to Synthetic + Aperture Radar Datasets + + +
+ Active learning improves the performance of machine learning methods by +judiciously selecting a limited number of unlabeled data points to query for +labels, with the aim of maximally improving the underlying classifier's +performance. Recent gains have been made using sequential active learning for +synthetic aperture radar (SAR) data arXiv:2204.00005. In each iteration, +sequential active learning selects a query set of size one while batch active +learning selects a query set of multiple datapoints. While batch active +learning methods exhibit greater efficiency, the challenge lies in maintaining +model accuracy relative to sequential active learning methods. We developed a +novel, two-part approach for batch active learning: Dijkstra's Annulus Core-Set +(DAC) for core-set generation and LocalMax for batch sampling. The batch active +learning process that combines DAC and LocalMax achieves nearly identical +accuracy as sequential active learning but is more efficient, proportional to +the batch size. As an application, a pipeline is built based on transfer +learning feature embedding, graph learning, DAC, and LocalMax to classify the +FUSAR-Ship and OpenSARShip datasets. Our pipeline outperforms the +state-of-the-art CNN-based methods. + +
+
+ comment: 16 pages, 7 figures, Preprint +
+
+
+
+
+ + ☆ Backdoor Attack against Object Detection with Clean Annotation + + +
+ Deep neural networks (DNNs) have shown unprecedented success in object
+detection tasks. However, it was also discovered that DNNs are vulnerable to
+multiple kinds of attacks, including backdoor attacks. Through such an
+attack, the attacker manages to embed a hidden backdoor into the DNN such
+that the model behaves normally on benign data samples, but makes
+attacker-specified judgments given the occurrence of a predefined trigger.
+Although numerous backdoor attacks have been demonstrated on image
+classification, backdoor attacks on object detection tasks have not been
+properly investigated and explored. As object detection has been adopted as
+an important module in multiple security-sensitive applications such as
+autonomous driving, backdoor attacks on object detection could pose even more
+severe threats. Inspired by the inherent property of deep learning-based
+object detectors, we propose a simple yet effective backdoor attack method
+against object detection without modifying the ground truth annotations,
+specifically focusing on the object disappearance attack and the object
+generation attack. Extensive experiments and ablation studies prove the
+effectiveness of our attack on two benchmark object detection datasets,
+PASCAL VOC07+12 and MSCOCO, on which we achieve an attack success rate of
+more than 92% with a poison rate of only 5%.
+
+
+
+
+
+ + ☆ Findings of Factify 2: Multimodal Fake News Detection AAAI 2023 + + +
+ With social media usage growing exponentially in the past few years, fake +news has also become extremely prevalent. The detrimental impact of fake news +emphasizes the need for research focused on automating the detection of false +information and verifying its accuracy. In this work, we present the outcome of +the Factify 2 shared task, which provides a multi-modal fact verification and +satire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data +calls for a comparison based approach to the task by pairing social media +claims with supporting documents, with both text and image, divided into 5 +classes based on multi-modal relations. In the second iteration of this task we +had over 60 participants and 9 final test-set submissions. The best +performances came from the use of DeBERTa for text and Swinv2 and CLIP for +image. The highest F1 score averaged for all five classes was 81.82%. + +
+
+ comment: Defactify2 @AAAI 2023 +
+
+
+
+
+ + ☆ Classification of Visualization Types and Perspectives in Patents + + +
+ Due to the swift growth of patent applications each year, information and +multimedia retrieval approaches that facilitate patent exploration and +retrieval are of utmost importance. Different types of visualizations (e.g., +graphs, technical drawings) and perspectives (e.g., side view, perspective) are +used to visualize details of innovations in patents. The classification of +these images enables a more efficient search and allows for further analysis. +So far, datasets for image type classification miss some important +visualization types for patents. Furthermore, related work does not make use of +recent deep learning approaches including transformers. In this paper, we adopt +state-of-the-art deep learning methods for the classification of visualization +types and perspectives in patent images. We extend the CLEF-IP dataset for +image type classification in patents to ten classes and provide manual ground +truth annotations. In addition, we derive a set of hierarchical classes from a +dataset that provides weakly-labeled data for image perspectives. Experimental +results have demonstrated the feasibility of the proposed approaches. Source +code, models, and dataset will be made publicly available. + +
+
+ comment: Accepted in International Conference on Theory and Practice of + Digital Libraries (TPDL) 2023 (They have the copyright to publish + camera-ready version of this work) +
+
+
+
+
+ + ☆ A Step Towards Worldwide Biodiversity Assessment: The BIOSCAN-1M Insect + Dataset + + +
+ In an effort to catalog insect biodiversity, we propose a new large dataset
+of hand-labelled insect images, the BIOSCAN-Insect Dataset. Each record is
+taxonomically classified by an expert, and also has associated genetic
+information including raw nucleotide barcode sequences and assigned barcode
+index numbers, which are genetically-based proxies for species
+classification. This paper presents a curated million-image dataset,
+primarily intended to train computer-vision models capable of providing
+image-based taxonomic assessment. However, the dataset also presents
+compelling characteristics, the study of which would be of interest to the
+broader machine learning community. Driven by the biological nature inherent
+to the dataset, it exhibits a characteristic long-tailed class-imbalance
+distribution. Furthermore, taxonomic labelling is a hierarchical
+classification scheme, presenting a highly fine-grained classification
+problem at lower levels. Beyond spurring interest in biodiversity research
+within the machine learning community, progress on creating an image-based
+taxonomic classifier will also further the ultimate goal of all BIOSCAN
+research: to lay the foundation for a comprehensive survey of global
+biodiversity. This paper introduces the dataset and explores the
+classification task through the implementation and analysis of a baseline
+classifier.
+
+
+
+
+
+ + ☆ Confidence Estimation Using Unlabeled Data ICLR'23 + + +
+ Overconfidence is a common issue for deep neural networks, limiting their
+deployment in real-world applications. To better estimate confidence,
+existing methods mostly focus on fully-supervised scenarios and rely on
+training labels. In this paper, we propose the first confidence estimation
+method for a semi-supervised setting, when most training labels are
+unavailable. We stipulate that even with limited training labels, we can
+still reasonably approximate the confidence of the model on unlabeled samples
+by inspecting the prediction consistency throughout the training process. We
+use training consistency as a surrogate function and propose a consistency
+ranking loss for confidence estimation. On both image classification and
+segmentation tasks, our method achieves state-of-the-art performance in
+confidence estimation. Furthermore, we show the benefit of the proposed
+method through a downstream active learning task. The code is available at
+https://github.com/TopoXLab/consistency-ranking-loss
+
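+ A rough sketch of using prediction consistency through training as a
+confidence surrogate, assuming the model's predicted class for each unlabeled
+sample has been stored at several checkpoints; the actual consistency ranking
+loss is defined in the paper.
+```python
+import numpy as np
+
+def consistency_score(pred_history):
+    """pred_history: (num_checkpoints, num_samples) array of predicted class
+    indices. Returns, per sample, the fraction of checkpoints that agree with
+    the final prediction -- a simple surrogate for confidence."""
+    final = pred_history[-1]
+    return (pred_history == final).mean(axis=0)
+
+history = np.array([[0, 1, 2, 2],
+                    [0, 2, 2, 1],
+                    [0, 2, 2, 2]])      # 3 checkpoints, 4 unlabeled samples
+print(consistency_score(history))       # [1. 0.67 1. 0.67] (approximately)
+```
+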
+
+ comment: Accepted by ICLR'23 +
+
+
+
+
+ + ☆ PreDiff: Precipitation Nowcasting with Latent Diffusion Models + + +
+ Earth system forecasting has traditionally relied on complex physical models +that are computationally expensive and require significant domain expertise. In +the past decade, the unprecedented increase in spatiotemporal Earth observation +data has enabled data-driven forecasting models using deep learning techniques. +These models have shown promise for diverse Earth system forecasting tasks but +either struggle with handling uncertainty or neglect domain-specific prior +knowledge, resulting in averaging possible futures to blurred forecasts or +generating physically implausible predictions. To address these limitations, we +propose a two-stage pipeline for probabilistic spatiotemporal forecasting: 1) +We develop PreDiff, a conditional latent diffusion model capable of +probabilistic forecasts. 2) We incorporate an explicit knowledge control +mechanism to align forecasts with domain-specific physical constraints. This is +achieved by estimating the deviation from imposed constraints at each denoising +step and adjusting the transition distribution accordingly. We conduct +empirical studies on two datasets: N-body MNIST, a synthetic dataset with +chaotic behavior, and SEVIR, a real-world precipitation nowcasting dataset. +Specifically, we impose the law of conservation of energy in N-body MNIST and +anticipated precipitation intensity in SEVIR. Experiments demonstrate the +effectiveness of PreDiff in handling uncertainty, incorporating domain-specific +prior knowledge, and generating forecasts that exhibit high operational +utility. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Explaining Autonomous Driving Actions with Visual Question Answering SC-2023 + + +
+ The end-to-end learning ability of self-driving vehicles has achieved +significant milestones over the last decade owing to rapid advances in deep +learning and computer vision algorithms. However, as autonomous driving +technology is a safety-critical application of artificial intelligence (AI), +road accidents and established regulatory principles necessitate the need for +the explainability of intelligent action choices for self-driving vehicles. To +facilitate interpretability of decision-making in autonomous driving, we +present a Visual Question Answering (VQA) framework, which explains driving +actions with question-answering-based causal reasoning. To do so, we first +collect driving videos in a simulation environment using reinforcement learning +(RL) and extract consecutive frames from this log data uniformly for five +selected action categories. Further, we manually annotate the extracted frames +using question-answer pairs as justifications for the actions chosen in each +scenario. Finally, we evaluate the correctness of the VQA-predicted answers for +actions on unseen driving scenes. The empirical results suggest that the VQA +mechanism can provide support to interpret real-time decisions of autonomous +vehicles and help enhance overall driving safety. + +
+
+ comment: Accepted to the 2023 IEEE International Conference on Intelligent + Transportation Systems (IEEE ITSC-2023) +
+
+
+
+
+ + ☆ Interpreting and Correcting Medical Image Classification with PIP-Net + + +
+ Part-prototype models are explainable-by-design image classifiers, and a +promising alternative to black box AI. This paper explores the applicability +and potential of interpretable machine learning, in particular PIP-Net, for +automated diagnosis support on real-world medical imaging data. PIP-Net learns +human-understandable prototypical image parts and we evaluate its accuracy and +interpretability for fracture detection and skin cancer diagnosis. We find that +PIP-Net's decision making process is in line with medical classification +standards, while only provided with image-level class labels. Because of +PIP-Net's unsupervised pretraining of prototypes, data quality problems such as +undesired text in an X-ray or labelling errors can be easily identified. +Additionally, we are the first to show that humans can manually correct the +reasoning of PIP-Net by directly disabling undesired prototypes. We conclude +that part-prototype models are promising for medical applications due to their +interpretability and potential for advanced model debugging. + +
+
+
+
+
+ + ☆ POV-Surgery: A Dataset for Egocentric Hand and Tool Pose Estimation + During Surgical Activities + + +
+ The surgical usage of Mixed Reality (MR) has received growing attention in +areas such as surgical navigation systems, skill assessment, and robot-assisted +surgeries. For such applications, pose estimation for hand and surgical +instruments from an egocentric perspective is a fundamental task and has been +studied extensively in the computer vision field in recent years. However, the +development of this field has been impeded by a lack of datasets, especially in +the surgical field, where bloody gloves and reflective metallic tools make it +hard to obtain 3D pose annotations for hands and objects using conventional +methods. To address this issue, we propose POV-Surgery, a large-scale, +synthetic, egocentric dataset focusing on pose estimation for hands with +different surgical gloves and three orthopedic surgical instruments, namely +scalpel, friem, and diskplacer. Our dataset consists of 53 sequences and 88,329 +frames, featuring high-resolution RGB-D video streams with activity +annotations, accurate 3D and 2D annotations for hand-object pose, and 2D +hand-object segmentation masks. We fine-tune the current SOTA methods on +POV-Surgery and further show the generalizability when applying to real-life +cases with surgical gloves and tools by extensive evaluations. The code and the +dataset are publicly available at batfacewayne.github.io/POV_Surgery_io/. + +
+
+
+
+
+ + ☆ TokenFlow: Consistent Diffusion Features for Consistent Video Editing + + +
+ The generative AI revolution has recently expanded to videos. Nevertheless, +current state-of-the-art video models are still lagging behind image models in +terms of visual quality and user control over the generated content. In this +work, we present a framework that harnesses the power of a text-to-image +diffusion model for the task of text-driven video editing. Specifically, given +a source video and a target text-prompt, our method generates a high-quality +video that adheres to the target text, while preserving the spatial layout and +motion of the input video. Our method is based on a key observation that +consistency in the edited video can be obtained by enforcing consistency in the +diffusion feature space. We achieve this by explicitly propagating diffusion +features based on inter-frame correspondences, readily available in the model. +Thus, our framework does not require any training or fine-tuning, and can work +in conjunction with any off-the-shelf text-to-image editing method. We +demonstrate state-of-the-art editing results on a variety of real-world videos. +Webpage: https://diffusion-tokenflow.github.io/ + +
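The core mechanism described above, propagating edited diffusion features to other frames via inter-frame correspondences, can be illustrated with a small sketch. This is an illustrative assumption of how such propagation might look with nearest-neighbour matches in feature space; the function name, shapes, and matching rule are not taken from the paper.

```python
import torch
import torch.nn.functional as F

def propagate_features(key_feats: torch.Tensor, frame_feats: torch.Tensor) -> torch.Tensor:
    """Replace each frame token's feature with its nearest key-frame token's (edited) feature.

    key_feats:   (K, C) edited diffusion features from a key frame
    frame_feats: (N, C) original features of the frame being edited
    """
    # Cosine-similarity correspondences between frame tokens and key-frame tokens.
    sim = F.normalize(frame_feats, dim=-1) @ F.normalize(key_feats, dim=-1).t()  # (N, K)
    nearest = sim.argmax(dim=-1)                                                  # (N,)
    return key_feats[nearest]                                                     # (N, C)

key = torch.randn(256, 64)     # e.g., tokens of an edited key frame
frame = torch.randn(1024, 64)  # tokens of another frame in the video
edited_frame_feats = propagate_features(key, frame)
```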
+
+
+
+
+ + ☆ Improving Multimodal Datasets with Image Captioning + + +
+ Massive web datasets play a key role in the success of large vision-language +models like CLIP and Flamingo. However, the raw web data is noisy, and existing +filtering methods to reduce noise often come at the expense of data diversity. +Our work focuses on caption quality as one major source of noise, and studies +how generated captions can increase the utility of web-scraped datapoints with +nondescript text. Through exploring different mixing strategies for raw and +generated captions, we outperform the best filtering method proposed by the +DataComp benchmark by 2% on ImageNet and 4% on average across 38 tasks, given a +candidate pool of 128M image-text pairs. Our best approach is also 2x better at +Flickr and MS-COCO retrieval. We then analyze what makes synthetic captions an +effective source of text supervision. In experimenting with different image +captioning models, we also demonstrate that the performance of a model on +standard image captioning benchmarks (e.g., NoCaps CIDEr) is not a reliable +indicator of the utility of the captions it generates for multimodal training. +Finally, our experiments with using generated captions at DataComp's large +scale (1.28B image-text pairs) offer insights into the limitations of synthetic +text, as well as the importance of image curation with increasing training data +quantity. + +
+
+
+
+
+ + ♻ ☆ Persistent Animal Identification Leveraging Non-Visual Markers + + +
+ Our objective is to locate and provide a unique identifier for each mouse in +a cluttered home-cage environment through time, as a precursor to automated +behaviour recognition for biological research. This is a very challenging +problem due to (i) the lack of distinguishing visual features for each mouse, +and (ii) the close confines of the scene with constant occlusion, making +standard visual tracking approaches unusable. However, a coarse estimate of +each mouse's location is available from a unique RFID implant, so there is the +potential to optimally combine information from (weak) tracking with coarse +information on identity. To achieve our objective, we make the following key +contributions: (a) the formulation of the object identification problem as an +assignment problem (solved using Integer Linear Programming), and (b) a novel +probabilistic model of the affinity between tracklets and RFID data. The latter +is a crucial part of the model, as it provides a principled probabilistic +treatment of object detections given coarse localisation. Our approach achieves +77% accuracy on this animal identification problem, and is able to reject +spurious detections when the animals are hidden. + +
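The identification step is cast as an assignment problem between visual tracklets and RFID identities. The paper formulates this as an Integer Linear Program; the sketch below uses the simpler Hungarian algorithm (SciPy's `linear_sum_assignment`) as a stand-in for one-to-one assignment, with a hypothetical affinity matrix in place of the paper's probabilistic tracklet/RFID model.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Hypothetical affinity matrix: affinity[i, j] ~ how well tracklet i agrees with
# the coarse location of RFID identity j (would come from the probabilistic model).
rng = np.random.default_rng(0)
affinity = rng.random((5, 5))

# One-to-one assignment maximising total log-affinity.
cost = -np.log(affinity + 1e-9)
tracklet_idx, rfid_idx = linear_sum_assignment(cost)

for t, r in zip(tracklet_idx, rfid_idx):
    print(f"tracklet {t} -> mouse (RFID) {r}, affinity {affinity[t, r]:.2f}")
```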
+
+
+
+
+ + ♻ ☆ Evaluation of Complexity Measures for Deep Learning Generalization in + Medical Image Analysis + + +
+ The generalization performance of deep learning models for medical image +analysis often decreases on images collected with different devices for data +acquisition, device settings, or patient population. A better understanding of +the generalization capacity on new images is crucial for clinicians' +trustworthiness in deep learning. Although significant research efforts have +been recently directed toward establishing generalization bounds and complexity +measures, still, there is often a significant discrepancy between the predicted +and actual generalization performance. As well, related large empirical studies +have been primarily based on validation with general-purpose image datasets. +This paper presents an empirical study that investigates the correlation +between 25 complexity measures and the generalization abilities of supervised +deep learning classifiers for breast ultrasound images. The results indicate +that PAC-Bayes flatness-based and path norm-based measures produce the most +consistent explanation for the combination of models and data. We also +investigate the use of multi-task classification and segmentation approach for +breast images, and report that such learning approach acts as an implicit +regularizer and is conducive toward improved generalization. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ IST-Net: Prior-free Category-level Pose Estimation with Implicit Space + Transformation ICCV2023 + + +
+ Category-level 6D pose estimation aims to predict the poses and sizes of +unseen objects from a specific category. Thanks to prior deformation, which +explicitly adapts a category-specific 3D prior (i.e., a 3D template) to a given +object instance, prior-based methods have attained great success and have become a +major research stream. However, obtaining category-specific priors requires +collecting a large number of 3D models, which is labor-intensive and often not +accessible in practice. This motivates us to investigate whether priors are +necessary to make prior-based methods effective. Our empirical study shows that +the 3D prior itself is not what accounts for the high performance. The key +is actually the explicit deformation process, which aligns camera and world +coordinates supervised by world-space 3D models (also called canonical space). +Inspired by these observations, we introduce a simple prior-free implicit space +transformation network, namely IST-Net, to transform camera-space features to +world-space counterparts and build correspondence between them in an implicit +manner without relying on 3D priors. Besides, we design camera- and world-space +enhancers to enrich the features with pose-sensitive information and +geometrical constraints, respectively. Albeit simple, IST-Net achieves +state-of-the-art performance with a prior-free design, and has the top inference +speed on the REAL275 benchmark. Our code and models are available at +https://github.com/CVMI-Lab/IST-Net. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Videos: A Survey + + +
+ The remarkable success of deep learning in various domains relies on the +availability of large-scale annotated datasets. However, obtaining annotations +is expensive and requires great effort, which is especially challenging for +videos. Moreover, the use of human-generated annotations leads to models with +biased learning and poor domain generalization and robustness. As an +alternative, self-supervised learning provides a way for representation +learning which does not require annotations and has shown promise in both image +and video domains. Different from the image domain, learning video +representations is more challenging due to the temporal dimension, bringing in +motion and other environmental dynamics. This also provides opportunities for +video-exclusive ideas that advance self-supervised learning in the video and +multimodal domain. In this survey, we provide a review of existing approaches +to self-supervised learning, focusing on the video domain. We summarize these +methods into four different categories based on their learning objectives: 1) +pretext tasks, 2) generative learning, 3) contrastive learning, and 4) +cross-modal agreement. We further introduce the commonly used datasets, +downstream evaluation tasks, insights into the limitations of existing works, +and the potential future directions in this area. + +
+
+ comment: ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q +
+
+
+
+
+ + ♻ ☆ CREPE: Learnable Prompting With CLIP Improves Visual Relationship + Prediction + + +
+ In this paper, we explore the potential of Vision-Language Models (VLMs), +specifically CLIP, in predicting visual object relationships, which involves +interpreting visual features from images into language-based relations. Current +state-of-the-art methods use complex graphical models that utilize language +cues and visual features to address this challenge. We hypothesize that the +strong language priors in CLIP embeddings can simplify these graphical models, +paving the way for a simpler approach. We adopt the UVTransE relation prediction +framework, which learns the relation as a translational embedding with subject, +object, and union box embeddings from a scene. We systematically explore the +design of CLIP-based subject, object, and union-box representations within the +UVTransE framework and propose CREPE (CLIP Representation Enhanced Predicate +Estimation). CREPE utilizes text-based representations for all three bounding +boxes and introduces a novel contrastive training strategy to automatically +infer the text prompt for the union box. Our approach achieves state-of-the-art +performance in predicate estimation, mR@5 27.79, and mR@20 31.95 on the Visual +Genome benchmark, achieving a 15.3\% gain in performance over recent +state-of-the-art at mR@20. This work demonstrates CLIP's effectiveness in +object relation prediction and encourages further research on VLMs in this +challenging domain. + +
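UVTransE's core idea is to treat the predicate as a translation in feature space, roughly predicate ≈ union − (subject + object), followed by a classifier. The sketch below illustrates that formulation only; the feature dimensions, classifier, and the CLIP-based representations used by CREPE are assumptions for illustration.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class UVTransEHead(nn.Module):
    """Toy UVTransE-style head: predicate embedding as a translation in feature space."""
    def __init__(self, feat_dim: int, num_predicates: int):
        super().__init__()
        self.classifier = nn.Linear(feat_dim, num_predicates)

    def forward(self, subj: torch.Tensor, obj: torch.Tensor, union: torch.Tensor) -> torch.Tensor:
        # Translational relation embedding: union - (subject + object).
        relation = union - (subj + obj)
        return self.classifier(relation)

head = UVTransEHead(feat_dim=512, num_predicates=50)
subj, obj, union = (torch.randn(8, 512) for _ in range(3))  # e.g., CLIP-based box features
probs = F.softmax(head(subj, obj, union), dim=-1)
```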
+
+
+
+
+ + ♻ ☆ iSLAM: Imperative SLAM + + +
+ Simultaneous localization and mapping (SLAM) stands as one of the critical +challenges in robot navigation. Recent advancements suggest that methods based +on supervised learning deliver impressive performance in front-end odometry, +while traditional optimization-based methods still play a vital role in the +back-end for minimizing estimation drift. In this paper, we find that such a +decoupled paradigm can lead to only sub-optimal performance, consequently +curtailing system capabilities and generalization potential. To solve this +problem, we propose a novel self-supervised learning framework, imperative +SLAM (iSLAM), which fosters reciprocal correction between the front-end and +back-end, thus enhancing performance without necessitating any external +supervision. Specifically, we formulate a SLAM system as a bi-level +optimization problem so that the two components are bidirectionally connected. +As a result, the front-end model is able to learn global geometric knowledge +obtained through pose graph optimization by back-propagating the residuals from +the back-end. This significantly improves the generalization ability of the +entire system and thus achieves accuracy improvements of up to 45%. To the best +of our knowledge, iSLAM is the first SLAM system showing that the front-end and +back-end can learn jointly and mutually contribute to each other in a +self-supervised manner. + +
+
+
+
+
+ + ♻ ☆ Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event + Localization + + +
+ Audio-Visual Event Localization (AVEL) is the task of temporally localizing +and classifying \emph{audio-visual events}, i.e., events simultaneously visible +and audible in a video. In this paper, we solve AVEL in a weakly-supervised +setting, where only video-level event labels (their presence/absence, but not +their locations in time) are available as supervision for training. Our idea is +to use a base model to estimate labels on the training data at a finer temporal +resolution than at the video level and re-train the model with these labels. +I.e., we determine the subset of labels for each \emph{slice} of frames in a +training video by (i) replacing the frames outside the slice with those from a +second video having no overlap in video-level labels, and (ii) feeding this +synthetic video into the base model to extract labels for just the slice in +question. To handle the out-of-distribution nature of our synthetic videos, we +propose an auxiliary objective for the base model that induces more reliable +predictions of the localized event labels as desired. Our three-stage pipeline +outperforms several existing AVEL methods with no architectural changes and +improves performance on a related weakly-supervised task as well. + +
+
+
+
+
+ + ♻ ☆ A comparative analysis of SRGAN models + + +
+ In this study, we evaluate the performance of multiple state-of-the-art SRGAN +(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN +and EDSR, on a benchmark dataset of real-world images which undergo degradation +using a pipeline. Our results show that some models seem to significantly +increase the resolution of the input images while preserving their visual +quality, as assessed using the Tesseract OCR engine. We observe that the EDSR-BASE +model from Hugging Face outperforms the remaining candidate models in terms of +both quantitative metrics and subjective visual quality assessments with the least +compute overhead. Specifically, EDSR generates images with higher peak +signal-to-noise ratio (PSNR) and structural similarity index (SSIM) values and +is seen to return high-quality OCR results with the Tesseract OCR engine. These +findings suggest that EDSR is a robust and effective approach for single-image +super-resolution and may be particularly well-suited for applications where +high-quality visual fidelity is critical and compute must be optimized. + +
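PSNR and SSIM, the two quantitative metrics cited above, are straightforward to compute with off-the-shelf tools. A minimal sketch, assuming a recent scikit-image (the `channel_axis` argument) and uint8 RGB arrays as inputs:

```python
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

def sr_quality(reference: np.ndarray, restored: np.ndarray) -> dict:
    """Compare a super-resolved image against its ground-truth reference (uint8 RGB)."""
    return {
        "psnr": peak_signal_noise_ratio(reference, restored, data_range=255),
        "ssim": structural_similarity(reference, restored, channel_axis=-1, data_range=255),
    }

rng = np.random.default_rng(0)
gt = rng.integers(0, 256, size=(64, 64, 3), dtype=np.uint8)
pred = np.clip(gt.astype(int) + rng.integers(-5, 6, gt.shape), 0, 255).astype(np.uint8)
print(sr_quality(gt, pred))
```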
+
+ comment: 9 pages, 6 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ Towards the Sparseness of Projection Head in Self-Supervised Learning + + +
+ In recent years, self-supervised learning (SSL) has emerged as a promising +approach for extracting valuable representations from unlabeled data. One +successful SSL method is contrastive learning, which aims to bring positive +examples closer while pushing negative examples apart. Many current contrastive +learning approaches utilize a parameterized projection head. Through a +combination of empirical analysis and theoretical investigation, we provide +insights into the internal mechanisms of the projection head and its +relationship with the phenomenon of dimensional collapse. Our findings +demonstrate that the projection head enhances the quality of representations by +applying the contrastive loss in a projected subspace. Therefore, we propose an +assumption that only a subset of features is necessary when minimizing the +contrastive loss of a mini-batch of data. Theoretical analysis further suggests +that a sparse projection head can enhance generalization, leading us to +introduce SparseHead - a regularization term that effectively constrains the +sparsity of the projection head, and can be seamlessly integrated with any +SSL approach. Our experimental results validate +the effectiveness of SparseHead, demonstrating its ability to improve the +performance of existing contrastive methods. + +
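To make the idea of a sparsity-constrained projection head concrete, here is a minimal sketch that adds a simple L1 penalty on the projection head's parameters to an InfoNCE-style contrastive loss. This is an illustrative assumption of what "regularizing the head towards sparsity" can look like; the actual SparseHead regularizer and its weighting may differ.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def info_nce(z1: torch.Tensor, z2: torch.Tensor, temperature: float = 0.5) -> torch.Tensor:
    """Simple InfoNCE over two augmented views; matching rows are positives."""
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    logits = z1 @ z2.t() / temperature
    targets = torch.arange(z1.size(0))
    return F.cross_entropy(logits, targets)

encoder = nn.Sequential(nn.Linear(128, 128), nn.ReLU())
projection_head = nn.Linear(128, 64)

x1, x2 = torch.randn(32, 128), torch.randn(32, 128)   # two views of the same batch
z1, z2 = projection_head(encoder(x1)), projection_head(encoder(x2))

# Hypothetical sparsity term: L1 penalty pushing the projection head towards sparsity.
sparsity = sum(p.abs().mean() for p in projection_head.parameters())
loss = info_nce(z1, z2) + 1e-3 * sparsity
loss.backward()
```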
+
+ comment: 9 pages,3 figures +
+
+
+
+
+ + ♻ ☆ M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models + and Latent Space Geometry Optimization MICCAI 2023 + + +
+ Medical vision-language models enable co-learning and integrating features +from medical imaging and clinical text. However, these models are not easy to +train and the latent representation space can be complex. Here we propose a +novel way for pre-training and regularising medical vision-language models. The +proposed method, named Medical vision-language pre-training with Frozen +language models and Latent spAce Geometry optimization (M-FLAG), leverages a +frozen language model for training stability and efficiency and introduces a +novel orthogonality loss to harmonize the latent space geometry. We demonstrate +the potential of the pre-trained model on three downstream tasks: medical image +classification, segmentation, and object detection. Extensive experiments +across five public datasets demonstrate that M-FLAG significantly outperforms +existing medical vision-language pre-training approaches and reduces the number +of parameters by 78\%. Notably, M-FLAG achieves outstanding performance on the +segmentation task while using only 1\% of the RSNA dataset, even outperforming +ImageNet pre-trained models that have been fine-tuned using 100\% of the data. + +
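One way to picture an orthogonality loss on the latent space is to push the (standardised) Gram matrix of a batch of embeddings towards the identity, so latent dimensions become decorrelated. The sketch below shows that generic idea only; the exact form and weighting of M-FLAG's loss are not reproduced here.

```python
import torch

def orthogonality_loss(z: torch.Tensor) -> torch.Tensor:
    """Encourage decorrelated latent dimensions: || Z^T Z / N - I ||_F^2.

    z: (N, D) batch of image/text embeddings.
    """
    z = (z - z.mean(dim=0)) / (z.std(dim=0) + 1e-6)      # standardise per dimension
    gram = (z.t() @ z) / z.size(0)                        # (D, D) correlation estimate
    identity = torch.eye(z.size(1), device=z.device)
    return ((gram - identity) ** 2).sum()

z = torch.randn(256, 128, requires_grad=True)
loss = orthogonality_loss(z)
loss.backward()
```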
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Disentangle then Parse:Night-time Semantic Segmentation with + Illumination Disentanglement ICCV2023 + + +
+ Most prior semantic segmentation methods have been developed for day-time +scenes, while typically underperforming in night-time scenes due to +insufficient and complicated lighting conditions. In this work, we tackle this +challenge by proposing a novel night-time semantic segmentation paradigm, i.e., +disentangle then parse (DTP). DTP explicitly disentangles night-time images +into light-invariant reflectance and light-specific illumination components and +then recognizes semantics based on their adaptive fusion. Concretely, the +proposed DTP comprises two key components: 1) Instead of processing +lighting-entangled features as in prior works, our Semantic-Oriented +Disentanglement (SOD) framework enables the extraction of reflectance component +without being impeded by lighting, allowing the network to consistently +recognize the semantics under cover of varying and complicated lighting +conditions. 2) Based on the observation that the illumination component can +serve as a cue for some semantically confused regions, we further introduce an +Illumination-Aware Parser (IAParser) to explicitly learn the correlation +between semantics and lighting, and aggregate the illumination features to +yield more precise predictions. Extensive experiments on the night-time +segmentation task with various settings demonstrate that DTP significantly +outperforms state-of-the-art methods. Furthermore, with negligible additional +parameters, DTP can be directly used to benefit existing day-time methods for +night-time segmentation. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for + Survival Outcome Prediction from PET/CT Images + + +
+ Survival prediction is a major concern for cancer management. Deep survival +models based on deep learning have been widely adopted to perform end-to-end +survival prediction from medical images. Recent deep survival models achieved +promising performance by jointly performing tumor segmentation with survival +prediction, where the models were guided to extract tumor-related information +through Multi-Task Learning (MTL). However, these deep survival models have +difficulties in exploring out-of-tumor prognostic information. In addition, +existing deep survival models are unable to effectively leverage multi-modality +images. Empirically-designed fusion strategies were commonly adopted to fuse +multi-modality information via task-specific manually-designed networks, thus +limiting the adaptability to different scenarios. In this study, we propose an +Adaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival +prediction from PET/CT images. Instead of adopting MTL, we propose a novel +Segmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained +for tumor segmentation and survival prediction sequentially in two stages. This +strategy enables the AdaMSS to focus on tumor regions in the first stage and +gradually expand its focus to include other prognosis-related regions in the +second stage. We also propose a data-driven strategy to fuse multi-modality +information, which realizes adaptive optimization of fusion strategies based on +training data during training. With the SSL and data-driven fusion strategies, +our AdaMSS is designed as an adaptive model that can self-adapt its focus +regions and fusion strategy for different training stages. Extensive +experiments with two large clinical datasets show that our AdaMSS outperforms +state-of-the-art survival prediction methods. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Mining Negative Temporal Contexts For False Positive Suppression In + Real-Time Ultrasound Lesion Detection MICCAI 2023 + + +
+ During ultrasonic scanning processes, real-time lesion detection can assist +radiologists in accurate cancer diagnosis. However, this essential task remains +challenging and underexplored. General-purpose real-time object detection +models can mistakenly report obvious false positives (FPs) when applied to +ultrasound videos, potentially misleading junior radiologists. One key issue is +their failure to utilize negative symptoms in previous frames, denoted as +negative temporal contexts (NTC). To address this issue, we propose to extract +contexts from previous frames, including NTC, with the guidance of inverse +optical flow. By aggregating extracted contexts, we endow the model with the +ability to suppress FPs by leveraging NTC. We call the resulting model +UltraDet. The proposed UltraDet demonstrates significant improvement over +previous state-of-the-arts and achieves real-time inference speed. We release +the code, checkpoints, and high-quality labels of the CVA-BUS dataset in +https://github.com/HaojunYu1998/UltraDet. + +
+
+ comment: 10 pages, 4 figures, MICCAI 2023 Early Accept +
+
+
+
+
+ + ♻ ☆ MixPath: A Unified Approach for One-shot Neural Architecture Search ICCV2023 + + +
+ Blending multiple convolutional kernels has proved advantageous in neural +architecture design. However, current two-stage neural architecture search +methods are mainly limited to single-path search spaces. How to efficiently +search models of multi-path structures remains a difficult problem. In this +paper, we are motivated to train a one-shot multi-path supernet to accurately +evaluate the candidate architectures. Specifically, we discover that in the +studied search spaces, feature vectors summed from multiple paths are nearly +multiples of those from a single path. Such disparity perturbs the supernet +training and its ranking ability. Therefore, we propose a novel mechanism +called Shadow Batch Normalization (SBN) to regularize the disparate feature +statistics. Extensive experiments prove that SBNs are capable of stabilizing +the optimization and improving ranking performance. We call our unified +multi-path one-shot approach MixPath, which generates a series of models +that achieve state-of-the-art results on ImageNet. + +
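Since summing the features of m paths roughly scales their statistics by m, one natural realisation of Shadow Batch Normalization is to keep a separate BN branch per possible number of active paths and select it at forward time. The sketch below is a minimal interpretation of that idea, not the authors' exact implementation.

```python
import torch
import torch.nn as nn

class ShadowBatchNorm2d(nn.Module):
    """One BN branch per number of active paths, selected at forward time (SBN sketch)."""
    def __init__(self, channels: int, max_paths: int):
        super().__init__()
        self.bns = nn.ModuleList(nn.BatchNorm2d(channels) for _ in range(max_paths))

    def forward(self, x: torch.Tensor, num_active_paths: int) -> torch.Tensor:
        # Features summed from m paths get their own normalisation statistics.
        return self.bns[num_active_paths - 1](x)

sbn = ShadowBatchNorm2d(channels=32, max_paths=3)
paths = [torch.randn(8, 32, 16, 16) for _ in range(2)]   # two sampled paths this step
out = sbn(sum(paths), num_active_paths=len(paths))
```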
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ Multimodal brain age estimation using interpretable adaptive + population-graph learning MICCAI 2023 + + +
+ Brain age estimation is clinically important as it can provide valuable +information in the context of neurodegenerative diseases such as Alzheimer's. +Population graphs, which include multimodal imaging information of the subjects +along with the relationships among the population, have been used in literature +along with Graph Convolutional Networks (GCNs) and have proved beneficial for a +variety of medical imaging tasks. A population graph is usually static and +constructed manually using non-imaging information. However, graph construction +is not a trivial task and might significantly affect the performance of the +GCN, which is inherently very sensitive to the graph structure. In this work, +we propose a framework that learns a population graph structure optimized for +the downstream task. An attention mechanism assigns weights to a set of imaging +and non-imaging features (phenotypes), which are then used for edge extraction. +The resulting graph is used to train the GCN. The entire pipeline can be +trained end-to-end. Additionally, by visualizing the attention weights that +were the most important for the graph construction, we increase the +interpretability of the graph. We use the UK Biobank, which provides a large +variety of neuroimaging and non-imaging phenotypes, to evaluate our method on +brain age regression and classification. The proposed method outperforms +competing static graph approaches and other state-of-the-art adaptive methods. +We further show that the assigned attention scores indicate that there are both +imaging and non-imaging phenotypes that are informative for brain age +estimation and are in agreement with the relevant literature. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Schema Inference for Interpretable Image Classification + + +
+ In this paper, we study a novel inference paradigm, termed as schema +inference, that learns to deductively infer the explainable predictions by +rebuilding the prior deep neural network (DNN) forwarding scheme, guided by the +prevalent philosophical cognitive concept of schema. We strive to reformulate +the conventional model inference pipeline into a graph matching policy that +associates the extracted visual concepts of an image with the pre-computed +scene impression, by analogy with human reasoning mechanism via impression +matching. To this end, we devise an elaborated architecture, termed as +SchemaNet, as a dedicated instantiation of the proposed schema inference +concept, that models both the visual semantics of input instances and the +learned abstract imaginations of target categories as topological relational +graphs. Meanwhile, to capture and leverage the compositional contributions of +visual semantics in a global view, we also introduce a universal Feat2Graph +scheme in SchemaNet to establish the relational graphs that contain abundant +interaction information. Both the theoretical analysis and the experimental +results on several benchmarks demonstrate that the proposed schema inference +achieves encouraging performance and meanwhile yields a clear picture of the +deductive process leading to the predictions. Our code is available at +https://github.com/zhfeing/SchemaNet-PyTorch. + +
+
+
+
+
+ + ♻ ☆ I See Dead People: Gray-Box Adversarial Attack on Image-To-Text Models + + +
+ Modern image-to-text systems typically adopt the encoder-decoder framework, +which comprises two main components: an image encoder, responsible for +extracting image features, and a transformer-based decoder, used for generating +captions. Taking inspiration from the analysis of neural networks' robustness +against adversarial perturbations, we propose a novel gray-box algorithm for +creating adversarial examples in image-to-text models. Unlike image +classification tasks that have a finite set of class labels, finding visually +similar adversarial examples in an image-to-text task poses greater challenges +because the captioning system allows for a virtually infinite space of possible +captions. In this paper, we present a gray-box adversarial attack on +image-to-text, both untargeted and targeted. We formulate the process of +discovering adversarial perturbations as an optimization problem that uses only +the image-encoder component, meaning the proposed attack is language-model +agnostic. Through experiments conducted on the ViT-GPT2 model, which is the +most-used image-to-text model in Hugging Face, and the Flickr30k dataset, we +demonstrate that our proposed attack successfully generates visually similar +adversarial examples, both with untargeted and targeted captions. Notably, our +attack operates in a gray-box manner, requiring no knowledge about the decoder +module. We also show that our attacks fool the popular open-source platform +Hugging Face. + +
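Because the attack only uses the image encoder, it can be sketched as a PGD-style optimisation on the encoder's feature distance, never touching the caption decoder. The following is a hedged, toy illustration of that framing (targeted variant), with a stand-in encoder and hypothetical hyperparameters; it is not the authors' exact objective.

```python
import torch
import torch.nn as nn

def encoder_only_attack(encoder: nn.Module, image: torch.Tensor, target_feat: torch.Tensor,
                        eps: float = 8 / 255, steps: int = 40, lr: float = 1 / 255) -> torch.Tensor:
    """Nudge the image so the *encoder* output matches target_feat (decoder-agnostic)."""
    delta = torch.zeros_like(image, requires_grad=True)
    for _ in range(steps):
        feat = encoder((image + delta).clamp(0, 1))
        loss = (feat - target_feat).pow(2).mean()
        loss.backward()
        with torch.no_grad():
            delta -= lr * delta.grad.sign()   # descend on feature distance
            delta.clamp_(-eps, eps)           # keep the perturbation imperceptible
        delta.grad.zero_()
    return (image + delta).clamp(0, 1).detach()

# Toy stand-ins for a real image encoder and images.
encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128))
img = torch.rand(1, 3, 32, 32)
target = encoder(torch.rand(1, 3, 32, 32)).detach()
adv = encoder_only_attack(encoder, img, target)
```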
+
+
+
+
+ + ♻ ☆ RAR: Region-Aware Point Cloud Registration + + +
+ This paper concerns the research problem of point cloud registration to find +the rigid transformation to optimally align the source point set with the +target one. Learning robust point cloud registration models with deep neural +networks has emerged as a powerful paradigm, offering promising performance in +predicting the global geometric transformation for a pair of point sets. +Existing methods first leverage an encoder to regress a latent shape +embedding, which is then decoded into a shape-conditioned transformation via +concatenation-based conditioning. However, different regions of a 3D shape vary +in their geometric structures, which makes it more sensible to use a +region-conditioned transformation instead of a shape-conditioned one. In this +paper we present a \underline{R}egion-\underline{A}ware point cloud +\underline{R}egistration, denoted as RAR, to predict the transformation for +pairwise point sets in a self-supervised learning fashion. More specifically, +we develop a novel region-aware decoder (RAD) module that is formed with an +implicit neural region representation parameterized by neural networks. The +implicit neural region representation is learned with a self-supervised 3D +shape reconstruction loss without the need for region labels. Consequently, the +region-aware decoder (RAD) module guides the training of the region-aware +transformation (RAT) module and region-aware weight (RAW) module, which predict +the transforms and weights for different regions respectively. The global +geometric transformation from source point set to target one is then formed by +the weighted fusion of region-aware transforms. Compared to the +state-of-the-art approaches, our experiments show that our RAR achieves +superior registration performance over various benchmark datasets (e.g. +ModelNet40). + +
+
+ comment: arXiv admin note: text overlap with arXiv:2006.06200 +
+
+
+
+
+ + ♻ ☆ Reduction of Class Activation Uncertainty with Background Information + + +
+ Multitask learning is a popular approach to training high-performing neural +networks with improved generalization. In this paper, we propose a background +class to achieve improved generalization at a lower computational cost than +multitask learning, to help researchers and organizations with limited +computation power. We also present a methodology for selecting background +images and discuss potential future improvements. We apply our approach to +several datasets and achieve improved generalization with much lower +computation. We also investigate class activation mappings (CAMs) of the +trained model and observe a tendency to attend to a larger image context in +few-class classification problems with the proposed model training +methodology. Applying a transformer with the proposed background class, we +achieve state-of-the-art (SOTA) performance on the STL-10, Caltech-101, and +CINIC-10 datasets. Example scripts are available in the `CAM' folder of the +following GitHub repository: github.com/dipuk0506/UQ + +
+
+
+
+
+ + ♻ ☆ CB-HVTNet: A channel-boosted hybrid vision transformer network for + lymphocyte assessment in histopathological images + + +
+ Transformers, due to their ability to learn long range dependencies, have +overcome the shortcomings of convolutional neural networks (CNNs) for global +perspective learning. Therefore, they have gained the focus of researchers for +several vision related tasks including medical diagnosis. However, their +multi-head attention module only captures global level feature representations, +which is insufficient for medical images. To address this issue, we propose a +Channel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning +to generate boosted channels and employs both transformers and CNNs to analyse +lymphocytes in histopathological images. The proposed CB HVT comprises five +modules, including a channel generation module, channel exploitation module, +channel merging module, region-aware module, and a detection and segmentation +head, which work together to effectively identify lymphocytes. The channel +generation module uses the idea of channel boosting through transfer learning +to extract diverse channels from different auxiliary learners. In the CB HVT, +these boosted channels are first concatenated and ranked using an attention +mechanism in the channel exploitation module. A fusion block is then utilized +in the channel merging module for a gradual and systematic merging of the +diverse boosted channels to improve the network's learning representations. The +CB HVT also employs a proposal network in its region aware module and a head to +effectively identify objects, even in overlapping regions and with artifacts. +We evaluated the proposed CB HVT on two publicly available datasets for +lymphocyte assessment in histopathological images. The results show that CB HVT +outperformed other state of the art detection models, and has good +generalization ability, demonstrating its value as a tool for pathologists. + +
+
+
+
+
+ + ♻ ☆ Leveraging triplet loss for unsupervised action segmentation CVPR + + +
+ In this paper, we propose a novel fully unsupervised framework that learns +action representations suitable for the action segmentation task from the +single input video itself, without requiring any training data. Our method is a +deep metric learning approach rooted in a shallow network with a triplet loss +operating on similarity distributions and a novel triplet selection strategy +that effectively models temporal and semantic priors to discover actions in the +new representational space. Under these circumstances, we successfully recover +temporal boundaries in the learned action representations with higher quality +compared with existing unsupervised approaches. The proposed method is +evaluated on two widely used benchmark datasets for the action segmentation +task and it achieves competitive performance by applying a generic clustering +algorithm on the learned representations. + +
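The ingredients named above, a shallow embedding network, a triplet loss, and temporal priors for selecting triplets, can be illustrated with a small sketch. The temporal-proximity rule used below (nearby frames as positives, distant frames as negatives) is a plain illustration of using a temporal prior, not the paper's exact selection strategy.

```python
import torch
import torch.nn as nn

embed = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16))  # shallow embedding net
triplet = nn.TripletMarginLoss(margin=1.0)

features = torch.randn(500, 64)          # per-frame features of a single input video
T = features.size(0)

# Temporal prior for triplet selection (assumption): nearby frames are positives,
# temporally distant frames are negatives.
anchor_idx = torch.randint(0, T, (128,))
pos_idx = (anchor_idx + torch.randint(1, 10, (128,))).clamp(max=T - 1)
neg_idx = (anchor_idx + torch.randint(100, 200, (128,))) % T

loss = triplet(embed(features[anchor_idx]),
               embed(features[pos_idx]),
               embed(features[neg_idx]))
loss.backward()
```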
+
+ comment: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) Workshops, 2023, pp. 4921-4929 +
+
+
+
+
+ + ♻ ☆ The MONET dataset: Multimodal drone thermal dataset recorded in rural + scenarios CVPR + + +
+ We present MONET, a new multimodal dataset captured using a thermal camera +mounted on a drone that flew over rural areas, and recorded human and vehicle +activities. We captured MONET to study the problem of object localisation and +behaviour understanding of targets undergoing large-scale variations and being +recorded from different and moving viewpoints. Target activities occur in two +different land sites, each with unique scene structures and cluttered +backgrounds. MONET consists of approximately 53K images featuring 162K manually +annotated bounding boxes. Each image is timestamp-aligned with drone metadata +that includes information about attitudes, speed, altitude, and GPS +coordinates. MONET is different from previous thermal drone datasets because it +features multimodal data, including rural scenes captured with thermal cameras +containing both person and vehicle targets, along with trajectory information +and metadata. We assessed the difficulty of the dataset in terms of transfer +learning between the two sites and evaluated nine object detection algorithms +to identify the open challenges associated with this type of data. Project +page: https://github.com/fabiopoiesi/monet_dataset. + +
+
+ comment: Published in Computer Vision and Pattern Recognition (CVPR) Workshops + 2023 - 6th Multimodal Learning and Applications Workshop +
+
+
+
+
+ + ♻ ☆ Neural Fields for Interactive Visualization of Statistical Dependencies + in 3D Simulation Ensembles + + +
+ We present the first neural network that has learned to compactly represent +and can efficiently reconstruct the statistical dependencies between the values +of physical variables at different spatial locations in large 3D simulation +ensembles. Going beyond linear dependencies, we consider mutual information as +a measure of non-linear dependence. We demonstrate learning and reconstruction +with a large weather forecast ensemble comprising 1000 members, each storing +multiple physical variables at a 250 x 352 x 20 simulation grid. By +circumventing compute-intensive statistical estimators at runtime, we +demonstrate significantly reduced memory and computation requirements for +reconstructing the major dependence structures. This enables embedding the +estimator into a GPU-accelerated direct volume renderer and interactively +visualizing all mutual dependencies for a selected domain point. + +
+
+
+
+
+ + ♻ ☆ Bidirectional Temporal Diffusion Model for Temporally Consistent Human + Animation + + +
+ We introduce a method to generate temporally coherent human animation from a +single image, a video, or random noise. This problem has typically been formulated as +auto-regressive generation, i.e., regressing past frames to +decode future frames. However, such unidirectional generation is highly prone +to motion drifting over time, generating unrealistic human animation with +significant artifacts such as appearance distortion. We claim that +bidirectional temporal modeling enforces temporal coherence on a generative +network by largely suppressing the motion ambiguity of human appearance. To +prove our claim, we design a novel human animation framework using a denoising +diffusion model: a neural network learns to generate the image of a person by +denoising temporal Gaussian noises whose intermediate results are +cross-conditioned bidirectionally between consecutive frames. In the +experiments, our method demonstrates strong performance compared to existing +unidirectional approaches with realistic temporal coherence. + +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding Adversarial + Transferability From Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21 + pages, 12 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Hierarchically Decomposed Graph Convolutional Networks for + Skeleton-Based Action Recognition ICCV 2023 + + +
+ Graph convolutional networks (GCNs) are the most commonly used methods for +skeleton-based action recognition and have achieved remarkable performance. +Generating adjacency matrices with semantically meaningful edges is +particularly important for this task, but extracting such edges is a challenging +problem. To solve this, we propose a hierarchically decomposed graph +convolutional network (HD-GCN) architecture with a novel hierarchically +decomposed graph (HD-Graph). The proposed HD-GCN effectively decomposes every +joint node into several sets to extract major structurally adjacent and distant +edges, and uses them to construct an HD-Graph containing those edges in the +same semantic spaces of a human skeleton. In addition, we introduce an +attention-guided hierarchy aggregation (A-HA) module to highlight the dominant +hierarchical edge sets of the HD-Graph. Furthermore, we apply a new six-way +ensemble method, which uses only the joint and bone streams without any motion +stream. The proposed model is evaluated and achieves state-of-the-art +performance on four large, popular datasets. Finally, we demonstrate the +effectiveness of our model with various comparative experiments. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ YOLIC: An Efficient Method for Object Localization and Classification on + Edge Devices + + +
+ In the realm of Tiny AI, we introduce "You Only Look at Interested Cells" +(YOLIC), an efficient method for object localization and classification on edge +devices. Seamlessly blending the strengths of semantic segmentation and object +detection, YOLIC offers superior computational efficiency and precision. By +adopting Cells of Interest for classification instead of individual pixels, +YOLIC encapsulates relevant information, reduces computational load, and +enables rough object shape inference. Importantly, the need for bounding box +regression is obviated, as YOLIC capitalizes on the predetermined cell +configuration that provides information about potential object location, size, +and shape. To tackle the issue of single-label classification limitations, a +multi-label classification approach is applied to each cell, effectively +recognizing overlapping or closely situated objects. This paper presents +extensive experiments on multiple datasets, demonstrating that YOLIC achieves +detection performance comparable to the state-of-the-art YOLO algorithms while +surpassing them in speed, exceeding 30 fps on a Raspberry Pi 4B CPU. All resources +related to this study, including the datasets, cell designer, image annotation +tool, and source code, have been made publicly available on our project website +at https://kai3316.github.io/yolic.github.io + +
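The prediction target described above, a multi-label class vector per predefined cell with no box regression, can be sketched as follows. The tiny backbone, the cell count, and the class count are arbitrary stand-ins for illustration; they are not YOLIC's actual configuration.

```python
import torch
import torch.nn as nn

NUM_CELLS, NUM_CLASSES = 96, 10   # predefined "cells of interest" and object classes (assumed)

class CellClassifier(nn.Module):
    """Sketch: predict a multi-label class vector for every predefined cell."""
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
                                      nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.head = nn.Linear(16, NUM_CELLS * NUM_CLASSES)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(self.backbone(x)).view(-1, NUM_CELLS, NUM_CLASSES)

model = CellClassifier()
images = torch.rand(4, 3, 224, 224)
targets = torch.randint(0, 2, (4, NUM_CELLS, NUM_CLASSES)).float()  # multi-label per cell
loss = nn.BCEWithLogitsLoss()(model(images), targets)               # handles overlapping objects
loss.backward()
```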
+
+
+
+
+ + ♻ ☆ Boundary Distribution Estimation for Precise Object Detection + + +
+ In the field of state-of-the-art object detection, the task of object +localization is typically accomplished through a dedicated subnet that +emphasizes bounding box regression. This subnet traditionally predicts the +object's position by regressing the box's center position and scaling factors. +Despite the widespread adoption of this approach, we have observed that the +localization results often suffer from defects, leading to unsatisfactory +detector performance. In this paper, we address the shortcomings of previous +methods through theoretical analysis and experimental verification and present +an innovative solution for precise object detection. Instead of solely focusing +on the object's center and size, our approach enhances the accuracy of bounding +box localization by refining the box edges based on the estimated distribution +at the object's boundary. Experimental results demonstrate the potential and +generalizability of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Improving Automated Hemorrhage Detection in Sparse-view Computed + Tomography via Deep Convolutional Neural Network based Artifact Reduction + + +
+ Purpose: Sparse-view computed tomography (CT) is an effective way to reduce +dose by lowering the total number of views acquired, albeit at the expense of +image quality, which, in turn, can impact the ability to detect diseases. We +explore deep learning-based artifact reduction in sparse-view cranial CT scans +and its impact on automated hemorrhage detection. Methods: We trained a U-Net +for artefact reduction on simulated sparse-view cranial CT scans from 3000 +patients obtained from a public dataset and reconstructed with varying levels +of sub-sampling. Additionally, we trained a convolutional neural network on +fully sampled CT data from 17,545 patients for automated hemorrhage detection. +We evaluated the classification performance using the area under the receiver +operator characteristic curves (AUC-ROCs) with corresponding 95% confidence +intervals (CIs) and the DeLong test, along with confusion matrices. The +performance of the U-Net was compared to an analytical approach based on total +variation (TV). Results: The U-Net performed superior compared to unprocessed +and TV-processed images with respect to image quality and automated hemorrhage +diagnosis. With U-Net post-processing, the number of views can be reduced from +4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973; +0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256 +views (0.967; 0.964-0.969) with a slight performance decrease (P<.001). +Conclusion: The results suggest that U-Net based artifact reduction +substantially enhances automated hemorrhage detection in sparse-view cranial +CTs. Our findings highlight that appropriate post-processing is crucial for +optimal image quality and diagnostic accuracy while minimizing radiation dose. + +
+
+ comment: 11 pages, 6 figures, 1 table +
+
+
+
+
+ + ♻ ☆ MotionBERT: A Unified Perspective on Learning Human Motion + Representations ICCV 2023 + + +
+ We present a unified perspective on tackling various human-centric video +tasks by learning human motion representations from large-scale and +heterogeneous data resources. Specifically, we propose a pretraining stage in +which a motion encoder is trained to recover the underlying 3D motion from +noisy partial 2D observations. The motion representations acquired in this way +incorporate geometric, kinematic, and physical knowledge about human motion, +which can be easily transferred to multiple downstream tasks. We implement the +motion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer) +neural network. It could capture long-range spatio-temporal relationships among +the skeletal joints comprehensively and adaptively, exemplified by the lowest +3D pose estimation error so far when trained from scratch. Furthermore, our +proposed framework achieves state-of-the-art performance on all three +downstream tasks by simply finetuning the pretrained motion encoder with a +simple regression head (1-2 layers), which demonstrates the versatility of the +learned motion representations. Code and models are available at +https://motionbert.github.io/ + +
+
+ comment: ICCV 2023 version +
+
+
+
+
+ + ♻ ☆ Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation + Evaluation + + +
+ Research in Image Generation has recently made significant progress, +particularly boosted by the introduction of Vision-Language models which are +able to produce high-quality visual content based on textual inputs. Despite +ongoing advancements in terms of generation quality and realism, no methodical +frameworks have been defined yet to quantitatively measure the quality of the +generated content and the adherence to the prompted requests: so far, only +human-based evaluations have been adopted for quality satisfaction and for +comparing different generative methods. We introduce a novel automated method +for Visual Concept Evaluation (ViCE), i.e. to assess consistency between a +generated/edited image and the corresponding prompt/instructions, with a +process inspired by human cognitive behaviour. ViCE combines the strengths +of Large Language Models (LLMs) and Visual Question Answering (VQA) into a +unified pipeline, aiming to replicate the human cognitive process in quality +assessment. This method outlines visual concepts, formulates image-specific +verification questions, utilizes the Q&A system to investigate the image, and +scores the combined outcome. Although this brave new hypothesis of mimicking +humans in the image evaluation process is in its preliminary assessment stage, +results are promising and open the door to a new form of automatic evaluation +which could have a significant impact as image generation and targeted image +editing tasks become more and more sophisticated. + +
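The pipeline reads naturally as a loop: derive verification questions from the prompt, answer them on the image with a VQA model, and aggregate the outcome into a score. The sketch below mirrors that flow with placeholder callables standing in for whichever LLM and VQA backends are available; the scoring rule (pass rate over yes/no questions) is an assumption, not necessarily ViCE's exact aggregation.

```python
from typing import Callable, List

def vice_style_score(prompt: str,
                     image: object,
                     generate_questions: Callable[[str], List[str]],
                     answer_question: Callable[[object, str], str]) -> float:
    """LLM proposes verification questions; VQA answers them; score = pass rate."""
    questions = generate_questions(prompt)                  # e.g., "Is there a red car?"
    answers = [answer_question(image, q) for q in questions]
    passed = sum(a.strip().lower().startswith("yes") for a in answers)
    return passed / max(len(questions), 1)

# Toy stand-ins so the sketch runs end-to-end without external models.
score = vice_style_score(
    "a red car parked next to a tree",
    image=None,
    generate_questions=lambda p: ["Is there a red car?", "Is there a tree?"],
    answer_question=lambda img, q: "yes",
)
print(score)  # 1.0
```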
+
+ comment: Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track) +
+
+
+
+
+ + ♻ ☆ Super Vision Transformer + + +
+ We attempt to reduce the computational costs in vision transformers (ViTs), +which increase quadratically with the token number. We present a novel training +paradigm that trains only one ViT model at a time, but is capable of providing +improved image recognition performance with various computational costs. Here, +the trained ViT model, termed super vision transformer (SuperViT), is empowered +with the versatile ability to process incoming patches of multiple sizes as well +as preserve informative tokens with multiple keeping rates (the ratio of +kept tokens) to achieve good hardware efficiency for inference, given that +the available hardware resources often change from time to time. Experimental +results on ImageNet demonstrate that our SuperViT can considerably reduce the +computational costs of ViT models with even a performance increase. For example, +we reduce the FLOPs of DeiT-S by 2x while increasing the Top-1 accuracy by 0.2%, and by +0.7% for a 1.5x reduction. Also, our SuperViT significantly outperforms existing +studies on efficient vision transformers. For example, when consuming the same +amount of FLOPs, our SuperViT surpasses the recent state-of-the-art (SOTA) EViT +by 1.1% when using DeiT-S as the backbone. The project of this work is made +publicly available at https://github.com/lmbxmu/SuperViT. + +
+
+ comment: Accepted by International Journal of Computer Vision (IJCV) in the + year of 2023 +
+
+
+
+
+ + ♻ ☆ MELON: NeRF with Unposed Images in SO(3) + + +
+ Neural radiance fields enable novel-view synthesis and scene reconstruction +with photorealistic quality from a few images, but require known and accurate +camera poses. Conventional pose estimation algorithms fail on smooth or +self-similar scenes, while methods performing inverse rendering from unposed +views require a rough initialization of the camera orientations. The main +difficulty of pose estimation lies in real-life objects being almost invariant +under certain transformations, making the photometric distance between rendered +views non-convex with respect to the camera parameters. Using an equivalence +relation that matches the distribution of local minima in camera space, we +reduce this space to its quotient set, in which pose estimation becomes a more +convex problem. Using a neural-network to regularize pose estimation, we +demonstrate that our method - MELON - can reconstruct a neural radiance field +from unposed images with state-of-the-art accuracy while requiring ten times +fewer views than adversarial approaches. + +
+
+
+
+
+ + ♻ ☆ TinyTracker: Ultra-Fast and Ultra-Low-Power Edge Vision In-Sensor for + Gaze Estimation + + +
+ Intelligent edge vision tasks encounter the critical challenge of ensuring +power and latency efficiency due to the typically heavy computational load they +impose on edge platforms. This work leverages one of the first "AI in sensor" +vision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power +end-to-end edge vision applications. We evaluate the IMX500 and compare it to +other edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by +exploring gaze estimation as a case study. We propose TinyTracker, a highly +efficient, fully quantized model for 2D gaze estimation designed to maximize +the performance of the edge vision systems considered in this study. +TinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1] +without significant loss in gaze estimation accuracy (maximum of 0.16 cm when +fully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor +results in end-to-end latency of around 19ms. The camera takes around 17.9ms to +read, process and transmit the pixels to the accelerator. The inference time of +the network is 0.86ms with an additional 0.24 ms for retrieving the results +from the sensor. The overall energy consumption of the end-to-end system is 4.9 +mJ, including 0.06 mJ for inference. The end-to-end study shows that IMX500 is +1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient (4.9 mJ +vs 34.2 mJ). + +
+
+
+
+
+ + ♻ ☆ Revisiting Scene Text Recognition: A Data Perspective ICCV2023 + + +
+ This paper aims to re-assess scene text recognition (STR) from a +data-oriented perspective. We begin by revisiting the six commonly used +benchmarks in STR and observe a trend of performance saturation, whereby only +2.91% of the benchmark images cannot be accurately recognized by an ensemble of +13 representative models. While these results are impressive and suggest that +STR could be considered solved, we argue that this is primarily due to +the less challenging nature of the common benchmarks, thus concealing the +underlying issues that STR faces. To this end, we consolidate a large-scale +real STR dataset, namely Union14M, which comprises 4 million labeled images and +10 million unlabeled images, to assess the performance of STR models in more +complex real-world scenarios. Our experiments demonstrate that the 13 models +can only achieve an average accuracy of 66.53% on the 4 million labeled images, +indicating that STR still faces numerous challenges in the real world. By +analyzing the error patterns of the 13 models, we identify seven open +challenges in STR and develop a challenge-driven benchmark consisting of eight +distinct subsets to facilitate further progress in the field. Our exploration +demonstrates that STR is far from being solved and leveraging data may be a +promising solution. In this regard, we find that utilizing the 10 million +unlabeled images through self-supervised pre-training can significantly improve +the robustness of STR models in real-world scenarios and leads to +state-of-the-art performance. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ♻ ☆ Improving the Transferability of Adversarial Attacks on Face Recognition + with Beneficial Perturbation Feature Augmentation + + +
+ Face recognition (FR) models can be easily fooled by adversarial examples, +which are crafted by adding imperceptible perturbations on benign face images. +The existence of adversarial face examples poses a great threat to the security +of society. In order to build a more sustainable digital nation, in this paper, +we improve the transferability of adversarial face examples to expose more +blind spots of existing FR models. Though generating hard samples has shown its +effectiveness in improving the generalization of models in training tasks, the +effectiveness of utilizing this idea to improve the transferability of +adversarial face examples remains unexplored. To this end, based on the +property of hard samples and the symmetry between training tasks and +adversarial attack tasks, we propose the concept of hard models, which have +similar effects as hard samples for adversarial attack tasks. Utilizing the +concept of hard models, we propose a novel attack method called Beneficial +Perturbation Feature Augmentation Attack (BPFA), which reduces the overfitting +of adversarial examples to surrogate FR models by constantly generating new +hard models to craft the adversarial examples. Specifically, in the +backpropagation, BPFA records the gradients on pre-selected feature maps and +uses the gradient on the input image to craft the adversarial example. In the +next forward propagation, BPFA leverages the recorded gradients to add +beneficial perturbations on their corresponding feature maps to increase the +loss. Extensive experiments demonstrate that BPFA can significantly boost the +transferability of adversarial attacks on FR. + +
+
+ comment: \c{opyright} 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Making Substitute Models More Bayesian Can Enhance Transferability of + Adversarial Examples ICLR 2023 + + +
+ The transferability of adversarial examples across deep neural networks
+(DNNs) is the crux of many black-box attacks. Many prior efforts have been
+devoted to improving the transferability by increasing the diversity in inputs
+of some substitute models. In this paper, by contrast, we opt for diversity in
+the substitute models and advocate attacking a Bayesian model to achieve
+desirable transferability. Deriving from the Bayesian formulation, we develop a
+principled strategy for possible finetuning, which can be combined with many
+off-the-shelf Gaussian posterior approximations over DNN parameters. Extensive
+experiments have been conducted on common benchmark datasets to verify the
+effectiveness of our method, and the results demonstrate that our method
+outperforms recent state-of-the-art methods by large margins (roughly 19%
+absolute increase in average attack success rate on ImageNet); by combining it
+with these recent methods, further performance gains can be obtained. Our code:
+https://github.com/qizhangli/MoreBayesian-attack.
+
+
+ comment: Accepted by ICLR 2023, fix typos +
+
+
+
+
+ + ♻ ☆ Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via + Geometry-Guided Cross-View Transformer ICCV 2023 + + +
+ Image retrieval-based cross-view localization methods often lead to very +coarse camera pose estimation, due to the limited sampling density of the +database satellite images. In this paper, we propose a method to increase the +accuracy of a ground camera's location and orientation by estimating the +relative rotation and translation between the ground-level image and its +matched/retrieved satellite image. Our approach designs a geometry-guided +cross-view transformer that combines the benefits of conventional geometry and +learnable cross-view transformers to map the ground-view observations to an +overhead view. Given the synthesized overhead view and observed satellite +feature maps, we construct a neural pose optimizer with strong global +information embedding ability to estimate the relative rotation between them. +After aligning their rotations, we develop an uncertainty-guided spatial +correlation to generate a probability map of the vehicle locations, from which +the relative translation can be determined. Experimental results demonstrate +that our method significantly outperforms the state-of-the-art. Notably, the +likelihood of restricting the vehicle lateral pose to be within 1m of its +Ground Truth (GT) value on the cross-view KITTI dataset has been improved from +$35.54\%$ to $76.44\%$, and the likelihood of restricting the vehicle +orientation to be within $1^{\circ}$ of its GT value has been improved from +$19.64\%$ to $99.10\%$. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Unsupervised Video Anomaly Detection with Diffusion Models Conditioned + on Compact Motion Representations + + +
+ This paper aims to address the unsupervised video anomaly detection (VAD)
+problem, which involves classifying each frame in a video as normal or
+abnormal, without any access to labels. To accomplish this, the proposed method
+employs conditional diffusion models, where the input data is the
+spatiotemporal features extracted from a pre-trained network, and the condition
+is the features extracted from compact motion representations that summarize a
+given video segment in terms of its motion and appearance. Our method utilizes
+a data-driven threshold and considers a high reconstruction error as an
+indicator of anomalous events. This study is the first to utilize compact
+motion representations for VAD, and the experiments conducted on two
+large-scale VAD benchmarks demonstrate that they supply relevant information to
+the diffusion model and consequently improve VAD performance w.r.t. the prior
+art. Importantly, our method exhibits better generalization performance across
+different datasets, notably outperforming both the state-of-the-art and
+baseline methods. The code of our method is available at
+https://github.com/AnilOsmanTur/conditioned_video_anomaly_diffusion
+
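+ To make the thresholding step concrete, here is a minimal sketch (our
+illustration, not the paper's code; the percentile is a hypothetical choice)
+that derives a data-driven threshold from reconstruction errors on held-out,
+presumed-normal data and flags test frames whose error exceeds it:
+
+    import numpy as np
+
+    def fit_threshold(val_errors, percentile=95.0):
+        # Data-driven threshold from reconstruction errors on validation data
+        # assumed to be mostly normal.
+        return np.percentile(val_errors, percentile)
+
+    def flag_anomalies(test_errors, threshold):
+        # A frame is anomalous when its reconstruction error exceeds the threshold.
+        return test_errors > threshold
+
+    rng = np.random.default_rng(0)
+    val_errors = rng.random(1000) * 0.1                                   # placeholder errors
+    test_errors = np.concatenate([rng.random(95) * 0.1, 0.5 + rng.random(5) * 0.1])
+    thr = fit_threshold(val_errors)
+    print(flag_anomalies(test_errors, thr).sum(), "frames flagged")
+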
+
+ comment: Accepted to ICIAP 2023 +
+
+
+
+
+ + ♻ ☆ Joint Demosaicking and Denoising Benefits from a Two-stage Training + Strategy + + +
+ Image demosaicking and denoising are the first two key steps of the color +image production pipeline. The classical processing sequence has for a long +time consisted of applying denoising first, and then demosaicking. Applying the +operations in this order leads to oversmoothing and checkerboard effects. Yet, +it was difficult to change this order, because once the image is demosaicked, +the statistical properties of the noise are dramatically changed and hard to +handle by traditional denoising models. In this paper, we address this problem +by a hybrid machine learning method. We invert the traditional color filter +array (CFA) processing pipeline by first demosaicking and then denoising. Our +demosaicking algorithm, trained on noiseless images, combines a traditional +method and a residual convolutional neural network (CNN). This first stage +retains all known information, which is the key point to obtain faithful final +results. The noisy demosaicked image is then passed through a second CNN +restoring a noiseless full-color image. This pipeline order completely avoids +checkerboard effects and restores fine image detail. Although CNNs can be +trained to solve jointly demosaicking-denoising end-to-end, we find that this +two-stage training performs better and is less prone to failure. It is shown +experimentally to improve on the state of the art, both quantitatively and in +terms of visual quality. + +
+
+ comment: 28 pages, 40 figures +
+
+
+
+
+ + ♻ ☆ Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation ICCV 2023 + + +
+ Low-light conditions not only hamper human visual experience but also degrade
+the model's performance on downstream vision tasks. While existing works make
+remarkable progress on day-night domain adaptation, they rely heavily on domain
+knowledge derived from the task-specific nighttime dataset. This paper
+challenges a more complicated scenario with broader applicability, i.e.,
+zero-shot day-night domain adaptation, which eliminates reliance on any
+nighttime data. Unlike prior zero-shot adaptation approaches emphasizing either
+image-level translation or model-level adaptation, we propose a similarity
+min-max paradigm that considers them under a unified framework. On the image
+level, we darken images towards minimum feature similarity to enlarge the
+domain gap. Then on the model level, we maximize the feature similarity between
+the darkened images and their normal-light counterparts for better model
+adaptation. To the best of our knowledge, this work represents the pioneering
+effort in jointly optimizing both aspects, resulting in a significant
+improvement of model generalizability. Extensive experiments demonstrate our
+method's effectiveness and broad applicability on various nighttime vision
+tasks, including classification, semantic segmentation, visual place
+recognition, and video action recognition. Code and pre-trained models are
+available at https://red-fairy.github.io/ZeroShotDayNightDA-Webpage/.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Iterative Scale-Up ExpansionIoU and Deep Features Association for + Multi-Object Tracking in Sports + + +
+ Multi-object tracking algorithms have made significant advancements due to
+the recent developments in object detection. However, most existing methods
+primarily focus on tracking pedestrians or vehicles, which exhibit relatively
+simple and regular motion patterns. Consequently, there is a scarcity of
+algorithms that address the tracking of targets with irregular or non-linear
+motion, such as multi-athlete tracking. Furthermore, popular tracking
+algorithms often rely on the Kalman filter for object motion modeling, which
+fails to track objects when their motion contradicts the linear motion
+assumption of the Kalman filter. For this reason, we propose a novel online
+and robust multi-object tracking approach, named Iterative Scale-Up
+ExpansionIoU and Deep Features for multi-object tracking. Unlike conventional
+methods, we abandon the use of the Kalman filter and propose utilizing the
+iterative scale-up expansion IoU. This approach achieves superior tracking
+performance without requiring additional training data or adopting a more
+robust detector, all while maintaining a lower computational cost compared to
+other appearance-based methods. Our proposed method demonstrates remarkable
+effectiveness in tracking irregular motion objects, achieving a score of 76.9%
+in HOTA. It outperforms all state-of-the-art tracking algorithms on the
+SportsMOT dataset, covering various kinds of sports scenarios.
+
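+ As an illustration of the matching idea (our own sketch, not the authors'
+code), an expansion IoU scales both boxes up about their centers by an
+expansion factor before computing the ordinary IoU, so that fast, non-linear
+motion with little frame-to-frame overlap can still be associated; the factor
+of 1.5 below is a hypothetical setting:
+
+    def expand(box, factor):
+        # box = (x1, y1, x2, y2); scale width/height about the center.
+        x1, y1, x2, y2 = box
+        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
+        w, h = (x2 - x1) * factor, (y2 - y1) * factor
+        return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)
+
+    def expansion_iou(a, b, factor=1.5):
+        # IoU of the two expanded boxes.
+        a, b = expand(a, factor), expand(b, factor)
+        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
+        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
+        inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
+        area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
+        union = area(a) + area(b) - inter
+        return inter / union if union > 0 else 0.0
+
+    # Two non-overlapping boxes still receive a positive matching score.
+    print(expansion_iou((0, 0, 10, 10), (12, 0, 22, 10)))
+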
+
+
+
+
+ + ♻ ☆ Unified Adversarial Patch for Cross-modal Attacks in the Physical World ICCV2023 + + +
+ Recently, physical adversarial attacks have been presented to evade
+DNN-based object detectors. To ensure security, many scenarios deploy visible
+sensors and infrared sensors simultaneously, leading to the failure of these
+single-modal physical attacks. To show the potential risks under such scenes,
+we propose a unified adversarial patch to perform cross-modal physical attacks,
+i.e., fooling visible and infrared object detectors at the same time via a
+single patch. Considering the different imaging mechanisms of visible and
+infrared sensors, our work focuses on modeling the shapes of adversarial
+patches, which can be captured in different modalities when they change. To
+this end, we design a novel boundary-limited shape optimization to achieve
+compact and smooth shapes that can be easily implemented in the physical world.
+In addition, to balance the fooling degree between the visible detector and the
+infrared detector during the optimization process, we propose a score-aware
+iterative evaluation, which can guide the adversarial patch to iteratively
+reduce the predicted scores of the multi-modal sensors. We finally test our
+method against the one-stage detector YOLOv3 and the two-stage detector Faster
+RCNN. Results show that our unified patch achieves an Attack Success Rate (ASR)
+of 73.33% and 69.17%, respectively. More importantly, we verify the
+effectiveness of the attacks in the physical world when visible and infrared
+sensors shoot the objects under various settings like different angles,
+distances, postures, and scenes.
+
+
+ comment: 10 pages, 8 figures, accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Multi-IMU with Online Self-Consistency for Freehand 3D Ultrasound + Reconstruction MICCAI-2023 + + +
+ Ultrasound (US) imaging is a popular tool in clinical diagnosis, offering +safety, repeatability, and real-time capabilities. Freehand 3D US is a +technique that provides a deeper understanding of scanned regions without +increasing complexity. However, estimating elevation displacement and +accumulation error remains challenging, making it difficult to infer the +relative position using images alone. The addition of external lightweight +sensors has been proposed to enhance reconstruction performance without adding +complexity, which has been shown to be beneficial. We propose a novel online +self-consistency network (OSCNet) using multiple inertial measurement units +(IMUs) to improve reconstruction performance. OSCNet utilizes a modal-level +self-supervised strategy to fuse multiple IMU information and reduce +differences between reconstruction results obtained from each IMU data. +Additionally, a sequence-level self-consistency strategy is proposed to improve +the hierarchical consistency of prediction results among the scanning sequence +and its sub-sequences. Experiments on large-scale arm and carotid datasets with +multiple scanning tactics demonstrate that our OSCNet outperforms previous +methods, achieving state-of-the-art reconstruction performance. + +
+
+ comment: Accepted by MICCAI-2023 +
+
+
+
+
+ + ♻ ☆ LA-Net: Landmark-Aware Learning for Reliable Facial Expression + Recognition under Label Noise ICCV 2023 + + +
+ Facial expression recognition (FER) remains a challenging task due to the +ambiguity of expressions. The derived noisy labels significantly harm the +performance in real-world scenarios. To address this issue, we present a new +FER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks +to mitigate the impact of label noise from two perspectives. Firstly, LA-Net +uses landmark information to suppress the uncertainty in expression space and +constructs the label distribution of each sample by neighborhood aggregation, +which in turn improves the quality of training supervision. Secondly, the model +incorporates landmark information into expression representations using the +devised expression-landmark contrastive loss. The enhanced expression feature +extractor can be less susceptible to label noise. Our method can be integrated +with any deep neural network for better training supervision without +introducing extra inference costs. We conduct extensive experiments on both +in-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net +achieves state-of-the-art performance. + +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Leveraging Spatio-Temporal Dependency for Skeleton-Based Action + Recognition ICCV 2023 + + +
+ Skeleton-based action recognition has attracted considerable attention due to
+its compact representation of the human body's skeletal structure. Many recent
+methods have achieved remarkable performance using graph convolutional networks
+(GCNs) and convolutional neural networks (CNNs), which extract spatial and
+temporal features, respectively. Although spatial and temporal dependencies in
+the human skeleton have been explored separately, spatio-temporal dependency is
+rarely considered. In this paper, we propose the Spatio-Temporal Curve Network
+(STC-Net) to effectively leverage the spatio-temporal dependency of the human
+skeleton. Our proposed network consists of two novel elements: 1) The
+Spatio-Temporal Curve (STC) module; and 2) Dilated Kernels for Graph
+Convolution (DK-GC). The STC module dynamically adjusts the receptive field by
+identifying meaningful node connections between adjacent frames and generating
+spatio-temporal curves based on the identified node connections, providing
+adaptive spatio-temporal coverage. In addition, we propose DK-GC to consider
+long-range dependencies, which results in a large receptive field without any
+additional parameters by applying an extended kernel to the given adjacency
+matrices of the graph. Our STC-Net combines these two modules and achieves
+state-of-the-art performance on four skeleton-based action recognition
+benchmarks.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Reinforced Disentanglement for Face Swapping without Skip Connection ICCV 2023 + + +
+ State-of-the-art face swap models still suffer from the problem of either the
+target identity (i.e., shape) being leaked or the target non-identity
+attributes (i.e., background, hair) failing to be fully preserved in the final
+results. We show that this insufficient disentanglement is caused by two flawed
+designs that were commonly adopted in prior models: (1) counting on only one
+compressed encoder to represent both the semantic-level non-identity facial
+attributes (i.e., pose) and the pixel-level non-facial region details, which
+are contradictory to satisfy at the same time; (2) highly relying on long
+skip-connections between the encoder and the final generator, leaking a certain
+amount of target face identity into the result. To fix them, we introduce a new
+face swap framework called 'WSC-swap' that gets rid of skip connections and
+uses two target encoders to respectively capture the pixel-level non-facial
+region attributes and the semantic non-identity attributes in the face region.
+To further reinforce the disentanglement learning for the target encoder, we
+employ both an identity removal loss via adversarial training (i.e., GAN) and a
+non-identity preservation loss via prior 3DMM models like [11]. Extensive
+experiments on both FaceForensics++ and CelebA-HQ show that our results
+significantly outperform previous works on a rich set of metrics, including one
+novel metric for measuring identity consistency that was completely neglected
+before.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Distilling Large Vision-Language Model with Out-of-Distribution + Generalizability ICCV + + +
+ Large vision-language models have achieved outstanding performance, but their
+size and computational requirements make their deployment on
+resource-constrained devices and time-sensitive tasks impractical. Model
+distillation, the process of creating smaller, faster models that maintain the
+performance of larger models, is a promising direction towards the solution.
+This paper investigates the distillation of visual representations in large
+teacher vision-language models into lightweight student models using a small-
+or mid-scale dataset. Notably, this study focuses on open-vocabulary
+out-of-distribution (OOD) generalization, a challenging problem that has been
+overlooked in previous model distillation literature. We propose two principles
+from vision and language modality perspectives to enhance the student's OOD
+generalization: (1) better imitating the teacher's visual representation space
+and carefully promoting better coherence in vision-language alignment with the
+teacher; (2) enriching the teacher's language representations with informative
+and fine-grained semantic attributes to effectively distinguish between
+different labels. We propose several metrics and conduct extensive experiments
+to investigate these techniques. The results demonstrate significant
+improvements in zero-shot and few-shot student performance on open-vocabulary
+out-of-distribution classification, highlighting the effectiveness of our
+proposed approaches. Code released at
+https://github.com/xuanlinli17/large_vlm_distillation_ood
+
+
+ comment: Published at International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ OPHAvatars: One-shot Photo-realistic Head Avatars + + +
+ We propose a method for synthesizing photo-realistic digital avatars from
+only one portrait as the reference. Given a portrait, our method synthesizes a
+coarse talking head video using driving keypoint features. With the coarse
+video, our method synthesizes a coarse talking head avatar with a deforming
+neural radiance field. With rendered images of the coarse avatar, our method
+updates the low-quality images with a blind face restoration model. With the
+updated images, we retrain the avatar for higher quality. After several
+iterations, our method can synthesize a photo-realistic animatable 3D neural
+head avatar. The motivation of our method is that a deformable neural radiance
+field can eliminate the unnatural distortion caused by the image2video method.
+Our method outperforms state-of-the-art methods in quantitative and qualitative
+studies on various subjects.
+
+
+ comment: code: https://github.com/lsx0101/OPHAvatars +
+
+
+
+
+ + ♻ ☆ AirNet: Neural Network Transmission over the Air + + +
+ State-of-the-art performance for many edge applications is achieved by deep +neural networks (DNNs). Often, these DNNs are location- and time-sensitive, and +must be delivered over a wireless channel rapidly and efficiently. In this +paper, we introduce AirNet, a family of novel training and transmission methods +that allow DNNs to be efficiently delivered over wireless channels under +stringent transmit power and latency constraints. This corresponds to a new +class of joint source-channel coding problems, aimed at delivering DNNs with +the goal of maximizing their accuracy at the receiver, rather than recovering +them with high fidelity. In AirNet, we propose the direct mapping of the DNN +parameters to transmitted channel symbols, while the network is trained to meet +the channel constraints, and exhibit robustness against channel noise. AirNet +achieves higher accuracy compared to separation-based alternatives. We further +improve the performance of AirNet by pruning the network below the available +bandwidth, and expanding it for improved robustness. We also benefit from +unequal error protection by selectively expanding important layers of the +network. Finally, we develop an approach, which simultaneously trains a +spectrum of DNNs, each targeting a different channel condition, resolving the +impractical memory requirements of training distinct networks for different +channel conditions. + +
+
+
+
+
+ + ♻ ☆ Fairness in AI and Its Long-Term Implications on Society + + +
+ Successful deployment of artificial intelligence (AI) in various settings has +led to numerous positive outcomes for individuals and society. However, AI +systems have also been shown to harm parts of the population due to biased +predictions. AI fairness focuses on mitigating such biases to ensure AI +decision making is not discriminatory towards certain groups. We take a closer +look at AI fairness and analyze how lack of AI fairness can lead to deepening +of biases over time and act as a social stressor. More specifically, we discuss +how biased models can lead to more negative real-world outcomes for certain +groups, which may then become more prevalent by deploying new AI models trained +on increasingly biased data, resulting in a feedback loop. If the issues +persist, they could be reinforced by interactions with other risks and have +severe implications on society in the form of social unrest. We examine current +strategies for improving AI fairness, assess their limitations in terms of +real-world deployment, and explore potential paths forward to ensure we reap +AI's benefits without causing society's collapse. + +
+
+ comment: Stanford Existential Risks Conference 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ UniMatch: A Unified User-Item Matching Framework for the Multi-purpose + Merchant Marketing + + +
+ When doing private domain marketing with cloud services, merchants usually
+have to purchase different machine learning models for multiple marketing
+purposes, leading to very high costs. We present a unified user-item matching
+framework to simultaneously conduct item recommendation and user targeting with
+just one model. We empirically demonstrate that such concurrent modeling is
+viable via modeling the user-item interaction matrix with the multinomial
+distribution, and propose a bidirectional bias-corrected NCE loss for the
+implementation. The proposed loss function guides the model to learn the
+user-item joint probability $p(u,i)$ instead of the conditional probability
+$p(i|u)$ or $p(u|i)$ by correcting both the users' and items' biases caused by
+in-batch negative sampling. In addition, our framework is model-agnostic,
+enabling a flexible adaptation of different model architectures. Extensive
+experiments demonstrate that our framework results in significant performance
+gains in comparison with the state-of-the-art methods, with greatly reduced
+cost on computing resources and daily maintenance.
+
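+ The exact loss is defined in the paper; as a rough, hypothetical sketch of
+the idea (function and variable names are ours, not the paper's), an in-batch
+sampled softmax can be corrected on both sides by subtracting log sampling
+probabilities of items and of users, steering the model towards the joint
+probability p(u,i) rather than a single conditional:
+
+    import torch
+    import torch.nn.functional as F
+
+    def bidirectional_corrected_nce(user_emb, item_emb, user_logq, item_logq, tau=0.07):
+        # user_emb, item_emb: (B, d) embeddings of matched user-item pairs.
+        # user_logq, item_logq: (B,) log in-batch sampling probabilities
+        # (e.g. log popularity), used to correct the bias of in-batch negatives.
+        logits = user_emb @ item_emb.t() / tau            # (B, B) all user-item scores
+        labels = torch.arange(logits.size(0), device=logits.device)
+        loss_u2i = F.cross_entropy(logits - item_logq.unsqueeze(0), labels)      # item-side correction
+        loss_i2u = F.cross_entropy(logits.t() - user_logq.unsqueeze(0), labels)  # user-side correction
+        return 0.5 * (loss_u2i + loss_i2u)
+
+    u, i = torch.randn(32, 64), torch.randn(32, 64)
+    logq = torch.full((32,), -3.0)
+    print(bidirectional_corrected_nce(u, i, logq, logq))
+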
+
+
+
+
+ + ☆ Our Model Achieves Excellent Performance on MovieLens: What Does it + Mean? + + +
+ A typical benchmark dataset for recommender system (RecSys) evaluation
+consists of user-item interactions generated on a platform within a time
+period. The interaction generation mechanism partially explains why a user
+interacts with (e.g., like, purchase, rate) an item, and the context in which a
+particular interaction happened. In this study, we conduct a meticulous
+analysis of the MovieLens dataset and explain the potential impact of using the
+dataset for evaluating recommendation algorithms. We make a few main findings
+from our analysis. First, there are significant differences in user
+interactions at the different stages when a user interacts with the MovieLens
+platform. The early interactions largely define the user portrait, which
+affects the subsequent interactions. Second, user interactions are highly
+affected by the candidate movies that are recommended by the platform's
+internal recommendation algorithm(s). Removal of interactions that happen
+nearer to the last few interactions of a user leads to increasing difficulty in
+learning user preference, thus deteriorating recommendation accuracy. Third,
+changing the order of user interactions makes it more difficult for sequential
+algorithms to capture the progressive interaction process. Based on these
+findings, we further discuss the discrepancy between the interaction generation
+mechanism that is employed by the MovieLens system and that of typical real
+world recommendation scenarios. In summary, models that achieve excellent
+recommendation accuracy on the MovieLens dataset may not demonstrate superior
+performance in practice, due to at least two kinds of differences: (i) the
+differences in the contexts of user-item interaction generation, and (ii) the
+differences in user knowledge about the item collections.
+
+
+
+
+
+ + ☆ Who Provides the Largest Megaphone? The Role of Google News in Promoting + Russian State-Affiliated News Sources + + +
+ The Internet has not only digitized but also democratized information access +across the globe. This gradual but path-breaking move to online information +propagation has resulted in search engines playing an increasingly prominent +role in shaping access to human knowledge. When an Internet user enters a +query, the search engine sorts through the hundreds of billions of possible +webpages to determine what to show. Google dominates the search engine market, +with Google Search surpassing 80% market share globally every year of the last +decade. Only in Russia and China do Google competitors claim more market share, +with approximately 60% of Internet users in Russia preferring Yandex (compared +to 40% in favor of Google) and more than 80% of China's Internet users +accessing Baidu as of 2022. Notwithstanding this long-standing regional +variation in Internet search providers, there is limited research showing how +these providers compare in terms of propagating state-sponsored information. +Our study fills this research gap by focusing on Russian cyberspace and +examining how Google and Yandex's search algorithms rank content from Russian +state-controlled media (hereon, RSM) outlets. This question is timely and of +practical interest given widespread reports indicating that RSM outlets have +actively engaged in promoting Kremlin propaganda in the lead-up to, and in the +aftermath of, the Russian invasion of Ukraine in February 2022. + +
+
+
+
+
+ + ☆ DisCover: Disentangled Music Representation Learning for Cover Song + Identification + + +
+ In the field of music information retrieval (MIR), cover song identification
+(CSI) is a challenging task that aims to identify cover versions of a query
+song from a massive collection. Existing works still suffer from high
+intra-song variances and inter-song correlations, due to the entangled nature
+of version-specific and version-invariant factors in their modeling. In this
+work, we set the goal of disentangling version-specific and version-invariant
+factors, which could make it easier for the model to learn invariant music
+representations for unseen query songs. We analyze the CSI task in a
+disentanglement view with the causal graph technique, and identify the
+intra-version and inter-version effects biasing the invariant learning. To
+block these effects, we propose the disentangled music representation learning
+framework (DisCover) for CSI. DisCover consists of two critical components: (1)
+Knowledge-guided Disentanglement Module (KDM) and (2) Gradient-based
+Adversarial Disentanglement Module (GADM), which block intra-version and
+inter-version biased effects, respectively. KDM minimizes the mutual
+information between the learned representations and version-variant factors
+that are identified with prior domain knowledge. GADM identifies
+version-variant factors by simulating the representation transitions between
+intra-song versions, and exploits adversarial distillation for effect blocking.
+Extensive comparisons with best-performing methods and in-depth analysis
+demonstrate the effectiveness of DisCover and the necessity of disentanglement
+for CSI.
+
+
+
+
+
+ + ☆ Information Retrieval Meets Large Language Models: A Strategic Report + from Chinese IR Community + + +
+ The research field of Information Retrieval (IR) has evolved significantly,
+expanding beyond traditional search to meet diverse user information needs.
+Recently, Large Language Models (LLMs) have demonstrated exceptional
+capabilities in text understanding, generation, and knowledge inference,
+opening up exciting avenues for IR research. LLMs not only facilitate
+generative retrieval but also offer improved solutions for user understanding,
+model evaluation, and user-system interactions. More importantly, the
+synergistic relationship among IR models, LLMs, and humans forms a new
+technical paradigm that is more powerful for information seeking. IR models
+provide real-time and relevant information, LLMs contribute internal knowledge,
+and humans play the central role of demanders and evaluators of the reliability
+of information services. Nevertheless, significant challenges exist, including
+computational costs, credibility concerns, domain-specific limitations, and
+ethical considerations. To thoroughly discuss the transformative impact of LLMs
+on IR research, the Chinese IR community conducted a strategic workshop in
+April 2023, yielding valuable insights. This paper provides a summary of the
+workshop's outcomes, including the rethinking of IR's core values, the mutual
+enhancement of LLMs and IR, the proposal of a novel IR technical paradigm, and
+open challenges.
+
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for + Recommendation and Text Generation KDD + + +
+ Modeling customer shopping intentions is a crucial task for e-commerce, as it +directly impacts user experience and engagement. Thus, accurately understanding +customer preferences is essential for providing personalized recommendations. +Session-based recommendation, which utilizes customer session data to predict +their next interaction, has become increasingly popular. However, existing +session datasets have limitations in terms of item attributes, user diversity, +and dataset scale. As a result, they cannot comprehensively capture the +spectrum of user behaviors and preferences. To bridge this gap, we present the +Amazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It +is the first multilingual dataset consisting of millions of user sessions from +six different locales, where the major languages of products are English, +German, Japanese, French, Italian, and Spanish. Remarkably, the dataset can +help us enhance personalization and understanding of user preferences, which +can benefit various existing tasks as well as enable new tasks. To test the +potential of the dataset, we introduce three tasks in this work: (1) +next-product recommendation, (2) next-product recommendation with domain +shifts, and (3) next-product title generation. With the above tasks, we +benchmark a range of algorithms on our proposed dataset, drawing new insights +for further research and practice. In addition, based on the proposed dataset +and tasks, we hosted a competition in the KDD CUP 2023 and have attracted +thousands of users and submissions. The winning solutions and the associated +workshop can be accessed at our website https://kddcup23.github.io/. + +
+
+ comment: Dataset for KDD Cup 2023, https://kddcup23.github.io/ +
+
+
+
+
+ + ☆ SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot + Neural Sparse Retrieval SIGIR 2023 + + +
+ Traditionally, sparse retrieval systems that rely on lexical representations
+to retrieve documents, such as BM25, dominated information retrieval tasks.
+With the onset of pre-trained transformer models such as BERT, neural sparse
+retrieval has led to a new paradigm within retrieval. Despite the success,
+there has been limited software supporting different sparse retrievers running
+in a unified, common environment. This hinders practitioners from fairly
+comparing different sparse models and obtaining realistic evaluation results.
+Another missing piece is that a majority of prior work evaluates sparse
+retrieval models on in-domain retrieval, i.e. on a single dataset: MS MARCO.
+However, a key requirement for practical retrieval systems is models that can
+generalize well to unseen out-of-domain, i.e. zero-shot retrieval tasks. In
+this work, we provide SPRINT, a unified Python toolkit based on Pyserini and
+Lucene, supporting a common interface for evaluating neural sparse retrieval.
+The toolkit currently includes five built-in models: uniCOIL, DeepImpact,
+SPARTA, TILDEv2 and SPLADEv2. Users can also easily add customized models by
+defining their term weighting method. Using our toolkit, we establish strong
+and reproducible zero-shot sparse retrieval baselines across the
+well-acknowledged benchmark, BEIR. Our results demonstrate that SPLADEv2
+achieves the best average score of 0.470 nDCG@10 on BEIR amongst all neural
+sparse retrievers. In this work, we further uncover the reasons behind its
+performance gain. We show that SPLADEv2 produces sparse representations with a
+majority of tokens outside of the original query and document, which is often
+crucial for its performance gains, i.e. a limitation among its other sparse
+counterparts. We provide our SPRINT toolkit, models, and data used in our
+experiments publicly here at https://github.com/thakur-nandan/sprint.
+
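+ Independently of the toolkit's actual interface (the sketch below is ours and
+does not use SPRINT's API), the scoring shared by these neural sparse
+retrievers can be pictured as a dot product between term-weight dictionaries
+produced for the query and for each document, which is exactly what an
+inverted index evaluates:
+
+    def sparse_score(query_weights, doc_weights):
+        # Both arguments map token -> learned non-negative weight; the score is
+        # the dot product over shared tokens.
+        return sum(w * doc_weights.get(t, 0.0) for t, w in query_weights.items())
+
+    query = {"neural": 1.2, "retrieval": 0.9, "sparse": 0.7}
+    docs = {
+        "d1": {"sparse": 1.1, "retrieval": 0.8, "index": 0.3},
+        "d2": {"dense": 0.9, "vector": 0.6},
+    }
+    ranking = sorted(docs, key=lambda d: sparse_score(query, docs[d]), reverse=True)
+    print(ranking)  # ['d1', 'd2']
+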
+
+ comment: Accepted at SIGIR 2023 (Resource Track) +
+
+
+
+
+ + ☆ Fast Approximate Nearest Neighbor Search with a Dynamic Exploration + Graph using Continuous Refinement + + +
+ For approximate nearest neighbor search, graph-based algorithms have been
+shown to offer the best trade-off between accuracy and search time. We propose
+the Dynamic Exploration Graph (DEG) which significantly outperforms existing
+algorithms in terms of search and exploration efficiency by combining two new
+ideas: First, a single undirected even regular graph is incrementally built by
+partially replacing existing edges to integrate new vertices and to update old
+neighborhoods at the same time. Second, an edge optimization algorithm is used
+to continuously improve the quality of the graph. Combining this ongoing
+refinement with the graph construction process leads to a well-organized graph
+structure at all times, resulting in: (1) increased search efficiency, (2)
+predictable index size, (3) guaranteed connectivity and therefore reachability
+of all vertices, and (4) a dynamic graph structure. In addition, we investigate
+how well existing graph-based search systems can handle indexed queries where
+the seed vertex of a search is the query itself. Such exploration tasks,
+despite their good starting point, are not necessarily easy. High efficiency in
+approximate nearest neighbor search (ANNS) does not automatically imply good
+performance in exploratory search. Extensive experiments show that our new
+Dynamic Exploration Graph outperforms existing algorithms significantly for
+indexed and unindexed queries.
+
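+ For readers unfamiliar with graph-based ANNS, the search procedure shared by
+such graphs (sketched here in a simplified form that is not DEG's actual
+construction or code; parameter names are ours) is a greedy best-first
+traversal that repeatedly expands the closest frontier vertex and keeps a
+bounded pool of the best candidates seen so far:
+
+    import heapq
+    import random
+
+    def greedy_graph_search(graph, dist, seed, query, k=5, ef=16):
+        # graph: vertex -> iterable of neighbor vertices; dist(v, query): distance.
+        # ef bounds the candidate pool (an illustrative parameter name).
+        d0 = dist(seed, query)
+        visited, frontier, best = {seed}, [(d0, seed)], [(-d0, seed)]
+        while frontier:
+            d, v = heapq.heappop(frontier)
+            if len(best) >= ef and d > -best[0][0]:
+                break                                  # frontier is worse than all kept results
+            for u in graph[v]:
+                if u in visited:
+                    continue
+                visited.add(u)
+                du = dist(u, query)
+                heapq.heappush(frontier, (du, u))
+                heapq.heappush(best, (-du, u))
+                if len(best) > ef:
+                    heapq.heappop(best)                # drop the current worst result
+        return sorted((-nd, u) for nd, u in best)[:k]
+
+    # Toy usage: random points and a random (non-optimized) neighborhood graph.
+    pts = {i: (random.random(), random.random()) for i in range(50)}
+    graph = {i: random.sample([j for j in pts if j != i], 6) for i in pts}
+    d = lambda v, q: (pts[v][0] - q[0]) ** 2 + (pts[v][1] - q[1]) ** 2
+    print(greedy_graph_search(graph, d, seed=0, query=(0.5, 0.5)))
+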
+
+
+
+
+ + ☆ Classification of Visualization Types and Perspectives in Patents + + +
+ Due to the swift growth of patent applications each year, information and +multimedia retrieval approaches that facilitate patent exploration and +retrieval are of utmost importance. Different types of visualizations (e.g., +graphs, technical drawings) and perspectives (e.g., side view, perspective) are +used to visualize details of innovations in patents. The classification of +these images enables a more efficient search and allows for further analysis. +So far, datasets for image type classification miss some important +visualization types for patents. Furthermore, related work does not make use of +recent deep learning approaches including transformers. In this paper, we adopt +state-of-the-art deep learning methods for the classification of visualization +types and perspectives in patent images. We extend the CLEF-IP dataset for +image type classification in patents to ten classes and provide manual ground +truth annotations. In addition, we derive a set of hierarchical classes from a +dataset that provides weakly-labeled data for image perspectives. Experimental +results have demonstrated the feasibility of the proposed approaches. Source +code, models, and dataset will be made publicly available. + +
+
+ comment: Accepted in International Conference on Theory and Practice of + Digital Libraries (TPDL) 2023 (They have the copyright to publish + camera-ready version of this work) +
+
+
+
+
+ + ☆ IncDSI: Incrementally Updatable Document Retrieval + + +
+ Differentiable Search Index is a recently proposed paradigm for document
+retrieval that encodes information about a corpus of documents within the
+parameters of a neural network and directly maps queries to corresponding
+documents. These models have achieved state-of-the-art performance for
+document retrieval across many benchmarks. However, these kinds of models have
+a significant limitation: it is not easy to add new documents after a model is
+trained. We propose IncDSI, a method to add documents in real time (about
+20-50ms per document), without retraining the model on the entire dataset (or
+even parts thereof). Instead, we formulate the addition of documents as a
+constrained optimization problem that makes minimal changes to the network
+parameters. Although orders of magnitude faster, our approach is competitive
+with re-training the model on the whole dataset and enables the development of
+document retrieval systems that can be updated with new information in
+real-time. Our code for IncDSI is available at
+https://github.com/varshakishore/IncDSI.
+
+
+
+
+
+ + ☆ Mood Classification of Bangla Songs Based on Lyrics + + +
+ Music can evoke various emotions, and with the advancement of technology, it
+has become more accessible to people. Bangla music, which portrays different
+human emotions, lacks sufficient research. The authors of this article aim to
+analyze Bangla songs and classify their moods based on the lyrics. To achieve
+this, the research compiles a dataset of 4,000 Bangla song lyrics and genres,
+and analyzes the data using Natural Language Processing and the BERT algorithm.
+Among the 4,000 songs, 1,513 represent the sad mood, 1,362 the romantic mood,
+886 happiness, and the remaining 239 are classified as relaxation. By embedding
+the lyrics of the songs, the authors have classified the songs into four moods:
+Happy, Sad, Romantic, and Relaxed. This research is crucial as it enables a
+multi-class classification of songs' moods, making the music more relatable to
+people's emotions. The article presents the automated results of the four moods
+accurately derived from the song lyrics.
+
+
+ comment: Presented at International Conference on. Inventive Communication and + Computational Technologies 2023 +
+
+
+
+
+ + ♻ ☆ Trustworthy Recommender Systems + + +
+ Recommender systems (RSs) aim to help users to effectively retrieve items of
+their interest from a large catalogue. For quite a long period of time,
+researchers and practitioners have been focusing on developing accurate RSs.
+Recent years have witnessed an increasing number of threats to RSs, coming from
+attacks, system- and user-generated noise, and system bias. As a result, it has
+become clear that a strict focus on RS accuracy is limited and the research
+must consider other important factors, e.g., trustworthiness. For end users, a
+trustworthy RS (TRS) should not only be accurate, but also transparent,
+unbiased and fair as well as robust to noise or attacks. These observations
+actually led to a paradigm shift of the research on RSs: from accuracy-oriented
+RSs to TRSs. However, researchers lack a systematic overview and discussion of
+the literature in this novel and fast developing field of TRSs. To this end, in
+this paper, we provide an overview of TRSs, including a discussion of the
+motivation and basic concepts of TRSs, a presentation of the challenges in
+building TRSs, and a perspective on the future directions in this area. We also
+provide a novel conceptual framework to support the construction of TRSs.
+
+
+
+
+
+ + ♻ ☆ Injecting Domain Adaptation with Learning-to-hash for Effective and + Efficient Zero-shot Dense Retrieval + + +
+ Dense retrieval overcomes the lexical gap and has shown great success in
+ad-hoc information retrieval (IR). Despite their success, dense retrievers are
+expensive to serve across practical use cases. For use cases requiring search
+over millions of documents, the dense index becomes bulky and requires high
+memory usage for storing the index. More recently, learning-to-hash (LTH)
+techniques, e.g., BPR and JPQ, produce binary document vectors, thereby
+reducing the memory requirement to efficiently store the dense index. LTH
+techniques are supervised and finetune the retriever using a ranking loss. They
+outperform their counterparts, i.e., traditional out-of-the-box vector
+compression techniques such as PCA or PQ. A missing piece from prior work is
+that existing techniques have been evaluated only in-domain, i.e., on a single
+dataset such as MS MARCO. In our work, we evaluate LTH and vector compression
+techniques for improving the downstream zero-shot retrieval accuracy of the
+TAS-B dense retriever while maintaining efficiency at inference. Our results
+demonstrate that, unlike prior work, LTH strategies when applied naively can
+underperform the zero-shot TAS-B dense retriever on average by up to 14%
+nDCG@10 on the BEIR benchmark. To solve this limitation, in our work, we
+propose an easy yet effective solution of injecting domain adaptation with
+existing supervised LTH techniques. We experiment with two well-known
+unsupervised domain adaptation techniques: GenQ and GPL. Our domain adaptation
+injection technique can improve the downstream zero-shot retrieval
+effectiveness for both BPR and JPQ variants of the TAS-B model by on average
+11.5% and 8.2% nDCG@10 while both maintaining 32$\times$ memory efficiency and
+14$\times$ and 2$\times$ speedup respectively in CPU retrieval latency on BEIR.
+All our code, models, and data are publicly available at
+https://github.com/thakur-nandan/income.
+
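+ As background on why hashing shrinks the index (a generic illustration, not
+BPR, JPQ, or the TAS-B pipeline), a naive unsupervised baseline binarizes dense
+document vectors by sign and ranks by Hamming distance over the packed codes,
+cutting float32 storage by roughly 32x:
+
+    import numpy as np
+
+    def binarize(x):
+        # Sign binarization packs each dimension into one bit.
+        return np.packbits((x > 0).astype(np.uint8), axis=-1)
+
+    def hamming(codes, qcode):
+        # Popcount of XOR-ed packed codes = Hamming distance to the query code.
+        return np.unpackbits(np.bitwise_xor(codes, qcode), axis=-1).sum(axis=-1)
+
+    docs = np.random.randn(10000, 768).astype(np.float32)
+    query = np.random.randn(768).astype(np.float32)
+    codes, qcode = binarize(docs), binarize(query)
+    top10 = np.argsort(hamming(codes, qcode))[:10]
+    print(top10)
+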
+
+ comment: Accepted at ReNeuIR 2023 Workshop +
+
+
+
+
+ + ♻ ☆ Deep Exploration for Recommendation Systems + + +
+ Modern recommendation systems ought to benefit by probing for and learning +from delayed feedback. Research has tended to focus on learning from a user's +response to a single recommendation. Such work, which leverages methods of +supervised and bandit learning, forgoes learning from the user's subsequent +behavior. Where past work has aimed to learn from subsequent behavior, there +has been a lack of effective methods for probing to elicit informative delayed +feedback. Effective exploration through probing for delayed feedback becomes +particularly challenging when rewards are sparse. To address this, we develop +deep exploration methods for recommendation systems. In particular, we +formulate recommendation as a sequential decision problem and demonstrate +benefits of deep exploration over single-step exploration. Our experiments are +carried out with high-fidelity industrial-grade simulators and establish large +improvements over existing algorithms. + +
+
+
+
+
+ + ♻ ☆ Improving Text Matching in E-Commerce Search with A Rationalizable, + Intervenable and Fast Entity-Based Relevance Model + + +
+ Discovering the intended items of user queries from a massive repository of +items is one of the main goals of an e-commerce search system. Relevance +prediction is essential to the search system since it helps improve +performance. When online serving a relevance model, the model is required to +perform fast and accurate inference. Currently, the widely used models such as +Bi-encoder and Cross-encoder have their limitations in accuracy or inference +speed respectively. In this work, we propose a novel model called the +Entity-Based Relevance Model (EBRM). We identify the entities contained in an +item and decompose the QI (query-item) relevance problem into multiple QE +(query-entity) relevance problems; we then aggregate their results to form the +QI prediction using a soft logic formulation. The decomposition allows us to +use a Cross-encoder QE relevance module for high accuracy as well as cache QE +predictions for fast online inference. Utilizing soft logic makes the +prediction procedure interpretable and intervenable. We also show that +pretraining the QE module with auto-generated QE data from user logs can +further improve the overall performance. The proposed method is evaluated on +labeled data from e-commerce websites. Empirical results show that it achieves +promising improvements with computation efficiency. + +
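+ The aggregation step can be pictured with a tiny soft-logic sketch of our own
+(the paper's exact formulation may differ): if each entity of an item has a
+cached query-entity (QE) relevance probability, a noisy-OR style combination
+yields the query-item (QI) prediction, and caching the QE scores keeps online
+inference fast:
+
+    def qi_from_qe(qe_probs):
+        # Soft OR over query-entity probabilities: the item is relevant if at
+        # least one of its entities is relevant to the query (illustrative rule).
+        p_none = 1.0
+        for p in qe_probs:
+            p_none *= (1.0 - p)
+        return 1.0 - p_none
+
+    # Hypothetical cached QE scores for the query "red dress".
+    qe_cache = {("red dress", "dress"): 0.9, ("red dress", "red"): 0.8,
+                ("red dress", "cotton"): 0.1}
+    item_entities = ["dress", "red", "cotton"]
+    print(qi_from_qe([qe_cache[("red dress", e)] for e in item_entities]))  # ~0.982
+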
+
+
+
+
+
+
+
+ + Machine Learning 116 + +
+
+
+ + ☆ LightPath: Lightweight and Scalable Path Representation Learning KDD-23 + + +
+ Movement paths are used widely in intelligent transportation and smart city +applications. To serve such applications, path representation learning aims to +provide compact representations of paths that enable efficient and accurate +operations when used for different downstream tasks such as path ranking and +travel cost estimation. In many cases, it is attractive that the path +representation learning is lightweight and scalable; in resource-limited +environments and under green computing limitations, it is essential. Yet, +existing path representation learning studies focus on accuracy and pay at most +secondary attention to resource consumption and scalability. + We propose a lightweight and scalable path representation learning framework, +termed LightPath, that aims to reduce resource consumption and achieve +scalability without affecting accuracy, thus enabling broader applicability. +More specifically, we first propose a sparse auto-encoder that ensures that the +framework achieves good scalability with respect to path length. Next, we +propose a relational reasoning framework to enable faster training of more +robust sparse path encoders. We also propose global-local knowledge +distillation to further reduce the size and improve the performance of sparse +path encoders. Finally, we report extensive experiments on two real-world +datasets to offer insight into the efficiency, scalability, and effectiveness +of the proposed framework. + +
+
+ comment: This paper has been accepted by ACM SIGKDD-23 +
+
+
+
+
+ + ☆ Challenges and Applications of Large Language Models + + +
+ Large Language Models (LLMs) went from non-existent to ubiquitous in the +machine learning discourse within a few years. Due to the fast pace of the +field, it is difficult to identify the remaining challenges and already +fruitful application areas. In this paper, we aim to establish a systematic set +of open problems and application successes so that ML researchers can +comprehend the field's current state more quickly and become productive. + +
+
+ comment: 72 pages. v01. Work in progress. Feedback and comments are highly + appreciated! +
+
+
+
+
+ + ☆ VITS : Variational Inference Thomson Sampling for contextual bandits + + +
+ In this paper, we introduce and analyze a variant of the Thompson sampling
+(TS) algorithm for contextual bandits. At each round, traditional TS requires
+samples from the current posterior distribution, which is usually intractable.
+To circumvent this issue, approximate inference techniques can be used to
+provide samples whose distribution is close to the posterior. However, current
+approximate techniques yield either poor estimation (Laplace approximation) or
+can be computationally expensive (MCMC methods, ensemble sampling, ...). In
+this paper, we propose a new algorithm, Variational Inference Thompson Sampling
+(VITS), based on Gaussian variational inference. This scheme provides powerful
+posterior approximations which are easy to sample from, and is computationally
+efficient, making it an ideal choice for TS. In addition, we show that VITS
+achieves a sub-linear regret bound of the same order in the dimension and
+number of rounds as traditional TS for linear contextual bandits. Finally, we
+demonstrate experimentally the effectiveness of VITS on both synthetic and real
+world datasets.
+
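+ For readers new to TS, the sketch below runs a linear-Gaussian contextual
+bandit with the exact conjugate Gaussian posterior rather than the variational
+approximation studied in the paper (all constants are illustrative); VITS would
+replace the posterior-sampling line with a sample from a learned Gaussian
+variational approximation:
+
+    import numpy as np
+
+    d, n_arms, T, sigma2 = 5, 10, 2000, 0.25
+    rng = np.random.default_rng(0)
+    theta_true = rng.normal(size=d)
+
+    A = np.eye(d)                  # posterior precision (prior = standard normal)
+    b = np.zeros(d)                # precision-weighted mean accumulator
+    for t in range(T):
+        contexts = rng.normal(size=(n_arms, d))              # one feature vector per arm
+        cov = np.linalg.inv(A)
+        theta_sample = rng.multivariate_normal(cov @ b, cov) # Thompson sample
+        arm = int(np.argmax(contexts @ theta_sample))
+        x = contexts[arm]
+        reward = x @ theta_true + rng.normal(scale=np.sqrt(sigma2))
+        A += np.outer(x, x) / sigma2                         # Bayesian linear regression update
+        b += x * reward / sigma2
+
+    print("estimation error:", np.linalg.norm(np.linalg.inv(A) @ b - theta_true))
+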
+
+
+
+
+ + ☆ Rethinking Backdoor Attacks ICML 2023 + + +
+ In a backdoor attack, an adversary inserts maliciously constructed backdoor +examples into a training set to make the resulting model vulnerable to +manipulation. Defending against such attacks typically involves viewing these +inserted examples as outliers in the training set and using techniques from +robust statistics to detect and remove them. + In this work, we present a different approach to the backdoor attack problem. +Specifically, we show that without structural information about the training +data distribution, backdoor attacks are indistinguishable from +naturally-occurring features in the data--and thus impossible to "detect" in a +general sense. Then, guided by this observation, we revisit existing defenses +against backdoor attacks and characterize the (often latent) assumptions they +make and on which they depend. Finally, we explore an alternative perspective +on backdoor attacks: one that assumes these attacks correspond to the strongest +feature in the training data. Under this assumption (which we make formal) we +develop a new primitive for detecting backdoor attacks. Our primitive naturally +gives rise to a detection algorithm that comes with theoretical guarantees and +is effective in practice. + +
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ Robust Driving Policy Learning with Guided Meta Reinforcement Learning SC 2023 + + +
+ Although deep reinforcement learning (DRL) has shown promising results for +autonomous navigation in interactive traffic scenarios, existing work typically +adopts a fixed behavior policy to control social vehicles in the training +environment. This may cause the learned driving policy to overfit the +environment, making it difficult to interact well with vehicles with different, +unseen behaviors. In this work, we introduce an efficient method to train +diverse driving policies for social vehicles as a single meta-policy. By +randomizing the interaction-based reward functions of social vehicles, we can +generate diverse objectives and efficiently train the meta-policy through +guiding policies that achieve specific objectives. We further propose a +training strategy to enhance the robustness of the ego vehicle's driving policy +using the environment where social vehicles are controlled by the learned +meta-policy. Our method successfully learns an ego driving policy that +generalizes well to unseen situations with out-of-distribution (OOD) social +agents' behaviors in a challenging uncontrolled T-intersection scenario. + +
+
+ comment: ITSC 2023 +
+
+
+
+
+ + ☆ Curvature-based Clustering on Graphs + + +
+ Unsupervised node clustering (or community detection) is a classical graph +learning task. In this paper, we study algorithms, which exploit the geometry +of the graph to identify densely connected substructures, which form clusters +or communities. Our method implements discrete Ricci curvatures and their +associated geometric flows, under which the edge weights of the graph evolve to +reveal its community structure. We consider several discrete curvature notions +and analyze the utility of the resulting algorithms. In contrast to prior +literature, we study not only single-membership community detection, where each +node belongs to exactly one community, but also mixed-membership community +detection, where communities may overlap. For the latter, we argue that it is +beneficial to perform community detection on the line graph, i.e., the graph's +dual. We provide both theoretical and empirical evidence for the utility of our +curvature-based clustering algorithms. In addition, we give several results on +the relationship between the curvature of a graph and that of its dual, which +enable the efficient implementation of our proposed mixed-membership community +detection approach and which may be of independent interest for curvature-based +network analysis. + +
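+ One common form of the (triangle-augmented) Forman curvature on an unweighted
+graph is simple to state: 4 - deg(u) - deg(v) plus 3 times the number of
+triangles containing the edge (u, v). The toy example below (our illustration,
+not the paper's code) shows how edges inside dense communities pick up positive
+curvature from triangles while a bridge between communities stays negative,
+which is the signal that curvature-based clustering exploits:
+
+    from itertools import combinations
+
+    def augmented_forman(adj, u, v):
+        # adj: dict node -> set of neighbors (undirected, unweighted graph).
+        triangles = len(adj[u] & adj[v])
+        return 4 - len(adj[u]) - len(adj[v]) + 3 * triangles
+
+    # Two 4-cliques joined by a single bridge edge (3, 4).
+    edges = list(combinations([0, 1, 2, 3], 2)) + list(combinations([4, 5, 6, 7], 2)) + [(3, 4)]
+    adj = {n: set() for n in range(8)}
+    for a, b in edges:
+        adj[a].add(b)
+        adj[b].add(a)
+
+    print(augmented_forman(adj, 0, 1))   # intra-community edge: positive curvature
+    print(augmented_forman(adj, 3, 4))   # bridge edge: negative curvature
+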
+
+ comment: 65 pages, 19 figures +
+
+
+
+
+ + ☆ Benchmarking Potential Based Rewards for Learning Humanoid Locomotion + + +
+ The main challenge in developing effective reinforcement learning (RL)
+pipelines is often the design and tuning of the reward functions. A
+well-designed shaping reward can lead to significantly faster learning.
+Naively formulated rewards, however, can conflict with the desired behavior
+and result in overfitting or even erratic performance if not properly tuned. In
+theory, the broad class of potential based reward shaping (PBRS) can help guide
+the learning process without affecting the optimal policy. Although several
+studies have explored the use of potential based reward shaping to accelerate
+learning convergence, most have been limited to grid-worlds and low-dimensional
+systems, and RL in robotics has predominantly relied on standard forms of
+reward shaping. In this paper, we benchmark standard forms of shaping with PBRS
+for a humanoid robot. We find that in this high-dimensional system, PBRS has
+only marginal benefits in convergence speed. However, the PBRS reward terms are
+significantly more robust to scaling than typical reward shaping approaches,
+and thus easier to tune.
+
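+ For reference, the PBRS term being benchmarked has the standard form
+F(s, s') = gamma * Phi(s') - Phi(s), which leaves the optimal policy unchanged
+for any potential Phi. A minimal wrapper sketch (our illustration, with a
+hypothetical potential for a locomotion task):
+
+    def shaped_reward(r, s, s_next, phi, gamma=0.99, done=False):
+        # Potential-based shaping: F(s, s') = gamma * phi(s') - phi(s).
+        # Taking phi = 0 at terminal states preserves policy invariance.
+        phi_next = 0.0 if done else phi(s_next)
+        return r + gamma * phi_next - phi(s)
+
+    # Hypothetical potential rewarding forward progress of the torso.
+    phi = lambda state: 2.0 * state["torso_x"]
+    print(shaped_reward(r=1.0, s={"torso_x": 3.0}, s_next={"torso_x": 3.1}, phi=phi))
+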
+
+
+
+
+ + ☆ Gradient Sparsification For Masked Fine-Tuning of Transformers IJCNN 2023 + + +
+ Fine-tuning pretrained self-supervised language models is widely adopted for
+transfer learning to downstream tasks. Fine-tuning can be achieved by freezing
+gradients of the pretrained network and only updating gradients of a newly
+added classification layer, or by performing gradient updates on all
+parameters. Gradual unfreezing makes a trade-off between the two by gradually
+unfreezing gradients of whole layers during training. This has been an
+effective strategy to trade off storage and training speed against
+generalization performance. However, it is not clear whether gradually
+unfreezing layers throughout training is optimal, compared to sparse variants
+of gradual unfreezing which may improve fine-tuning performance. In this paper,
+we propose to stochastically mask gradients to regularize pretrained language
+models for improving overall fine-tuned performance. We introduce GradDrop and
+variants thereof, a class of gradient sparsification methods that mask
+gradients during the backward pass, acting as gradient noise. GradDrop is
+sparse and stochastic, unlike gradual freezing. Extensive experiments on the
+multilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive
+against methods that use additional translated data for intermediate
+pretraining and outperforms standard fine-tuning and gradual unfreezing. A
+post-analysis shows how GradDrop improves performance on languages it was not
+trained on, such as under-resourced languages.
+
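+ A plausible minimal form of such gradient masking (our sketch; the paper
+defines GradDrop and several variants, which need not match this exactly)
+multiplies each parameter gradient element-wise by an independent Bernoulli
+mask between backward() and the optimizer step, so only a random subset of
+weights is updated at each iteration:
+
+    import torch
+
+    def graddrop_(model, keep_prob=0.8):
+        # Zero each gradient element with probability 1 - keep_prob (illustrative).
+        for p in model.parameters():
+            if p.grad is not None:
+                mask = torch.bernoulli(torch.full_like(p.grad, keep_prob))
+                p.grad.mul_(mask)
+
+    model = torch.nn.Linear(16, 2)
+    opt = torch.optim.SGD(model.parameters(), lr=0.1)
+    x, y = torch.randn(8, 16), torch.randint(0, 2, (8,))
+    loss = torch.nn.functional.cross_entropy(model(x), y)
+    loss.backward()
+    graddrop_(model, keep_prob=0.8)   # applied between backward() and step()
+    opt.step()
+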
+
+ comment: Accepted to IJCNN 2023 +
+
+
+
+
+ + ☆ Revisiting invariances and introducing priors in Gromov-Wasserstein + distances + + +
+ Gromov-Wasserstein distance has found many applications in machine learning +due to its ability to compare measures across metric spaces and its invariance +to isometric transformations. However, in certain applications, this invariance +property can be too flexible, thus undesirable. Moreover, the +Gromov-Wasserstein distance solely considers pairwise sample similarities in +input datasets, disregarding the raw feature representations. We propose a new +optimal transport-based distance, called Augmented Gromov-Wasserstein, that +allows for some control over the level of rigidity to transformations. It also +incorporates feature alignments, enabling us to better leverage prior knowledge +on the input data for improved performance. We present theoretical insights +into the proposed metric. We then demonstrate its usefulness for single-cell +multi-omic alignment tasks and a transfer learning scenario in machine +learning. + +
+
+
+
+
+ + ☆ Android in the Wild: A Large-Scale Dataset for Android Device Control + + +
+ There is a growing interest in device-control systems that can interpret
+human natural language instructions and execute them on a digital device by
+directly controlling its user interface. We present a dataset for
+device-control research, Android in the Wild (AITW), which is orders of
+magnitude larger than current datasets. The dataset contains human
+demonstrations of device interactions, including the screens and actions, and
+corresponding natural language instructions. It consists of 715k episodes
+spanning 30k unique instructions, four versions of Android (v10-13), and eight
+device types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It
+contains multi-step tasks that require semantic understanding of language and
+visual context. This dataset poses a new challenge: actions available through
+the user interface must be inferred from their visual appearance. Moreover,
+instead of simple UI element-based actions, the action space consists of precise
+gestures (e.g., horizontal scrolls to operate carousel widgets). We organize
+our dataset to encourage robustness analysis of device-control systems, i.e.,
+how well a system performs in the presence of new task descriptions, new
+applications, or new platform versions. We develop two agents and report
+performance across the dataset. The dataset is available at
+https://github.com/google-research/google-research/tree/master/android_in_the_wild.
+
+
+
+
+
+ + ☆ A Dual Formulation for Probabilistic Principal Component Analysis ICML 2023 + + +
+ In this paper, we characterize Probabilistic Principal Component Analysis in
+Hilbert spaces and demonstrate how the optimal solution admits a representation
+in dual space. This allows us to develop a generative framework for kernel
+methods. Furthermore, we show how it encompasses Kernel Principal Component
+Analysis and illustrate how it works on a toy dataset and a real dataset.
+
+
+ comment: ICML 2023 Workshop on Duality for Modern Machine Learning (DP4ML). 14 + pages (8 main + 5 appendix), 4 figures and 4 tables +
+
+
+
+
+ + ☆ Unsupervised Accuracy Estimation of Deep Visual Models using + Domain-Adaptive Adversarial Perturbation without Source Samples ICCV 2023 + + +
+ Deploying deep visual models can lead to performance drops due to the +discrepancies between source and target distributions. Several approaches +leverage labeled source data to estimate target domain accuracy, but accessing +labeled source data is often prohibitively difficult due to data +confidentiality or resource limitations on serving devices. Our work proposes a +new framework to estimate model accuracy on unlabeled target data without +access to source data. We investigate the feasibility of using pseudo-labels +for accuracy estimation and evolve this idea into adopting recent advances in +source-free domain adaptation algorithms. Our approach measures the +disagreement rate between the source hypothesis and the target pseudo-labeling +function, adapted from the source hypothesis. We mitigate the impact of +erroneous pseudo-labels that may arise due to a high ideal joint hypothesis +risk by employing adaptive adversarial perturbation on the input of the target +model. Our proposed source-free framework effectively addresses the challenging +distribution shift scenarios and outperforms existing methods requiring source +data and labels for training. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
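+ The adaptation and adversarial-perturbation steps are specific to the paper
+above; the fragment below only sketches the core quantity it builds on, the
+disagreement rate between a frozen source hypothesis and an adapted target
+pseudo-labelling function on unlabeled target data (all arrays are illustrative).
+
+    import numpy as np
+
+    def disagreement_rate(source_logits, target_logits):
+        """Fraction of unlabeled target samples on which the source hypothesis and
+        the adapted target pseudo-labeler predict different classes; used here as a
+        stand-in proxy for the unknown target-domain error."""
+        src_pred = np.argmax(source_logits, axis=1)
+        tgt_pred = np.argmax(target_logits, axis=1)
+        return float(np.mean(src_pred != tgt_pred))
+
+    # Toy usage with random logits standing in for the two models' outputs.
+    rng = np.random.default_rng(0)
+    print(disagreement_rate(rng.normal(size=(100, 5)), rng.normal(size=(100, 5))))
+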
+ + ☆ Accurate deep learning sub-grid scale models for large eddy simulations + + +
+ We present two families of sub-grid scale (SGS) turbulence models developed
+for large-eddy simulation (LES) purposes. Their development required the
+formulation of physics-informed robust and efficient Deep Learning (DL)
+algorithms which, unlike state-of-the-art analytical modeling techniques, can
+produce high-order complex non-linear relations between inputs and outputs.
+Explicit filtering of data from direct simulations of the canonical channel
+flow at two friction Reynolds numbers $Re_\tau\approx 395$ and 590 provided
+accurate data for training and testing. The two sets of models use different
+network architectures. One of the architectures uses tensor basis neural
+networks (TBNN) and embeds the simplified analytical model form of the general
+effective-viscosity hypothesis, thus incorporating the Galilean, rotational and
+reflectional invariances. The other architecture is a relatively simple network
+that is able to incorporate the Galilean invariance only. However, this simpler
+architecture has better feature extraction capacity owing to its ability to
+establish relations between and extract information from cross-components of
+the integrity basis tensors and the SGS stresses. Both sets of models are used
+to predict the SGS stresses for feature datasets generated with different
+filter widths, and at different Reynolds numbers. It is shown that, owing to
+its better feature learning capabilities, the simpler model outperforms the
+invariance-embedded model in statistical performance metrics. In a priori
+tests, both sets of models provide similar levels of dissipation and
+backscatter. Based on the test results, both sets of models should be usable in
+actual a posteriori LES.
+
+
+
+
+
+ + ☆ Convergence Guarantees for Stochastic Subgradient Methods in Nonsmooth + Nonconvex Optimization + + +
+ In this paper, we investigate the convergence properties of the stochastic +gradient descent (SGD) method and its variants, especially in training neural +networks built from nonsmooth activation functions. We develop a novel +framework that assigns different timescales to stepsizes for updating the +momentum terms and variables, respectively. Under mild conditions, we prove the +global convergence of our proposed framework in both single-timescale and +two-timescale cases. We show that our proposed framework encompasses a wide +range of well-known SGD-type methods, including heavy-ball SGD, SignSGD, Lion, +normalized SGD and clipped SGD. Furthermore, when the objective function adopts +a finite-sum formulation, we prove the convergence properties for these +SGD-type methods based on our proposed framework. In particular, we prove that +these SGD-type methods find the Clarke stationary points of the objective +function with randomly chosen stepsizes and initial points under mild +assumptions. Preliminary numerical experiments demonstrate the high efficiency +of our analyzed SGD-type methods. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Contextual Reliability: When Different Features Matter in Different + Contexts ICML 2023 + + +
+ Deep neural networks often fail catastrophically by relying on spurious +correlations. Most prior work assumes a clear dichotomy into spurious and +reliable features; however, this is often unrealistic. For example, most of the +time we do not want an autonomous car to simply copy the speed of surrounding +cars -- we don't want our car to run a red light if a neighboring car does so. +However, we cannot simply enforce invariance to next-lane speed, since it could +provide valuable information about an unobservable pedestrian at a crosswalk. +Thus, universally ignoring features that are sometimes (but not always) +reliable can lead to non-robust performance. We formalize a new setting called +contextual reliability which accounts for the fact that the "right" features to +use may vary depending on the context. We propose and analyze a two-stage +framework called Explicit Non-spurious feature Prediction (ENP) which first +identifies the relevant features to use for a given context, then trains a +model to rely exclusively on these features. Our work theoretically and +empirically demonstrates the advantages of ENP over existing methods and +provides new benchmarks for contextual reliability. + +
+
+ comment: ICML 2023 Camera Ready Version +
+
+
+
+
+ + ☆ Europepolls: A Dataset of Country-Level Opinion Polling Data for the + European Union and the UK + + +
+ I propose an open dataset of country-level historical opinion polling data +for the European Union and the UK. The dataset aims to fill a gap in available +opinion polling data for the European Union. Some existing datasets are +restricted to the past five years, limiting research opportunities. At the same +time, some larger proprietary datasets exist but are available only in a visual +preprocessed time series format. Finally, while other large datasets for +individual countries might exist, these could be inaccessible due to language +barriers. The data was gathered from Wikipedia, and preprocessed using the +pandas library. Both the raw and the preprocessed data are in the .csv format. +I hope that given the recent advances in LLMs and deep learning in general, +this large dataset will enable researchers to uncover complex interactions +between multimodal data (news articles, economic indicators, social media) and +voting behavior. The raw data, the preprocessed data, and the preprocessing +scripts are available on GitHub. + +
+
+
+
+
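+ A brief, hedged usage sketch for the dataset described above: the file name and
+column names below are hypothetical placeholders, since the actual CSV schema is
+defined in the project's GitHub repository; only pandas calls that certainly exist
+are used.
+
+    import pandas as pd
+
+    # Hypothetical local file name and column names; see the repository for the
+    # real files and their schema.
+    polls = pd.read_csv("europepolls_preprocessed.csv", parse_dates=["date"])
+    print(polls.head())
+
+    # Example: restrict to one country and inspect the most recent polls.
+    print(polls[polls["country"] == "Germany"].sort_values("date").tail())
+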
+ + ☆ TbExplain: A Text-based Explanation Method for Scene Classification + Models with the Statistical Prediction Correction + + +
+ The field of Explainable Artificial Intelligence (XAI) aims to improve the
+interpretability of black-box machine learning models. Building a heatmap based
+on the importance value of input features is a popular method for explaining
+the underlying functions of such models in producing their predictions.
+Heatmaps are broadly understandable to humans, yet they are not without flaws.
+Non-expert users, for example, may not fully understand the logic of heatmaps
+(the logic by which pixels relevant to the model's prediction are highlighted
+with different intensities or colors). Additionally, objects and regions of the
+input image that are relevant to the model prediction are frequently not
+entirely differentiated by heatmaps. In this paper, we propose a framework
+called TbExplain that employs XAI techniques and a pre-trained object detector
+to present text-based explanations of scene classification models. Moreover,
+TbExplain incorporates a novel method to correct predictions and textually
+explain them based on the statistics of objects in the input image when the
+initial prediction is unreliable. To assess the trustworthiness and validity of
+the text-based explanations, we conducted a qualitative experiment, and the
+findings indicated that these explanations are sufficiently reliable.
+Furthermore, our quantitative and qualitative experiments on TbExplain with
+scene classification datasets reveal an improvement in classification accuracy
+over ResNet variants.
+
+
+
+
+
+ + ☆ Impact of Disentanglement on Pruning Neural Networks SC + + +
+ Deploying deep learning neural networks on edge devices, to accomplish task +specific objectives in the real-world, requires a reduction in their memory +footprint, power consumption, and latency. This can be realized via efficient +model compression. Disentangled latent representations produced by variational +autoencoder (VAE) networks are a promising approach for achieving model +compression because they mainly retain task-specific information, discarding +useless information for the task at hand. We make use of the Beta-VAE framework +combined with a standard criterion for pruning to investigate the impact of +forcing the network to learn disentangled representations on the pruning +process for the task of classification. In particular, we perform experiments +on MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose +a path forward for future works. + +
+
+ comment: Presented in ISCS23 +
+
+
+
+
+ + ☆ UniMatch: A Unified User-Item Matching Framework for the Multi-purpose + Merchant Marketing + + +
+ When doing private domain marketing with cloud services, the merchants +usually have to purchase different machine learning models for the multiple +marketing purposes, leading to a very high cost. We present a unified user-item +matching framework to simultaneously conduct item recommendation and user +targeting with just one model. We empirically demonstrate that the above +concurrent modeling is viable via modeling the user-item interaction matrix +with the multinomial distribution, and propose a bidirectional bias-corrected +NCE loss for the implementation. The proposed loss function guides the model to +learn the user-item joint probability $p(u,i)$ instead of the conditional +probability $p(i|u)$ or $p(u|i)$ through correcting both the users and items' +biases caused by the in-batch negative sampling. In addition, our framework is +model-agnostic enabling a flexible adaptation of different model architectures. +Extensive experiments demonstrate that our framework results in significant +performance gains in comparison with the state-of-the-art methods, with greatly +reduced cost on computing resources and daily maintenance. + +
+
+
+
+
+ + ☆ TinyTrain: Deep Neural Network Training at the Extreme Edge + + +
+ On-device training is essential for user personalisation and privacy. With +the pervasiveness of IoT devices and microcontroller units (MCU), this task +becomes more challenging due to the constrained memory and compute resources, +and the limited availability of labelled user data. Nonetheless, prior works +neglect the data scarcity issue, require excessively long training time (e.g. a +few hours), or induce substantial accuracy loss ($\geq$10\%). We propose +TinyTrain, an on-device training approach that drastically reduces training +time by selectively updating parts of the model and explicitly coping with data +scarcity. TinyTrain introduces a task-adaptive sparse-update method that +dynamically selects the layer/channel based on a multi-objective criterion that +jointly captures user data, the memory, and the compute capabilities of the +target device, leading to high accuracy on unseen tasks with reduced +computation and memory footprint. TinyTrain outperforms vanilla fine-tuning of +the entire network by 3.6-5.0\% in accuracy, while reducing the backward-pass +memory and computation cost by up to 2,286$\times$ and 7.68$\times$, +respectively. Targeting broadly used real-world edge devices, TinyTrain +achieves 9.5$\times$ faster and 3.5$\times$ more energy-efficient training over +status-quo approaches, and 2.8$\times$ smaller memory footprint than SOTA +approaches, while remaining within the 1 MB memory envelope of MCU-grade +platforms. + +
+
+
+
+
+ + ☆ Learner Referral for Cost-Effective Federated Learning Over Hierarchical + IoT Networks + + +
+ The paradigm of federated learning (FL) to address data privacy concerns by
+locally training parameters on resource-constrained clients in a distributed
+manner has garnered significant attention. Nonetheless, FL is not applicable
+when not all clients within the coverage of the FL server are registered with
+the FL network. To bridge this gap, this paper proposes joint learner referral
+aided federated client selection (LRef-FedCS), along with communications and
+computing resource scheduling, and local model accuracy optimization (LMAO)
+methods. These methods are designed to minimize the cost incurred by the
+worst-case participant and ensure the long-term fairness of FL in hierarchical
+Internet of Things (HieIoT) networks. Utilizing the Lyapunov optimization
+technique, we reformulate the original problem into a stepwise joint
+optimization problem (JOP). Subsequently, to tackle the mixed-integer
+non-convex JOP, we separately and iteratively address LRef-FedCS and LMAO
+through the centralized method and self-adaptive global best harmony search
+(SGHS) algorithm, respectively. To enhance scalability, we further propose a
+distributed LRef-FedCS approach based on a matching game to replace the
+centralized method described above. Numerical simulations and experimental
+results on the MNIST/CIFAR-10 datasets demonstrate that our proposed LRef-FedCS
+approach could achieve a good balance between pursuing high global accuracy and
+reducing cost.
+
+
+
+
+
+ + ☆ Towards green AI-based software systems: an architecture-centric + approach (GAISSA) + + +
+ Nowadays, AI-based systems have achieved outstanding results and have
+outperformed humans in different domains. However, the processes of training AI
+models and inferring from them require high computational resources, which
+poses a significant challenge given the current societal demand for energy
+efficiency. To cope with this challenge, this research project paper describes
+the main vision, goals, and expected outcomes of the GAISSA project. The GAISSA
+project aims to provide data scientists and software engineers with
+tool-supported, architecture-centric methods for the modelling and development
+of green AI-based systems. Although the project is in an initial stage, we
+describe the current research results, which illustrate the potential to
+achieve GAISSA objectives.
+
+
+ comment: Accepted for publication as full paper - 2023 49th Euromicro + Conference Series on Software Engineering and Advanced Applications (SEAA) +
+
+
+
+
+ + ☆ XSkill: Cross Embodiment Skill Discovery + + +
+ Human demonstration videos are a widely available data source for robot +learning and an intuitive user interface for expressing desired behavior. +However, directly extracting reusable robot manipulation skills from +unstructured human videos is challenging due to the big embodiment difference +and unobserved action parameters. To bridge this embodiment gap, this paper +introduces XSkill, an imitation learning framework that 1) discovers a +cross-embodiment representation called skill prototypes purely from unlabeled +human and robot manipulation videos, 2) transfers the skill representation to +robot actions using conditional diffusion policy, and finally, 3) composes the +learned skill to accomplish unseen tasks specified by a human prompt video. Our +experiments in simulation and real-world environments show that the discovered +skill prototypes facilitate both skill transfer and composition for unseen +tasks, resulting in a more general and scalable imitation learning framework. +The performance of XSkill is best understood from the anonymous website: +https://xskillcorl.github.io. + +
+
+
+
+
+ + ☆ Impatient Bandits: Optimizing for the Long-Term Without Delay KDD + + +
+ Recommender systems are a ubiquitous feature of online platforms. +Increasingly, they are explicitly tasked with increasing users' long-term +satisfaction. In this context, we study a content exploration task, which we +formalize as a multi-armed bandit problem with delayed rewards. We observe that +there is an apparent trade-off in choosing the learning signal: Waiting for the +full reward to become available might take several weeks, hurting the rate at +which learning happens, whereas measuring short-term proxy rewards reflects the +actual long-term goal only imperfectly. We address this challenge in two steps. +First, we develop a predictive model of delayed rewards that incorporates all +information obtained to date. Full observations as well as partial (short or +medium-term) outcomes are combined through a Bayesian filter to obtain a +probabilistic belief. Second, we devise a bandit algorithm that takes advantage +of this new predictive model. The algorithm quickly learns to identify content +aligned with long-term success by carefully balancing exploration and +exploitation. We apply our approach to a podcast recommendation problem, where +we seek to identify shows that users engage with repeatedly over two months. We +empirically validate that our approach results in substantially better +performance compared to approaches that either optimize for short-term proxies, +or wait for the long-term outcome to be fully realized. + +
+
+ comment: Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery + and Data Mining (KDD '23) +
+
+
+
+
+ + ☆ TREEMENT: Interpretable Patient-Trial Matching via Personalized Dynamic + Tree-Based Memory Network + + +
+ Clinical trials are critical for drug development but often suffer from +expensive and inefficient patient recruitment. In recent years, machine +learning models have been proposed for speeding up patient recruitment via +automatically matching patients with clinical trials based on longitudinal +patient electronic health records (EHR) data and eligibility criteria of +clinical trials. However, they either depend on trial-specific expert rules +that cannot expand to other trials or perform matching at a very general level +with a black-box model where the lack of interpretability makes the model +results difficult to be adopted. + To provide accurate and interpretable patient trial matching, we introduce a +personalized dynamic tree-based memory network model named TREEMENT. It +utilizes hierarchical clinical ontologies to expand the personalized patient +representation learned from sequential EHR data, and then uses an attentional +beam-search query learned from eligibility criteria embedding to offer a +granular level of alignment for improved performance and interpretability. We +evaluated TREEMENT against existing models on real-world datasets and +demonstrated that TREEMENT outperforms the best baseline by 7% in terms of +error reduction in criteria-level matching and achieves state-of-the-art +results in its trial-level matching ability. Furthermore, we also show TREEMENT +can offer good interpretability to make the model results easier for adoption. + +
+
+
+
+
+ + ☆ Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to + Harness Spurious Features + + +
+ To avoid failures on out-of-distribution data, recent works have sought to +extract features that have a stable or invariant relationship with the label +across domains, discarding the "spurious" or unstable features whose +relationship with the label changes across domains. However, unstable features +often carry complementary information about the label that could boost +performance if used correctly in the test domain. Our main contribution is to +show that it is possible to learn how to use these unstable features in the +test domain without labels. In particular, we prove that pseudo-labels based on +stable features provide sufficient guidance for doing so, provided that stable +and unstable features are conditionally independent given the label. Based on +this theoretical insight, we propose Stable Feature Boosting (SFB), an +algorithm for: (i) learning a predictor that separates stable and +conditionally-independent unstable features; and (ii) using the stable-feature +predictions to adapt the unstable-feature predictions in the test domain. +Theoretically, we prove that SFB can learn an asymptotically-optimal predictor +without test-domain labels. Empirically, we demonstrate the effectiveness of +SFB on real and synthetic data. + +
+
+
+
+
+ + ☆ DISA: DIfferentiable Similarity Approximation for Universal Multimodal + Registration MICCAI 2023 + + +
+ Multimodal image registration is a challenging but essential step for
+numerous image-guided procedures. Most registration algorithms rely on the
+computation of complex, frequently non-differentiable similarity metrics to
+deal with the appearance discrepancy of anatomical structures between imaging
+modalities. Recent Machine Learning based approaches are limited to specific
+anatomy-modality combinations and do not generalize to new settings. We propose
+a generic framework for creating expressive cross-modal descriptors that enable
+fast deformable global registration. We achieve this by approximating existing
+metrics with a dot-product in the feature space of a small convolutional neural
+network (CNN), which is inherently differentiable and can be trained without
+registered data. Our method is several orders of magnitude faster than local
+patch-based metrics and can be directly applied in clinical settings by
+replacing the similarity measure with the proposed one. Experiments on three
+different datasets demonstrate that our approach generalizes well beyond the
+training data, yielding a broad capture range even on unseen anatomies and
+modality pairs, without the need for specialized retraining. We make our
+training code and data publicly available.
+
+
+ comment: This preprint was submitted to MICCAI 2023. The Version of Record of + this contribution will be published in Springer LNCS +
+
+
+
+
+ + ☆ TimeTuner: Diagnosing Time Representations for Time-Series Forecasting + with Counterfactual Explanations + + +
+ Deep learning (DL) approaches are being increasingly used for time-series +forecasting, with many efforts devoted to designing complex DL models. Recent +studies have shown that the DL success is often attributed to effective data +representations, fostering the fields of feature engineering and representation +learning. However, automated approaches for feature learning are typically +limited with respect to incorporating prior knowledge, identifying interactions +among variables, and choosing evaluation metrics to ensure that the models are +reliable. To improve on these limitations, this paper contributes a novel +visual analytics framework, namely TimeTuner, designed to help analysts +understand how model behaviors are associated with localized correlations, +stationarity, and granularity of time-series representations. The system mainly +consists of the following two-stage technique: We first leverage counterfactual +explanations to connect the relationships among time-series representations, +multivariate features and model predictions. Next, we design multiple +coordinated views including a partition-based correlation matrix and juxtaposed +bivariate stripes, and provide a set of interactions that allow users to step +into the transformation selection process, navigate through the feature space, +and reason the model performance. We instantiate TimeTuner with two +transformation methods of smoothing and sampling, and demonstrate its +applicability on real-world time-series forecasting of univariate sunspots and +multivariate air pollutants. Feedback from domain experts indicates that our +system can help characterize time-series representations and guide the feature +engineering processes. + +
+
+ comment: 11 pages, 9 figures, this paper has been accepted by VIS2024 +
+
+
+
+
+ + ☆ Deep projection networks for learning time-homogeneous dynamical systems + + +
+ We consider the general class of time-homogeneous dynamical systems, both +discrete and continuous, and study the problem of learning a meaningful +representation of the state from observed data. This is instrumental for the +task of learning a forward transfer operator of the system, that in turn can be +used for forecasting future states or observables. The representation, +typically parametrized via a neural network, is associated with a projection +operator and is learned by optimizing an objective function akin to that of +canonical correlation analysis (CCA). However, unlike CCA, our objective avoids +matrix inversions and therefore is generally more stable and applicable to +challenging scenarios. Our objective is a tight relaxation of CCA and we +further enhance it by proposing two regularization schemes, one encouraging the +orthogonality of the components of the representation while the other +exploiting Chapman-Kolmogorov's equation. We apply our method to challenging +discrete dynamical systems, discussing improvements over previous methods, as +well as to continuous dynamical systems. + +
+
+
+
+
+ + ☆ Repeated Observations for Classification + + +
+ We study the problem of nonparametric classification with repeated observations.
+Let $\mathbf{X}$ be the $d$-dimensional feature vector and let $Y$ denote the label
+taking values in $\{1,\dots ,M\}$. In contrast to the usual setup with large sample
+size $n$ and relatively low dimension $d$, this paper deals with the situation
+when, instead of observing a single feature vector $\mathbf{X}$, we are given $t$
+repeated feature vectors $\mathbf{V}_1,\dots ,\mathbf{V}_t$. Some simple classification
+rules are presented such that the conditional error probabilities converge at an
+exponential rate as $t\to\infty$. In the analysis, we investigate particular
+models such as robust detection by nominal densities, prototype classification,
+linear transformation, linear classification, and scaling.
+
+
+
+
+
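+ A toy illustration of the repeated-observation setting described above, assuming
+Gaussian noise: it averages the t repeated feature vectors and applies a
+nearest-class-mean (prototype) rule, one of the simple rule families the abstract
+alludes to; prototypes, noise scale and dimensions are illustrative.
+
+    import numpy as np
+
+    def prototype_classify(repeated_obs, prototypes):
+        """repeated_obs: (t, d) repeated noisy feature vectors for one item;
+        prototypes: (M, d) class means. Average over the t repetitions, then
+        assign the nearest prototype; the error decays quickly as t grows."""
+        v_bar = repeated_obs.mean(axis=0)
+        dists = np.linalg.norm(prototypes - v_bar, axis=1)
+        return int(np.argmin(dists))
+
+    rng = np.random.default_rng(0)
+    prototypes = np.array([[0.0, 0.0], [3.0, 3.0]])   # two classes in d = 2
+    true_class, t = 1, 20
+    obs = prototypes[true_class] + rng.normal(scale=2.0, size=(t, 2))
+    print(prototype_classify(obs, prototypes))  # usually 1 once t is moderately large
+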
+ + ☆ Symmetric Equilibrium Learning of VAEs + + +
+ We view variational autoencoders (VAE) as decoder-encoder pairs, which map
+distributions in the data space to distributions in the latent space and vice
+versa. The standard learning approach for VAEs, i.e. maximisation of the
+evidence lower bound (ELBO), has an obvious asymmetry in that respect.
+Moreover, it requires a closed-form prior latent distribution. This limits
+the applicability of VAEs in more complex scenarios, such as general
+semi-supervised learning and employing complex generative models as priors. We
+propose a Nash equilibrium learning approach that relaxes these restrictions
+and allows learning VAEs in situations where both the data and the latent
+distributions are accessible only by sampling. The flexibility and simplicity
+of this approach allow its application to a wide range of learning scenarios
+and downstream tasks. We show experimentally that the models learned by this
+method are comparable to those obtained by ELBO learning and demonstrate its
+applicability for tasks that are not accessible by standard VAE learning.
+
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Adversarial Likelihood Estimation with One-way Flows + + +
+ Generative Adversarial Networks (GANs) can produce high-quality samples, but
+do not provide an estimate of the probability density around the samples.
+However, it has been noted that maximizing the log-likelihood within an
+energy-based setting can lead to an adversarial framework where the
+discriminator provides unnormalized density (often called energy). We further
+develop this perspective, incorporate importance sampling, and show that 1)
+Wasserstein GAN performs a biased estimate of the partition function, and we
+propose instead to use an unbiased estimator; 2) when optimizing for
+likelihood, one must maximize generator entropy. This is hypothesized to
+provide better mode coverage. Different from previous works, we explicitly
+compute the density of the generated samples. This is the key enabler to
+designing an unbiased estimator of the partition function and computation of
+the generator entropy term. The generator density is obtained via a new type of
+flow network, called one-way flow network, that is less constrained in terms of
+architecture, as it does not require a tractable inverse function. Our
+experimental results show that we converge faster, produce comparable sample
+quality to GANs with similar architecture, successfully avoid over-fitting to
+commonly used datasets and produce smooth low-dimensional latent
+representations of the training data.
+
+
+
+
+
+ + ☆ Detecting Vulnerable Nodes in Urban Infrastructure Interdependent + Network + + +
+ Understanding and characterizing the vulnerability of urban infrastructures,
+i.e., the engineering facilities essential for the regular running of cities
+that exist naturally in the form of networks, is of great practical value.
+Potential applications include protecting fragile facilities and designing
+robust topologies, etc. Due to the strong correlation between different
+topological characteristics and infrastructure vulnerability, and their
+complicated evolution mechanisms, heuristic and machine-assisted analyses fall
+short in addressing such a scenario. In this paper, we model the
+interdependent network as a heterogeneous graph and propose a system based on
+graph neural networks with reinforcement learning, which can be trained on
+real-world data, to characterize the vulnerability of the city system
+accurately. The presented system leverages deep learning techniques to
+understand and analyze the heterogeneous graph, which enables us to capture the
+risk of cascade failure and discover vulnerable infrastructures of cities.
+Extensive experiments with various requests demonstrate not only the expressive
+power of our system but also its transferability and the necessity of its
+specific components.
+
+
+
+
+
+ + ☆ Towards a population-informed approach to the definition of data-driven + models for structural dynamics + + +
+ Machine learning has affected the way in which many phenomena for various
+domains are modelled, one of these domains being that of structural dynamics.
+However, because machine-learning algorithms are problem-specific, they often
+fail to perform efficiently in cases of data scarcity. To deal with such
+issues, combinations of physics-based approaches and machine-learning algorithms
+have been developed. Although such methods are effective, they also require the
+analyser's understanding of the underlying physics of the problem. The current
+work is aimed at motivating the use of models which learn such relationships
+from a population of phenomena, whose underlying physics are similar. The
+development of such models is motivated by the way that physics-based models,
+and more specifically finite element models, work. Such models are considered
+transferable, explainable and trustworthy, attributes which are not trivially
+imposed or achieved for machine-learning models. For this reason,
+machine-learning approaches are less trusted by industry, and it is often
+considered more difficult to form validated models from them. To achieve such
+data-driven models, a population-based scheme is followed here and two different
+machine-learning algorithms from the meta-learning domain are used. The two
+algorithms are the model-agnostic meta-learning (MAML) algorithm and the
+conditional neural processes (CNP) model. The algorithms seem to perform as
+intended and outperform a traditional machine-learning algorithm at
+approximating the quantities of interest. Moreover, they exhibit behaviour
+similar to traditional machine learning algorithms (e.g. neural networks or
+Gaussian processes), concerning their performance as a function of the
+available structures in the training population.
+
+
+
+
+
+ + ☆ Reinforcement Learning for Credit Index Option Hedging + + +
+ In this paper, we focus on finding the optimal hedging strategy of a credit
+index option using reinforcement learning. We take a practical approach focused
+on realism, i.e., discrete time and transaction costs, and we even test our
+policy on real market data. We apply a state-of-the-art algorithm, Trust Region
+Volatility Optimization (TRVO), and show that the derived hedging strategy
+outperforms the practitioner's Black & Scholes delta hedge.
+
+
+
+
+
+ + ☆ Near-Linear Time Projection onto the $\ell_{1,\infty}$ Ball; Application + to Sparse Autoencoders + + +
+ Exploiting sparsity is nowadays crucial to speed up the training of
+large-scale neural networks. Projections onto the $\ell_{1,2}$ and
+$\ell_{1,\infty}$ balls are among the most efficient techniques to sparsify and
+reduce the overall cost of neural networks. In this paper, we introduce a new
+projection algorithm for the $\ell_{1,\infty}$ norm ball. The worst-case time
+complexity of this algorithm is $\mathcal{O}\big(nm+J\log(nm)\big)$ for a
+matrix in $\mathbb{R}^{n\times m}$. $J$ is a term that tends to 0 when the
+sparsity is high, and to $nm$ when the sparsity is low. Its implementation is
+easy and it is guaranteed to converge to the exact solution in a finite time.
+Moreover, we propose to incorporate the $\ell_{1,\infty}$ ball projection while
+training an autoencoder to enforce feature selection and sparsity of the
+weights. Sparsification is applied in the encoder, primarily to perform feature
+selection, motivated by our application in biology, where only a very small part
+($<2\%$) of the data is relevant. We show that, both in the biological case and
+in the general sparsity setting, our method is the fastest.
+
+
+ comment: 22 pages, 8 figures +
+
+
+
+
+ + ☆ Deep Operator Network Approximation Rates for Lipschitz Operators + + +
+ We establish universality and expression rate bounds for a class of neural +Deep Operator Networks (DON) emulating Lipschitz (or H\"older) continuous maps +$\mathcal G:\mathcal X\to\mathcal Y$ between (subsets of) separable Hilbert +spaces $\mathcal X$, $\mathcal Y$. The DON architecture considered uses linear +encoders $\mathcal E$ and decoders $\mathcal D$ via (biorthogonal) Riesz bases +of $\mathcal X$, $\mathcal Y$, and an approximator network of an +infinite-dimensional, parametric coordinate map that is Lipschitz continuous on +the sequence space $\ell^2(\mathbb N)$. Unlike previous works ([Herrmann, +Schwab and Zech: Neural and Spectral operator surrogates: construction and +expression rate bounds, SAM Report, 2022], [Marcati and Schwab: Exponential +Convergence of Deep Operator Networks for Elliptic Partial Differential +Equations, SAM Report, 2022]), which required for example $\mathcal G$ to be +holomorphic, the present expression rate results require mere Lipschitz (or +H\"older) continuity of $\mathcal G$. Key in the proof of the present +expression rate bounds is the use of either super-expressive activations (e.g. +[Yarotski: Elementary superexpressive activations, Int. Conf. on ML, 2021], +[Shen, Yang and Zhang: Neural network approximation: Three hidden layers are +enough, Neural Networks, 2021], and the references there) which are inspired by +the Kolmogorov superposition theorem, or of nonstandard NN architectures with +standard (ReLU) activations as recently proposed in [Zhang, Shen and Yang: +Neural Network Architecture Beyond Width and Depth, Adv. in Neural Inf. Proc. +Sys., 2022]. We illustrate the abstract results by approximation rate bounds +for emulation of a) solution operators for parametric elliptic variational +inequalities, and b) Lipschitz maps of Hilbert-Schmidt operators. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of +representation learning in neural networks (NNs). Most research in this area +focuses on the learning dynamics of NNs for regression tasks, while little for +classification. This study empirically investigates the latter and expands the +understanding of frequency shortcuts. First, we perform experiments on +synthetic datasets, designed to have a bias in different frequency bands. Our +results demonstrate that NNs tend to find simple solutions for classification, +and what they learn first during training depends on the most distinctive +frequency characteristics, which can be either low- or high-frequencies. +Second, we confirm this phenomenon on natural images. We propose a metric to +measure class-wise frequency characteristics and a method to identify frequency +shortcuts. The results show that frequency shortcuts can be texture-based or +shape-based, depending on what best simplifies the objective. Third, we +validate the transferability of frequency shortcuts on out-of-distribution +(OOD) test sets. Our results suggest that frequency shortcuts can be +transferred across datasets and cannot be fully avoided by larger model +capacity and data augmentation. We recommend that future research should focus +on effective training schemes mitigating frequency shortcut learning. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
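+ The paper above defines its own class-wise frequency metric; the sketch below
+only illustrates one natural way to summarise class-wise frequency
+characteristics, by averaging the radially binned Fourier magnitude spectrum over
+the images of each class (image sizes and labels are illustrative).
+
+    import numpy as np
+
+    def radial_spectrum(img):
+        """Radially averaged magnitude spectrum of a grayscale image (H, W)."""
+        f = np.abs(np.fft.fftshift(np.fft.fft2(img)))
+        h, w = img.shape
+        yy, xx = np.indices((h, w))
+        r = np.hypot(yy - h // 2, xx - w // 2).astype(int)
+        counts = np.maximum(np.bincount(r.ravel()), 1)
+        return np.bincount(r.ravel(), weights=f.ravel()) / counts
+
+    def classwise_profiles(images, labels):
+        """Mean radial spectrum per class; differences between the profiles hint at
+        the low- vs high-frequency characteristics a network might latch onto."""
+        return {c: np.mean([radial_spectrum(im) for im, y in zip(images, labels) if y == c], axis=0)
+                for c in set(labels)}
+
+    rng = np.random.default_rng(0)
+    imgs = [rng.normal(size=(32, 32)) for _ in range(6)]
+    print({c: p[:4] for c, p in classwise_profiles(imgs, [0, 0, 0, 1, 1, 1]).items()})
+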
+ + ☆ Multi-modal Learning based Prediction for Disease + + +
+ Non alcoholic fatty liver disease (NAFLD) is the most common cause of chronic
+liver disease, and predicting it accurately can help prevent advanced fibrosis
+and cirrhosis. However, a liver biopsy, the gold standard for NAFLD diagnosis, is
+invasive, expensive, and prone to sampling errors. Therefore, non-invasive
+studies are extremely promising, yet they are still in their infancy due to the
+lack of comprehensive research data and intelligent methods for multi-modal
+data. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a
+comprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD
+prediction method (DeepFLD). The dataset includes over 6000 participants'
+physical examinations, laboratory and imaging studies, extensive
+questionnaires, and facial images of a subset of participants, making it
+comprehensive and valuable for clinical studies. From the dataset, we
+quantitatively analyze and select the clinical metadata that contribute most to
+NAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network
+model designed to predict NAFLD using multi-modal input, including metadata and
+facial images, outperforms the approach that only uses metadata. Satisfactory
+performance is also verified on other unseen datasets. Encouragingly, DeepFLD can
+achieve competitive results using only facial images as input rather than
+metadata, paving the way for a more robust and simpler non-invasive NAFLD
+diagnosis.
+
+
+
+
+
+ + ☆ Deep unrolling Shrinkage Network for Dynamic MR imaging + + +
+ Deep unrolling networks that utilize sparsity priors have achieved great +success in dynamic magnetic resonance (MR) imaging. The convolutional neural +network (CNN) is usually utilized to extract the transformed domain, and then +the soft thresholding (ST) operator is applied to the CNN-transformed data to +enforce the sparsity priors. However, the ST operator is usually constrained to +be the same across all channels of the CNN-transformed data. In this paper, we +propose a novel operator, called soft thresholding with channel attention +(AST), that learns the threshold for each channel. In particular, we put +forward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the +alternating direction method of multipliers (ADMM) for optimizing the +transformed $l_1$ norm dynamic MR reconstruction model. Experimental results on +an open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net +outperforms the state-of-the-art methods. The source code is available at +\url{https://github.com/yhao-z/DUS-Net}. + +
+
+ comment: 5 pages,3 figures,2 tables +
+
+
+
+
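+ The exact AST operator and DUS-Net architecture are defined in the paper above;
+the PyTorch module below only sketches the underlying idea of a soft-thresholding
+operator whose threshold is produced per channel by a small attention branch
+(layer choices and names are illustrative assumptions).
+
+    import torch
+    import torch.nn as nn
+
+    class ChannelwiseSoftThreshold(nn.Module):
+        """Soft thresholding ST(x, tau_c) = sign(x) * relu(|x| - tau_c) with a
+        per-channel threshold from a squeeze-style attention branch; a sketch of
+        the idea behind attention-based thresholding, not the paper's exact AST."""
+        def __init__(self, channels):
+            super().__init__()
+            self.attn = nn.Sequential(
+                nn.AdaptiveAvgPool2d(1),            # (B, C, 1, 1) channel statistics
+                nn.Conv2d(channels, channels, 1),
+                nn.Sigmoid(),                       # per-channel scale in (0, 1)
+            )
+
+        def forward(self, x):
+            # Threshold = per-channel scale times the mean absolute activation.
+            tau = self.attn(x) * x.abs().mean(dim=(2, 3), keepdim=True)
+            return torch.sign(x) * torch.relu(x.abs() - tau)
+
+    # Usage on a dummy multi-channel feature map.
+    print(ChannelwiseSoftThreshold(8)(torch.randn(2, 8, 16, 16)).shape)
+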
+ + ☆ Manifold Learning with Sparse Regularised Optimal Transport + + +
+ Manifold learning is a central task in modern statistics and data science. +Many datasets (cells, documents, images, molecules) can be represented as point +clouds embedded in a high dimensional ambient space, however the degrees of +freedom intrinsic to the data are usually far fewer than the number of ambient +dimensions. The task of detecting a latent manifold along which the data are +embedded is a prerequisite for a wide family of downstream analyses. Real-world +datasets are subject to noisy observations and sampling, so that distilling +information about the underlying manifold is a major challenge. We propose a +method for manifold learning that utilises a symmetric version of optimal +transport with a quadratic regularisation that constructs a sparse and adaptive +affinity matrix, that can be interpreted as a generalisation of the +bistochastic kernel normalisation. We prove that the resulting kernel is +consistent with a Laplace-type operator in the continuous limit, establish +robustness to heteroskedastic noise and exhibit these results in simulations. +We identify a highly efficient computational scheme for computing this optimal +transport for discrete data and demonstrate that it outperforms competing +methods in a set of examples. + +
+
+
+
+
+ + ☆ GenKL: An Iterative Framework for Resolving Label Ambiguity and Label + Non-conformity in Web Images Via a New Generalized KL Divergence + + +
+ Web image datasets curated online inherently contain ambiguous +in-distribution (ID) instances and out-of-distribution (OOD) instances, which +we collectively call non-conforming (NC) instances. In many recent approaches +for mitigating the negative effects of NC instances, the core implicit +assumption is that the NC instances can be found via entropy maximization. For +"entropy" to be well-defined, we are interpreting the output prediction vector +of an instance as the parameter vector of a multinomial random variable, with +respect to some trained model with a softmax output layer. Hence, entropy +maximization is based on the idealized assumption that NC instances have +predictions that are "almost" uniformly distributed. However, in real-world web +image datasets, there are numerous NC instances whose predictions are far from +being uniformly distributed. To tackle the limitation of entropy maximization, +we propose $(\alpha, \beta)$-generalized KL divergence, +$\mathcal{D}_{\text{KL}}^{\alpha, \beta}(p\|q)$, which can be used to identify +significantly more NC instances. Theoretical properties of +$\mathcal{D}_{\text{KL}}^{\alpha, \beta}(p\|q)$ are proven, and we also show +empirically that a simple use of $\mathcal{D}_{\text{KL}}^{\alpha, +\beta}(p\|q)$ outperforms all baselines on the NC instance identification task. +Building upon $(\alpha,\beta)$-generalized KL divergence, we also introduce a +new iterative training framework, GenKL, that identifies and relabels NC +instances. When evaluated on three web image datasets, Clothing1M, +Food101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art +classification accuracies: $81.34\%$, $85.73\%$ and $78.99\%$/$92.54\%$ +(top-1/top-5), respectively. + +
+
+ comment: Published (with open access) at International Journal of Computer + Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at: + https://github.com/codetopaper/GenKL +
+
+
+
+
+ + ☆ Graph Federated Learning Based on the Decentralized Framework + + +
+ Graph learning has a wide range of applications in many scenarios, many of
+which involve a growing need for data privacy. Federated learning is an emerging
+distributed machine learning approach that leverages data from individual
+devices or data centers to improve the accuracy and generalization of the
+model, while also protecting the privacy of user data. Graph-federated learning
+is mainly based on the classical federated learning framework, i.e., the
+Client-Server framework. However, the Client-Server framework faces problems
+such as a single point of failure of the central server and poor scalability of
+network topology. First, we introduce the decentralized framework to
+graph-federated learning. Second, we determine the confidence among nodes based
+on the similarity of their data; the gradient information is then aggregated by
+linear weighting based on confidence. Finally, the proposed method is compared
+with FedAvg, FedProx, GCFL, and GCFL+ to verify its effectiveness. Experiments
+demonstrate that the proposed method outperforms the other methods.
+
+
+ comment: 12 pages, 4 figures, 4 tables +
+
+
+
+
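+ The paper above defines its own confidence measure and protocol; the numpy
+fragment below only sketches the aggregation step it describes: weighting
+neighbours' gradients by a data-similarity-derived confidence and combining them
+linearly with the node's own gradient, with no central server (the cosine
+similarity over per-node data statistics is an illustrative choice).
+
+    import numpy as np
+
+    def confidence_weights(own_stats, neighbour_stats):
+        """Confidence from data similarity: cosine similarity between simple
+        per-node data statistics, clipped and normalised to sum to one."""
+        sims = np.array([
+            np.dot(own_stats, s) / (np.linalg.norm(own_stats) * np.linalg.norm(s))
+            for s in neighbour_stats
+        ])
+        sims = np.clip(sims, 0.0, None)
+        return sims / sims.sum() if sims.sum() > 0 else np.ones(len(sims)) / len(sims)
+
+    def aggregate_gradients(own_grad, neighbour_grads, weights, self_weight=0.5):
+        """Linear, confidence-weighted combination of a node's own gradient with
+        its neighbours' gradients -- a decentralized analogue of FedAvg."""
+        neighbour_part = sum(w * g for w, g in zip(weights, neighbour_grads))
+        return self_weight * own_grad + (1.0 - self_weight) * neighbour_part
+
+    # Toy usage with two neighbours.
+    w = confidence_weights(np.array([1.0, 2.0]), [np.array([1.0, 1.9]), np.array([-2.0, 0.1])])
+    print(aggregate_gradients(np.array([1.0, 0.0]), [np.array([0.5, 0.5]), np.array([-1.0, 1.0])], w))
+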
+ + ☆ Probabilistic Forecasting with Coherent Aggregation + + +
+ Obtaining accurate probabilistic forecasts while respecting hierarchical
+information is an important operational challenge in many applications, perhaps
+most obviously in energy management, supply chain planning, and resource
+allocation. The basic challenge, especially for multivariate forecasting, is
+that forecasts are often required to be coherent with respect to the
+hierarchical structure. In this paper, we propose a new model which leverages a
+factor model structure to produce coherent forecasts by construction. This is a
+consequence of a simple (exchangeability) observation: permuting base-level
+series in the hierarchy does not change their aggregates. Our model uses a
+convolutional neural network to produce parameters for the factors, their
+loadings and base-level distributions; it produces samples which can be
+differentiated with respect to the model's parameters; and it can therefore
+optimize for any sample-based loss function, including the Continuous Ranked
+Probability Score and quantile losses. We can choose arbitrary continuous
+distributions for the factor and the base-level distributions. We compare our
+method to two previous methods which can be optimized end-to-end, while
+enforcing coherent aggregation. Our model achieves significant improvements:
+between $11.8\%$ and $41.4\%$ on three hierarchical forecasting datasets. We
+also analyze the influence of parameters in our model with respect to
+base-level distribution and number of factors.
+
+
+
+
+
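+ A numpy sketch of why factor-model sampling is coherent by construction, in the
+spirit of the abstract above: base-level series are generated from shared factors
+and loadings, then aggregated with a fixed summation matrix, so every sample
+satisfies the hierarchy exactly; the distributions, sizes, and fixed loadings are
+placeholders for what the paper's network would output.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    n_base, n_factors, horizon, n_samples = 4, 2, 8, 1000
+
+    # In the model these parameters come from a convolutional network; fixed here.
+    loadings = rng.normal(size=(n_base, n_factors))
+    base_scale = 0.1
+
+    # Sample factors, map to base-level series, add base-level noise.
+    factors = rng.normal(size=(n_samples, horizon, n_factors))
+    base = factors @ loadings.T + base_scale * rng.normal(size=(n_samples, horizon, n_base))
+
+    # Aggregate with a fixed summation matrix: first row is the total of all base series.
+    S = np.vstack([np.ones((1, n_base)), np.eye(n_base)])
+    hierarchy_samples = base @ S.T
+
+    # Coherence holds for every sample: the 'total' equals the sum of base series.
+    assert np.allclose(hierarchy_samples[..., 0], hierarchy_samples[..., 1:].sum(-1))
+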
+ + ☆ Forecasting Early with Meta Learning IJCNN 2023 + + +
+ In the early observation period of a time series, there might be only a few
+historic observations available to learn a model. However, in cases where an
+existing prior set of datasets is available, meta-learning methods can be
+applicable. In this paper, we devise a meta-learning method that exploits
+samples from additional datasets and learns to augment time series through
+adversarial learning as an auxiliary task for the target dataset. Our model,
+FEML, is equipped with a shared convolutional backbone that learns features for
+varying-length inputs from different datasets and has dataset-specific heads to
+forecast for different output lengths. We show that FEML can meta-learn across
+datasets and, by additionally learning on adversarially generated samples as
+auxiliary samples for the target dataset, it can improve the forecasting
+performance compared to single-task learning and various solutions adapted from
+joint learning, multi-task learning and classic forecasting baselines.
+
+
+ comment: IJCNN 2023 +
+
+
+
+
+ + ☆ From West to East: Who can understand the music of the others better? + + +
+ Recent developments in MIR have led to several benchmark deep learning models
+whose embeddings can be used for a variety of downstream tasks. At the same
+time, the vast majority of these models have been trained on Western pop/rock
+music and related styles. This leads to research questions on whether these
+models can be used to learn representations for different music cultures and
+styles, or whether we can build similar music audio embedding models trained on
+data from different cultures or styles. To that end, we leverage transfer
+learning methods to derive insights about the similarities between the
+different music cultures to which the data belong. We use two Western music
+datasets, two traditional/folk datasets coming from eastern Mediterranean
+cultures, and two datasets belonging to Indian art music. Three deep audio
+embedding models are trained and transferred across domains, including two
+CNN-based and a Transformer-based architecture, to perform auto-tagging for
+each target domain dataset. Experimental results show that competitive
+performance is achieved in all domains via transfer learning, while the best
+source dataset varies for each music culture. The implementation and the
+trained models are both provided in a public repository.
+
+
+
+
+
+ + ☆ A Note on Hardness of Computing Recursive Teaching Dimension + + +
+ In this short note, we show that the problem of computing the recursive +teaching dimension (RTD) for a concept class (given explicitly as input) +requires $n^{\Omega(\log n)}$-time, assuming the exponential time hypothesis +(ETH). This matches the running time $n^{O(\log n)}$ of the brute-force +algorithm for the problem. + +
+
+ comment: To appear in IPL +
+
+
+
+
+ + ☆ ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization + Using Floating-Point Formats + + +
+ In the complex domain of large language models (LLMs), striking a balance +between computational efficiency and maintaining model quality is a formidable +challenge. Navigating the inherent limitations of uniform quantization, +particularly when dealing with outliers, and motivated by the launch of +NVIDIA's H100 hardware, this study delves into the viability of floating-point +(FP) quantization, particularly focusing on FP8 and FP4, as a potential +solution. Our comprehensive investigation reveals that for LLMs, FP8 activation +consistently outshines its integer (INT8) equivalent, with the performance edge +becoming more noticeable in models possessing parameters beyond one billion. +For weight quantization, our findings indicate that FP4 exhibits comparable, if +not superior, performance to INT4, simplifying deployment on FP-supported +hardware like H100. To mitigate the overhead from precision alignment caused by +the disparity between weights and activations, we propose two scaling +constraints for weight quantization that negligibly impact the performance +compared to the standard W4A8 model. We additionally enhance our quantization +methods by integrating the Low Rank Compensation (LoRC) strategy, yielding +improvements especially in smaller models. The results of our investigation +emphasize the immense potential of FP quantization for LLMs, paving the way for +high-efficiency deployment in resource-limited settings. + +
+
+
+
+
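+ ZeroQuant-FP's quantizers, scaling constraints and LoRC are described in the
+paper above; the fragment below only illustrates generic round-to-nearest weight
+quantization onto a small floating-point-style value grid with a per-output-channel
+scale. The FP4 (E2M1-like) magnitude set used here is an assumption for
+illustration, not the paper's exact format.
+
+    import numpy as np
+
+    # Assumed FP4 (E2M1-style) representable magnitudes; treat as illustrative.
+    FP4_GRID = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
+    GRID = np.concatenate([-FP4_GRID[::-1], FP4_GRID])
+
+    def quantize_per_channel(w):
+        """Round-to-nearest quantization of a weight matrix (out, in) onto GRID,
+        with one scale per output channel so max|w| maps to the grid maximum."""
+        scale = np.abs(w).max(axis=1, keepdims=True) / GRID.max()
+        scale = np.where(scale == 0, 1.0, scale)
+        idx = np.abs((w / scale)[..., None] - GRID).argmin(axis=-1)
+        return GRID[idx] * scale, scale
+
+    w = np.random.default_rng(0).normal(size=(8, 16))
+    w_q, _ = quantize_per_channel(w)
+    print(float(np.abs(w - w_q).mean()))  # quantization error of this toy scheme
+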
+ + ☆ Text2Layer: Layered Image Generation using Latent Diffusion Model + + +
+ Layer compositing is one of the most popular image editing workflows among +both amateurs and professionals. Motivated by the success of diffusion models, +we explore layer compositing from a layered image generation perspective. +Instead of generating an image, we propose to generate background, foreground, +layer mask, and the composed image simultaneously. To achieve layered image +generation, we train an autoencoder that is able to reconstruct layered images +and train diffusion models on the latent representation. One benefit of the +proposed problem is to enable better compositing workflows in addition to the +high-quality image output. Another benefit is producing higher-quality layer +masks compared to masks produced by a separate step of image segmentation. +Experimental results show that the proposed method is able to generate +high-quality layered images and initiates a benchmark for future work. + +
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Beyond Single-Feature Importance with ICECREAM + + +
+ Which set of features was responsible for a certain output of a machine +learning model? Which components caused the failure of a cloud computing +application? These are just two examples of questions we are addressing in this +work by Identifying Coalition-based Explanations for Common and Rare Events in +Any Model (ICECREAM). Specifically, we propose an information-theoretic +quantitative measure for the influence of a coalition of variables on the +distribution of a target variable. This allows us to identify which set of +factors is essential to obtain a certain outcome, as opposed to +well-established explainability and causal contribution analysis methods which +can assign contributions only to individual factors and rank them by their +importance. In experiments with synthetic and real-world data, we show that +ICECREAM outperforms state-of-the-art methods for explainability and root cause +analysis, and achieves impressive accuracy in both tasks. + +
+
+
+
+
+ + ☆ A Novel Spatial-Temporal Variational Quantum Circuit to Enable Deep + Learning on NISQ Devices + + +
+ Quantum computing presents a promising approach for machine learning with its
+capability for extremely parallel computation in high dimensions through
+superposition and entanglement. Despite its potential, existing quantum
+learning algorithms, such as Variational Quantum Circuits (VQCs), face
+challenges in handling more complex datasets, particularly those that are not
+linearly separable. Moreover, they face a deployability issue: the learned
+models suffer a drastic accuracy drop after being deployed to actual quantum
+devices. To overcome these limitations, this paper proposes a novel
+spatial-temporal design, namely ST-VQC, to integrate non-linearity in quantum
+learning and improve the robustness of the learning model to noise.
+Specifically, ST-VQC can extract spatial features via a novel block-based
+encoding quantum sub-circuit coupled with a layer-wise computation quantum
+sub-circuit to enable temporal-wise deep learning. Additionally, a SWAP-Free
+physical circuit design is devised to improve robustness. These designs bring a
+number of hyperparameters. After a systematic analysis of the design space for
+each design component, an automated optimization framework is proposed to
+generate the ST-VQC quantum circuit. The proposed ST-VQC has been evaluated on
+two IBM quantum processors, ibm_cairo with 27 qubits and ibmq_lima with 7
+qubits, to assess its effectiveness. The results of the evaluation on the
+standard dataset for binary classification show that ST-VQC can achieve over
+30% accuracy improvement compared with existing VQCs on actual quantum
+computers. Moreover, on a non-linear synthetic dataset, the ST-VQC outperforms
+a linear classifier by 27.9%, while the linear classifier using classical
+computing outperforms the existing VQC by 15.58%.
+
+
+
+
+
+ + ☆ How Curvature Enhance the Adaptation Power of Framelet GCNs + + +
+ Graph neural networks (GNNs) have been demonstrated to be powerful in modeling
+ graph-structured data. However, despite many successful cases of applying GNNs
+ to various graph classification and prediction tasks, whether the graph
+ geometrical information has been fully exploited to enhance the learning
+ performance of GNNs is not yet well understood. This paper introduces a new
+ approach to enhance GNNs with discrete graph Ricci curvature. Specifically,
+ the graph Ricci curvature defined on the edges of a graph measures how
+ difficult it is for information to transit along an edge from one node to
+ another, based on their neighborhoods. Motivated by the geometric analogy of
+ Ricci curvature in the graph setting, we prove that by inserting the curvature
+ information through carefully designed transformation functions $\zeta$,
+ several known computational issues in GNNs, such as over-smoothing, can be
+ alleviated in our proposed model. Furthermore, we verify that edges with
+ strongly positive Ricci curvature (i.e., $\kappa_{i,j} \approx 1$) should
+ preferably be dropped to enhance the model's adaptation to heterophilic
+ graphs, and we propose a curvature-based graph edge-drop algorithm.
+ Comprehensive experiments show that our curvature-based GNN model outperforms
+ state-of-the-art baselines on both homophilic and heterophilic graph datasets,
+ indicating the effectiveness of incorporating graph geometric information in
+ GNNs.
+
+
+
+
+
+ + ☆ Sig-Splines: universal approximation and convex calibration of time + series generative models + + +
+ We propose a novel generative model for multivariate discrete-time time +series data. Drawing inspiration from the construction of neural spline flows, +our algorithm incorporates linear transformations and the signature transform +as a seamless substitution for traditional neural networks. This approach +enables us to achieve not only the universality property inherent in neural +networks but also introduces convexity in the model's parameters. + +
+
+
+
+
+ + ☆ Reinforcing POD based model reduction techniques in reaction-diffusion + complex networks using stochastic filtering and pattern recognition + + +
+ Complex networks are used to model many real-world systems. However, the +dimensionality of these systems can make them challenging to analyze. +Dimensionality reduction techniques like POD can be used in such cases. +However, these models are susceptible to perturbations in the input data. We +propose an algorithmic framework that combines techniques from pattern +recognition (PR) and stochastic filtering theory to enhance the output of such +models. The results of our study show that our method can improve the accuracy +of the surrogate model under perturbed inputs. Deep Neural Networks (DNNs) are +susceptible to adversarial attacks. However, recent research has revealed that +neural Ordinary Differential Equations (ODEs) exhibit robustness in specific +applications. We benchmark our algorithmic framework with a Neural ODE-based +approach as a reference. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Constructing Extreme Learning Machines with zero Spectral Bias + + +
+ The phenomenon of Spectral Bias, in which the higher-frequency components of a
+ function being learnt by a feedforward Artificial Neural Network (ANN) are
+ seen to converge more slowly than the lower frequencies, is observed
+ ubiquitously across ANNs. This has created technological challenges in fields
+ where resolution of higher frequencies is crucial, such as Physics-Informed
+ Neural Networks (PINNs). Extreme Learning Machines (ELMs), which obviate the
+ iterative solution process that provides the theoretical basis of Spectral
+ Bias (SB), should in principle be free of it. This work examines the
+ reliability of this assumption and shows that it is incorrect. However, the
+ structure of ELMs makes them naturally amenable to implementing variants of
+ Fourier Feature Embeddings, which have been shown to mitigate SB in ANNs. This
+ approach is implemented and verified to completely eliminate SB, thus making
+ ELMs feasible for practical problems like PINNs where resolution of higher
+ frequencies is essential.
+
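+ A hedged sketch of the overall recipe described here, under assumed widths and
+ an assumed embedding bandwidth (none of these values come from the paper): the
+ inputs are passed through a random Fourier feature embedding and then through
+ an ELM whose random hidden layer is fixed and whose output weights are solved
+ by least squares.
+
+ import numpy as np
+
+ def fourier_features(x, B):
+     proj = 2.0 * np.pi * x @ B                 # random projections
+     return np.concatenate([np.cos(proj), np.sin(proj)], axis=1)
+
+ rng = np.random.default_rng(0)
+ n, d, m, hidden = 200, 1, 64, 256
+ x = rng.uniform(-1.0, 1.0, size=(n, d))
+ y = np.sin(20.0 * x).ravel()                   # a high-frequency target
+
+ sigma = 10.0                                   # embedding bandwidth (assumed)
+ B = rng.normal(0.0, sigma, size=(d, m))
+ phi = fourier_features(x, B)                   # (n, 2m) embedded inputs
+
+ # ELM: random, fixed hidden layer; only the output weights beta are fitted.
+ W = rng.normal(size=(phi.shape[1], hidden))
+ b = rng.normal(size=hidden)
+ H = np.tanh(phi @ W + b)
+ beta, *_ = np.linalg.lstsq(H, y, rcond=None)
+ print("train MSE:", np.mean((H @ beta - y) ** 2))
+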
+
+
+
+
+ + ☆ Improved Distribution Matching for Dataset Condensation CVPR2023 + + +
+ Dataset Condensation aims to condense a large dataset into a smaller one +while maintaining its ability to train a well-performing model, thus reducing +the storage cost and training effort in deep learning applications. However, +conventional dataset condensation methods are optimization-oriented and +condense the dataset by performing gradient or parameter matching during model +optimization, which is computationally intensive even on small datasets and +models. In this paper, we propose a novel dataset condensation method based on +distribution matching, which is more efficient and promising. Specifically, we +identify two important shortcomings of naive distribution matching (i.e., +imbalanced feature numbers and unvalidated embeddings for distance computation) +and address them with three novel techniques (i.e., partitioning and expansion +augmentation, efficient and enriched model sampling, and class-aware +distribution regularization). Our simple yet effective method outperforms most +previous optimization-oriented methods with much fewer computational resources, +thereby scaling data condensation to larger datasets and models. Extensive +experiments demonstrate the effectiveness of our method. Codes are available at +https://github.com/uitrbn/IDM + +
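+ For readers unfamiliar with distribution matching, the sketch below conveys
+ only the core idea (optimizing synthetic images so that their features match
+ those of real data under randomly sampled embedders); it does not implement
+ the three techniques proposed in the paper, and all sizes are illustrative.
+
+ import torch
+ import torch.nn as nn
+
+ def random_embedder(channels=3, dim=128):
+     # An untrained CNN used purely as a random feature extractor.
+     return nn.Sequential(
+         nn.Conv2d(channels, 64, 3, stride=2, padding=1), nn.ReLU(),
+         nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU(),
+         nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(128, dim),
+     )
+
+ real = torch.rand(256, 3, 32, 32)                    # stand-in for a real batch
+ syn = torch.rand(32, 3, 32, 32, requires_grad=True)  # learnable condensed set
+ opt = torch.optim.SGD([syn], lr=1.0)
+
+ for step in range(10):
+     f = random_embedder()                            # re-sample the embedder
+     loss = (f(real).mean(0) - f(syn).mean(0)).pow(2).sum()  # match feature means
+     opt.zero_grad()
+     loss.backward()
+     opt.step()
+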
+
+ comment: CVPR2023 +
+
+
+
+
+ + ☆ RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap ACL + + +
+ Taxonomies are an essential knowledge representation, yet most studies on +automatic taxonomy construction (ATC) resort to manual evaluation to score +proposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just +as important as taxonomy construction. We propose RaTE, an automatic label-free +taxonomy scoring procedure, which relies on a large pre-trained language model. +We apply our evaluation procedure to three state-of-the-art ATC algorithms with +which we built seven taxonomies from the Yelp domain, and show that 1) RaTE +correlates well with human judgments and 2) artificially degrading a taxonomy +leads to decreasing RaTE score. + +
+
+ comment: 15th International Conference on Computational Semantics (IWCS), + Association for Computational Linguistics (ACL) +
+
+
+
+
+ + ☆ Efficient Guided Generation for LLMs + + +
+ In this article we describe an efficient approach to guiding language model +text generation with regular expressions and context-free grammars. Our +approach adds little to no overhead to the token sequence generation process, +and makes guided generation feasible in practice. An implementation is provided +in the open source Python library Outlines. + +
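+ The abstract does not spell out the mechanism, so the following toy sketch
+ (character-level vocabulary, hand-written DFA, random "logits"; this is not
+ the Outlines API) illustrates the general idea of guided decoding: at each
+ step, tokens that cannot extend a match of the target pattern are masked out
+ before sampling.
+
+ import math
+ import random
+
+ vocab = list("0123456789.ab")
+
+ def step_dfa(state, ch):
+     # DFA for the regex [0-9]+\. : 0 = start, 1 = seen digits, 2 = accepting.
+     if state in (0, 1) and ch.isdigit():
+         return 1
+     if state == 1 and ch == ".":
+         return 2
+     return None  # transition not allowed
+
+ def sample_guided(logits_fn, max_len=8):
+     state, out = 0, []
+     for _ in range(max_len):
+         logits = logits_fn(out)
+         masked = [l if step_dfa(state, c) is not None else -math.inf
+                   for l, c in zip(logits, vocab)]
+         weights = [math.exp(l) for l in masked]   # disallowed tokens get weight 0
+         ch = random.choices(vocab, weights=weights)[0]
+         state = step_dfa(state, ch)
+         out.append(ch)
+         if state == 2:
+             break
+     return "".join(out)
+
+ fake_logits = lambda prefix: [random.gauss(0, 1) for _ in vocab]
+ print(sample_guided(fake_logits))   # e.g. "37." -- always consistent with [0-9]+\.
+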
+
+
+
+
+ + ☆ STRAPPER: Preference-based Reinforcement Learning via Self-training + Augmentation and Peer Regularization + + +
+ Preference-based reinforcement learning (PbRL) promises to learn a complex
+ reward function from binary human preferences. However, such a
+ human-in-the-loop formulation requires considerable human effort to assign
+ preference labels to segment pairs, hindering its large-scale application.
+ Recent approaches have tried to reuse unlabeled segments, which implicitly
+ elucidates the distribution of segments and thereby alleviates the human
+ effort; consistency regularization is further applied to improve the
+ performance of semi-supervised learning. However, we notice that, unlike
+ general classification tasks, PbRL exhibits a unique phenomenon that we define
+ as the similarity trap in this paper. Intuitively, humans can have
+ diametrically opposite preferences for similar segment pairs, and this
+ similarity can cause consistency regularization to fail in PbRL. Because of
+ the similarity trap, consistency regularization improperly enhances the
+ consistency of the model's predictions between segment pairs and thus reduces
+ the confidence of reward learning, since the augmented distribution does not
+ match the original one in PbRL. To overcome this issue, we present a
+ self-training method along with our proposed peer regularization, which
+ penalizes the reward model for memorizing uninformative labels and encourages
+ confident predictions. Empirically, we demonstrate that our approach learns a
+ variety of locomotion and robotic manipulation behaviors well using different
+ semi-supervised alternatives and peer regularization.
+
+
+
+
+
+ + ☆ Joint Service Caching, Communication and Computing Resource Allocation + in Collaborative MEC Systems: A DRL-based Two-timescale Approach + + +
+ Meeting the strict Quality of Service (QoS) requirements of terminals imposes
+ a significant challenge on Multi-access Edge Computing (MEC) systems, due to
+ the limited multidimensional resources. To address this challenge, we propose
+ a collaborative MEC framework that facilitates resource sharing between edge
+ servers, with the aim of maximizing the long-term QoS and reducing the cache
+ switching cost through joint optimization of service caching, collaborative
+ offloading, and computation and communication resource allocation. The dual
+ timescale feature and the temporal recurrence relationship between service
+ caching and the other resource allocation decisions make the problem even more
+ challenging to solve. To solve it, we propose a deep reinforcement learning
+ (DRL)-based dual timescale scheme, called DGL-DDPG, which is composed of a
+ short-term genetic algorithm (GA) and a long short-term memory network-based
+ deep deterministic policy gradient (LSTM-DDPG). Specifically, we reformulate
+ the optimization problem as a Markov decision process (MDP) in which the
+ small-timescale resource allocation decisions generated by an improved GA are
+ taken as the states and input into a centralized LSTM-DDPG agent to generate
+ the service caching decisions for the large timescale. Simulation results
+ demonstrate that our proposed algorithm outperforms the baseline algorithms in
+ terms of average QoS and cache switching cost.
+
+
+
+
+
+ + ☆ Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for + Recommendation and Text Generation KDD + + +
+ Modeling customer shopping intentions is a crucial task for e-commerce, as it +directly impacts user experience and engagement. Thus, accurately understanding +customer preferences is essential for providing personalized recommendations. +Session-based recommendation, which utilizes customer session data to predict +their next interaction, has become increasingly popular. However, existing +session datasets have limitations in terms of item attributes, user diversity, +and dataset scale. As a result, they cannot comprehensively capture the +spectrum of user behaviors and preferences. To bridge this gap, we present the +Amazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It +is the first multilingual dataset consisting of millions of user sessions from +six different locales, where the major languages of products are English, +German, Japanese, French, Italian, and Spanish. Remarkably, the dataset can +help us enhance personalization and understanding of user preferences, which +can benefit various existing tasks as well as enable new tasks. To test the +potential of the dataset, we introduce three tasks in this work: (1) +next-product recommendation, (2) next-product recommendation with domain +shifts, and (3) next-product title generation. With the above tasks, we +benchmark a range of algorithms on our proposed dataset, drawing new insights +for further research and practice. In addition, based on the proposed dataset +and tasks, we hosted a competition in the KDD CUP 2023 and have attracted +thousands of users and submissions. The winning solutions and the associated +workshop can be accessed at our website https://kddcup23.github.io/. + +
+
+ comment: Dataset for KDD Cup 2023, https://kddcup23.github.io/ +
+
+
+
+
+ + ♻ ☆ Sequential Kernelized Independence Testing ICML 2023 + + +
+ Independence testing is a classical statistical problem that has been +extensively studied in the batch setting when one fixes the sample size before +collecting data. However, practitioners often prefer procedures that adapt to +the complexity of a problem at hand instead of setting sample size in advance. +Ideally, such procedures should (a) stop earlier on easy tasks (and later on +harder tasks), hence making better use of available resources, and (b) +continuously monitor the data and efficiently incorporate statistical evidence +after collecting new data, while controlling the false alarm rate. Classical +batch tests are not tailored for streaming data: valid inference after data +peeking requires correcting for multiple testing which results in low power. +Following the principle of testing by betting, we design sequential kernelized +independence tests that overcome such shortcomings. We exemplify our broad +framework using bets inspired by kernelized dependence measures, e.g., the +Hilbert-Schmidt independence criterion. Our test is also valid under +non-i.i.d., time-varying settings. We demonstrate the power of our approaches +on both simulated and real data. + +
+
+ comment: To appear at ICML 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Bayesian travel-time tomography with geologically-complex + priors using sensitivity-informed polynomial chaos expansion and deep + generative networks + + +
+ Markov chain Monte Carlo (MCMC) methods commonly confront two fundamental
+ challenges: the accurate characterization of the prior distribution and the
+ efficient evaluation of the likelihood. In the context of Bayesian studies on
+ tomography, principal component analysis (PCA) can in some cases facilitate
+ the straightforward definition of the prior distribution, while simultaneously
+ enabling the implementation of accurate surrogate models based on polynomial
+ chaos expansion (PCE) to replace computationally intensive full-physics
+ forward solvers. When PCA does not offer a direct means of easily defining the
+ prior distribution, alternative methods like deep generative models (e.g.,
+ variational autoencoders (VAEs)) can be employed as viable options. However,
+ accurately producing a surrogate capable of capturing the intricate non-linear
+ relationship between the latent parameters of a VAE and the outputs of forward
+ modeling presents a notable challenge. Indeed, while PCE models provide high
+ accuracy when the input-output relationship can be effectively approximated by
+ relatively low-degree multivariate polynomials, this condition is typically
+ unmet when utilizing latent variables derived from deep generative models. In
+ this contribution, we present a strategy that combines the excellent
+ reconstruction performance of VAEs in terms of prior representation with the
+ accuracy of PCA-PCE surrogate modeling in the context of Bayesian ground
+ penetrating radar (GPR) travel-time tomography. Within the MCMC process, the
+ parametrization of the VAE is leveraged for prior exploration and sample
+ proposal. Concurrently, modeling is conducted using PCE, which operates on
+ either globally or locally defined principal components of the VAE samples
+ under examination.
+
+
+ comment: 25 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Value Summation: A Novel Scoring Function for MPC-based Model-based + Reinforcement Learning + + +
+ This paper proposes a novel scoring function for the planning module of +MPC-based reinforcement learning methods to address the inherent bias of using +the reward function to score trajectories. The proposed method enhances the +learning efficiency of existing MPC-based MBRL methods using the discounted sum +of values. The method utilizes optimal trajectories to guide policy learning +and updates its state-action value function based on real-world and augmented +onboard data. The learning efficiency of the proposed method is evaluated in +selected MuJoCo Gym environments as well as in learning locomotion skills for a +simulated model of the Cassie robot. The results demonstrate that the proposed +method outperforms the current state-of-the-art algorithms in terms of learning +efficiency and average reward return. + +
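+ A hedged sketch of the scoring rule as described above (not the paper's code;
+ `dynamics` and `value` are assumed stand-ins): candidate action sequences in
+ the MPC planner are ranked by the discounted sum of a learned value function
+ along the simulated rollout, rather than by the discounted sum of rewards.
+
+ import numpy as np
+
+ def score_value_summation(state, actions, dynamics, value, gamma=0.99):
+     """Return sum_t gamma^t * V(s_t) along the rollout induced by `actions`."""
+     total, s = 0.0, state
+     for t, a in enumerate(actions):
+         total += (gamma ** t) * value(s)
+         s = dynamics(s, a)
+     return total
+
+ # Toy usage: pick the best of several random action sequences.
+ dynamics = lambda s, a: s + a            # trivial 1-D dynamics (assumed)
+ value = lambda s: -abs(s - 5.0)          # states near 5 are "good"
+ candidates = [np.random.uniform(-1, 1, size=10) for _ in range(64)]
+ best = max(candidates,
+            key=lambda seq: score_value_summation(0.0, seq, dynamics, value))
+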
+
+
+
+
+ + ♻ ☆ A benchmark of categorical encoders for binary classification NeurIPS 2023 + + +
+ Categorical encoders transform categorical features into numerical +representations that are indispensable for a wide range of machine learning +models. Existing encoder benchmark studies lack generalizability because of +their limited choice of (1) encoders, (2) experimental factors, and (3) +datasets. Additionally, inconsistencies arise from the adoption of varying +aggregation strategies. This paper is the most comprehensive benchmark of +categorical encoders to date, including an extensive evaluation of 32 +configurations of encoders from diverse families, with 36 combinations of +experimental factors, and on 50 datasets. The study shows the profound +influence of dataset selection, experimental factors, and aggregation +strategies on the benchmark's conclusions -- aspects disregarded in previous +encoder benchmarks. + +
+
+ comment: Submitted to the 37th Conference on Neural Information Processing + Systems (NeurIPS 2023) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Evaluation of Complexity Measures for Deep Learning Generalization in + Medical Image Analysis + + +
+ The generalization performance of deep learning models for medical image +analysis often decreases on images collected with different devices for data +acquisition, device settings, or patient population. A better understanding of +the generalization capacity on new images is crucial for clinicians' +trustworthiness in deep learning. Although significant research efforts have +been recently directed toward establishing generalization bounds and complexity +measures, still, there is often a significant discrepancy between the predicted +and actual generalization performance. As well, related large empirical studies +have been primarily based on validation with general-purpose image datasets. +This paper presents an empirical study that investigates the correlation +between 25 complexity measures and the generalization abilities of supervised +deep learning classifiers for breast ultrasound images. The results indicate +that PAC-Bayes flatness-based and path norm-based measures produce the most +consistent explanation for the combination of models and data. We also +investigate the use of multi-task classification and segmentation approach for +breast images, and report that such learning approach acts as an implicit +regularizer and is conducive toward improved generalization. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Pre or Post-Softmax Scores in Gradient-based Attribution Methods, What + is Best? ICPR + + +
+ Gradient based attribution methods for neural networks working as classifiers +use gradients of network scores. Here we discuss the practical differences +between using gradients of pre-softmax scores versus post-softmax scores, and +their respective advantages and disadvantages. + +
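+ A small self-contained illustration of the two quantities being compared,
+ written here with a toy PyTorch classifier (the discussion itself is framework
+ agnostic): the gradient of the pre-softmax logit versus the gradient of the
+ post-softmax probability for the same class.
+
+ import torch
+ import torch.nn as nn
+
+ model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))   # toy classifier
+ x = torch.rand(1, 1, 28, 28, requires_grad=True)
+ target = 3
+
+ logits = model(x)
+ probs = torch.softmax(logits, dim=1)
+
+ pre_grad = torch.autograd.grad(logits[0, target], x, retain_graph=True)[0]
+ post_grad = torch.autograd.grad(probs[0, target], x)[0]
+
+ # The post-softmax gradient mixes in the other classes through the softmax
+ # Jacobian, which is the source of the practical differences discussed.
+ print(pre_grad.abs().sum().item(), post_grad.abs().sum().item())
+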
+
+ comment: 8 pages, 2 figures, 2023 IEEE 13th International Conference on + Pattern Recognition Systems (ICPRS) +
+
+
+
+
+ + ♻ ☆ SurCo: Learning Linear Surrogates For Combinatorial Nonlinear + Optimization Problems + + +
+ Optimization problems with nonlinear cost functions and combinatorial +constraints appear in many real-world applications but remain challenging to +solve efficiently compared to their linear counterparts. To bridge this gap, we +propose $\textbf{SurCo}$ that learns linear $\underline{\text{Sur}}$rogate +costs which can be used in existing $\underline{\text{Co}}$mbinatorial solvers +to output good solutions to the original nonlinear combinatorial optimization +problem. The surrogate costs are learned end-to-end with nonlinear loss by +differentiating through the linear surrogate solver, combining the flexibility +of gradient-based methods with the structure of linear combinatorial +optimization. We propose three $\texttt{SurCo}$ variants: +$\texttt{SurCo}-\texttt{zero}$ for individual nonlinear problems, +$\texttt{SurCo}-\texttt{prior}$ for problem distributions, and +$\texttt{SurCo}-\texttt{hybrid}$ to combine both distribution and +problem-specific information. We give theoretical intuition motivating +$\texttt{SurCo}$, and evaluate it empirically. Experiments show that +$\texttt{SurCo}$ finds better solutions faster than state-of-the-art and domain +expert approaches in real-world optimization problems such as embedding table +sharding, inverse photonic design, and nonlinear route planning. + +
+
+
+
+
+ + ♻ ☆ ConCerNet: A Contrastive Learning Based Framework for Automated + Conservation Law Discovery and Trustworthy Dynamical System Prediction ICML 2023 + + +
+ Deep neural networks (DNN) have shown great capacity of modeling a dynamical +system; nevertheless, they usually do not obey physics constraints such as +conservation laws. This paper proposes a new learning framework named ConCerNet +to improve the trustworthiness of the DNN based dynamics modeling to endow the +invariant properties. ConCerNet consists of two steps: (i) a contrastive +learning method to automatically capture the system invariants (i.e. +conservation properties) along the trajectory observations; (ii) a neural +projection layer to guarantee that the learned dynamics models preserve the +learned invariants. We theoretically prove the functional relationship between +the learned latent representation and the unknown system invariant function. +Experiments show that our method consistently outperforms the baseline neural +networks in both coordinate error and conservation metrics by a large margin. +With neural network based parameterization and no dependence on prior +knowledge, our method can be extended to complex and large-scale dynamics by +leveraging an autoencoder. + +
+
+ comment: Accepted by ICML 2023 +
+
+
+
+
+ + ♻ ☆ CREPE: Learnable Prompting With CLIP Improves Visual Relationship + Prediction + + +
+ In this paper, we explore the potential of Vision-Language Models (VLMs),
+ specifically CLIP, in predicting visual object relationships, which involves
+ interpreting visual features from images into language-based relations.
+ Current state-of-the-art methods use complex graphical models that utilize
+ language cues and visual features to address this challenge. We hypothesize
+ that the strong language priors in CLIP embeddings can simplify these
+ graphical models, paving the way for a simpler approach. We adopt the UVTransE
+ relation prediction framework, which learns the relation as a translational
+ embedding with subject, object, and union box embeddings from a scene. We
+ systematically explore the design of CLIP-based subject, object, and union-box
+ representations within the UVTransE framework and propose CREPE (CLIP
+ Representation Enhanced Predicate Estimation). CREPE utilizes text-based
+ representations for all three bounding boxes and introduces a novel
+ contrastive training strategy to automatically infer the text prompt for the
+ union box. Our approach achieves state-of-the-art performance in predicate
+ estimation, with mR@5 of 27.79 and mR@20 of 31.95 on the Visual Genome
+ benchmark, a 15.3\% gain over the recent state-of-the-art at mR@20. This work
+ demonstrates CLIP's effectiveness in object relation prediction and encourages
+ further research on VLMs in this challenging domain.
+
+
+
+
+
+ + ♻ ☆ An exponentially-growing family of universal quantum circuits + + +
+ Quantum machine learning has become an area of growing interest but has
+ certain theoretical and hardware-specific limitations. Notably, the problem of
+ vanishing gradients, or barren plateaus, renders training impossible for
+ circuits with high qubit counts, imposing a limit on the number of qubits that
+ data scientists can use for solving problems. Independently, angle-embedded
+ supervised quantum neural networks were shown to produce truncated Fourier
+ series whose degree depends directly on two factors: the depth of the encoding
+ and the number of parallel qubits the encoding is applied to. The degree of
+ the Fourier series limits the model expressivity. This work introduces two new
+ architectures whose Fourier degrees grow exponentially: the sequential and
+ parallel exponential quantum machine learning architectures. This is achieved
+ by efficiently using the available Hilbert space when encoding, increasing the
+ expressivity of the quantum encoding. The exponential growth therefore allows
+ one to stay in the low-qubit regime while creating highly expressive circuits
+ that avoid barren plateaus. In practice, the parallel exponential architecture
+ was shown to outperform the existing linear architectures by reducing their
+ final mean square error by up to 44.7% in a one-dimensional test problem.
+ Furthermore, the feasibility of this technique was also demonstrated on a
+ trapped-ion quantum processing unit.
+
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Uncovering Bias in Personal Informatics + + +
+ Personal informatics (PI) systems, powered by smartphones and wearables, +enable people to lead healthier lifestyles by providing meaningful and +actionable insights that break down barriers between users and their health +information. Today, such systems are used by billions of users for monitoring +not only physical activity and sleep but also vital signs and women's and heart +health, among others. Despite their widespread usage, the processing of +sensitive PI data may suffer from biases, which may entail practical and +ethical implications. In this work, we present the first comprehensive +empirical and analytical study of bias in PI systems, including biases in raw +data and in the entire machine learning life cycle. We use the most detailed +framework to date for exploring the different sources of bias and find that +biases exist both in the data generation and the model learning and +implementation streams. According to our results, the most affected minority +groups are users with health issues, such as diabetes, joint issues, and +hypertension, and female users, whose data biases are propagated or even +amplified by learning models, while intersectional biases can also be observed. + +
+
+
+
+
+ + ♻ ☆ Beyond Accuracy: A Critical Review of Fairness in Machine Learning for + Mobile and Wearable Computing + + +
+ The field of mobile and wearable computing is undergoing a revolutionary
+ integration of machine learning. Devices can now diagnose diseases, predict
+ heart irregularities, and unlock the full potential of human cognition.
+ However, the underlying algorithms powering these predictions are not immune
+ to biases with respect to sensitive attributes (e.g., gender, race), leading
+ to discriminatory outcomes. The goal of this work is to explore the extent to
+ which the mobile and wearable computing community has adopted ways of
+ reporting information about datasets and models to surface and, eventually,
+ counter biases. Our systematic review of papers published in the Proceedings
+ of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies
+ (IMWUT) journal from 2018-2022 indicates that, while there has been progress
+ on algorithmic fairness, there is still ample room for growth. Our findings
+ show that only a small portion (5%) of published papers adheres to modern
+ fairness reporting, while the overwhelming majority focuses on accuracy or
+ error metrics. To generalize these results across venues of similar scope, we
+ analyzed recent proceedings of ACM MobiCom, MobiSys, and SenSys, IEEE
+ Pervasive Computing, and IEEE Transactions on Mobile Computing, and found no
+ deviation from our primary result. In light of these findings, our work
+ provides practical guidelines for the design and development of mobile and
+ wearable technologies that strive not only for accuracy but also for fairness.
+
+
+
+
+
+ + ♻ ☆ Finite-Time Analysis of Natural Actor-Critic for POMDPs + + +
+ We consider the reinforcement learning problem for partially observed Markov +decision processes (POMDPs) with large or even countably infinite state spaces, +where the controller has access to only noisy observations of the underlying +controlled Markov chain. We consider a natural actor-critic method that employs +a finite internal memory for policy parameterization, and a multi-step temporal +difference learning algorithm for policy evaluation. We establish, to the best +of our knowledge, the first non-asymptotic global convergence of actor-critic +methods for partially observed systems under function approximation. In +particular, in addition to the function approximation and statistical errors +that also arise in MDPs, we explicitly characterize the error due to the use of +finite-state controllers. This additional error is stated in terms of the total +variation distance between the traditional belief state in POMDPs and the +posterior distribution of the hidden state when using a finite-state +controller. Further, we show that this error can be made small in the case of +sliding-block controllers by using larger block sizes. + +
+
+
+
+
+ + ♻ ☆ Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event + Localization + + +
+ Audio-Visual Event Localization (AVEL) is the task of temporally localizing +and classifying \emph{audio-visual events}, i.e., events simultaneously visible +and audible in a video. In this paper, we solve AVEL in a weakly-supervised +setting, where only video-level event labels (their presence/absence, but not +their locations in time) are available as supervision for training. Our idea is +to use a base model to estimate labels on the training data at a finer temporal +resolution than at the video level and re-train the model with these labels. +I.e., we determine the subset of labels for each \emph{slice} of frames in a +training video by (i) replacing the frames outside the slice with those from a +second video having no overlap in video-level labels, and (ii) feeding this +synthetic video into the base model to extract labels for just the slice in +question. To handle the out-of-distribution nature of our synthetic videos, we +propose an auxiliary objective for the base model that induces more reliable +predictions of the localized event labels as desired. Our three-stage pipeline +outperforms several existing AVEL methods with no architectural changes and +improves performance on a related weakly-supervised task as well. + +
+
+
+
+
+ + ♻ ☆ Planning to Fairly Allocate: Probabilistic Fairness in the Restless + Bandit Setting + + +
+ Restless and collapsing bandits are often used to model budget-constrained +resource allocation in settings where arms have action-dependent transition +probabilities, such as the allocation of health interventions among patients. +However, state-of-the-art Whittle-index-based approaches to this planning +problem either do not consider fairness among arms, or incentivize fairness +without guaranteeing it. We thus introduce ProbFair, a probabilistically fair +policy that maximizes total expected reward and satisfies the budget constraint +while ensuring a strictly positive lower bound on the probability of being +pulled at each timestep. We evaluate our algorithm on a real-world application, +where interventions support continuous positive airway pressure (CPAP) therapy +adherence among patients, as well as on a broader class of synthetic transition +matrices. We find that ProbFair preserves utility while providing fairness +guarantees. + +
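+ The fairness notion here is concrete enough to sketch: every arm keeps a pull
+ probability of at least a lower bound ell at each timestep, while the expected
+ number of pulls matches the budget. The snippet below is a simplified
+ illustration of such a constraint (not the ProbFair policy itself), allocating
+ the remaining budget greedily by an arbitrary score.
+
+ import numpy as np
+
+ def fair_pull_probabilities(scores, budget, ell):
+     n = len(scores)
+     assert 0 < ell <= budget / n <= 1, "lower bound must be feasible"
+     p = np.full(n, ell, dtype=float)          # every arm gets at least ell
+     remaining = budget - n * ell
+     for i in np.argsort(scores)[::-1]:        # highest score first
+         add = min(1.0 - p[i], remaining)
+         p[i] += add
+         remaining -= add
+         if remaining <= 0:
+             break
+     return p                                  # sums to `budget`, each p_i >= ell
+
+ print(fair_pull_probabilities(np.array([0.9, 0.1, 0.5, 0.7, 0.2]), budget=2, ell=0.1))
+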
+
+
+
+
+ + ♻ ☆ Sionna RT: Differentiable Ray Tracing for Radio Propagation Modeling + + +
+ Sionna is a GPU-accelerated open-source library for link-level simulations +based on TensorFlow. Since release v0.14 it integrates a differentiable ray +tracer (RT) for the simulation of radio wave propagation. This unique feature +allows for the computation of gradients of the channel impulse response and +other related quantities with respect to many system and environment +parameters, such as material properties, antenna patterns, array geometries, as +well as transmitter and receiver orientations and positions. In this paper, we +outline the key components of Sionna RT and showcase example applications such +as learning radio materials and optimizing transmitter orientations by gradient +descent. While classic ray tracing is a crucial tool for 6G research topics +like reconfigurable intelligent surfaces, integrated sensing and +communications, as well as user localization, differentiable ray tracing is a +key enabler for many novel and exciting research directions, for example, +digital twins. + +
+
+ comment: 5 pages, 5 figures, update reflects new features of Sionna RT + introduced in release v0.15 +
+
+
+
+
+ + ♻ ☆ Data Augmentation is a Hyperparameter: Cherry-picked Self-Supervision + for Unsupervised Anomaly Detection is Creating the Illusion of Success + + +
+ Self-supervised learning (SSL) has emerged as a promising alternative for
+ creating supervisory signals for real-world problems, avoiding the extensive
+ cost of manual labeling. SSL is particularly attractive for unsupervised tasks
+ such as anomaly detection (AD), where labeled anomalies are rare or often
+ nonexistent. A large catalog of augmentation functions has been used for
+ SSL-based AD (SSAD) on image data, and recent works have reported that the
+ type of augmentation has a significant impact on accuracy. Motivated by these
+ findings, this work sets out to put image-based SSAD under a larger lens and
+ investigate the role of data augmentation in SSAD. Through extensive
+ experiments on 3 different detector models and across 420 AD tasks, we provide
+ comprehensive numerical and visual evidence that the alignment between the
+ data augmentation and the anomaly-generating mechanism is the key to the
+ success of SSAD, and that, in its absence, SSL may even impair accuracy. To
+ the best of our knowledge, this is the first meta-analysis on the role of data
+ augmentation in SSAD.
+
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Tackling Provably Hard Representative Selection via Graph Neural + Networks + + +
+ Representative Selection (RS) is the problem of finding a small subset of
+ exemplars from a dataset that is representative of the dataset. In this paper,
+ we study RS for attributed graphs, and focus on finding representative nodes
+ that optimize the accuracy of a model trained on the selected representatives.
+ Theoretically, we establish a new hardness result for RS (in the absence of a
+ graph structure) by proving that a particular, highly practical variant of it
+ (RS for Learning) is hard to approximate in polynomial time within any
+ reasonable factor, which implies a significant potential gap between the
+ optimum solution of widely-used surrogate functions and the actual accuracy of
+ the model. We then study the setting where a (homophilous) graph structure is
+ available, or can be constructed, between the data points. We show that with
+ an appropriate modeling approach, the presence of such a structure can turn a
+ hard RS (for learning) problem into one that can be effectively solved. To
+ this end, we develop RS-GNN, a representation-learning-based RS model built on
+ Graph Neural Networks. Empirically, we demonstrate the effectiveness of RS-GNN
+ on problems with predefined graph structures as well as problems with graphs
+ induced from node feature similarities, by showing that RS-GNN achieves
+ significant improvements over established baselines on a suite of eight
+ benchmarks.
+
+
+ comment: Accepted at the Transactions of Machine Learning Research (TMLR) + Journal +
+
+
+
+
+ + ♻ ☆ Towards the Sparseness of Projection Head in Self-Supervised Learning + + +
+ In recent years, self-supervised learning (SSL) has emerged as a promising +approach for extracting valuable representations from unlabeled data. One +successful SSL method is contrastive learning, which aims to bring positive +examples closer while pushing negative examples apart. Many current contrastive +learning approaches utilize a parameterized projection head. Through a +combination of empirical analysis and theoretical investigation, we provide +insights into the internal mechanisms of the projection head and its +relationship with the phenomenon of dimensional collapse. Our findings +demonstrate that the projection head enhances the quality of representations by +performing contrastive loss in a projected subspace. Therefore, we propose an +assumption that only a subset of features is necessary when minimizing the +contrastive loss of a mini-batch of data. Theoretical analysis further suggests +that a sparse projection head can enhance generalization, leading us to +introduce SparseHead - a regularization term that effectively constrains the +sparsity of the projection head, and can be seamlessly integrated with any +self-supervised learning (SSL) approaches. Our experimental results validate +the effectiveness of SparseHead, demonstrating its ability to improve the +performance of existing contrastive methods. + +
+
+ comment: 9 pages,3 figures +
+
+
+
+
+ + ♻ ☆ On sampling determinantal and Pfaffian point processes on a quantum + computer + + +
+ Determinantal point processes (DPPs) were introduced by Macchi as a model in
+ quantum optics in the 1970s. Since then, they have been widely used as models
+ and subsampling tools in statistics and computer science. Most applications
+ require sampling from a DPP, and given their quantum origin, it is natural to
+ wonder whether sampling a DPP on a quantum computer is easier than on a
+ classical one. We focus here on DPPs over a finite state space, which are
+ distributions over the subsets of $\{1,\dots,N\}$ parametrized by an
+ $N\times N$ Hermitian kernel matrix. Vanilla sampling consists of two steps,
+ of respective costs $\mathcal{O}(N^3)$ and $\mathcal{O}(Nr^2)$ operations on a
+ classical computer, where $r$ is the rank of the kernel matrix. A large first
+ part of the current paper consists in explaining why the state of the art in
+ quantum simulation of fermionic systems already yields quantum DPP sampling
+ algorithms. We then modify existing quantum circuits, and discuss their
+ insertion in a full DPP sampling pipeline that starts from practical kernel
+ specifications. The bottom line is that, with $P$ (classical) parallel
+ processors, we can divide the preprocessing cost by $P$ and build a quantum
+ circuit with $\mathcal{O}(Nr)$ gates that samples a given DPP, with depth
+ varying from $\mathcal{O}(N)$ to $\mathcal{O}(r\log N)$ depending on
+ qubit-communication constraints on the target machine. We also connect
+ existing work on the simulation of superconductors to Pfaffian point
+ processes, which generalize DPPs and would be a natural addition to the
+ machine learner's toolbox. Finally, the circuits are empirically validated on
+ a classical simulator and on 5-qubit machines.
+
+
+ comment: 48 pages, 8 figures. Additional results about parity of cardinality + of PfPP samples +
+
+
+
+
+ + ♻ ☆ M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models + and Latent Space Geometry Optimization MICCAI 2023 + + +
+ Medical vision-language models enable co-learning and integrating features +from medical imaging and clinical text. However, these models are not easy to +train and the latent representation space can be complex. Here we propose a +novel way for pre-training and regularising medical vision-language models. The +proposed method, named Medical vision-language pre-training with Frozen +language models and Latent spAce Geometry optimization (M-FLAG), leverages a +frozen language model for training stability and efficiency and introduces a +novel orthogonality loss to harmonize the latent space geometry. We demonstrate +the potential of the pre-trained model on three downstream tasks: medical image +classification, segmentation, and object detection. Extensive experiments +across five public datasets demonstrate that M-FLAG significantly outperforms +existing medical vision-language pre-training approaches and reduces the number +of parameters by 78\%. Notably, M-FLAG achieves outstanding performance on the +segmentation task while using only 1\% of the RSNA dataset, even outperforming +ImageNet pre-trained models that have been fine-tuned using 100\% of the data. + +
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Generalization Error Bounds for Noisy, Iterative Algorithms via Maximal + Leakage + + +
+ We adopt an information-theoretic framework to analyze the generalization +behavior of the class of iterative, noisy learning algorithms. This class is +particularly suitable for study under information-theoretic metrics as the +algorithms are inherently randomized, and it includes commonly used algorithms +such as Stochastic Gradient Langevin Dynamics (SGLD). Herein, we use the +maximal leakage (equivalently, the Sibson mutual information of order infinity) +metric, as it is simple to analyze, and it implies both bounds on the +probability of having a large generalization error and on its expected value. +We show that, if the update function (e.g., gradient) is bounded in $L_2$-norm +and the additive noise is isotropic Gaussian noise, then one can obtain an +upper-bound on maximal leakage in semi-closed form. Furthermore, we demonstrate +how the assumptions on the update function affect the optimal (in the sense of +minimizing the induced maximal leakage) choice of the noise. Finally, we +compute explicit tight upper bounds on the induced maximal leakage for other +scenarios of interest. + +
+
+ comment: Updated to fix an error in Theorem 4 (asymptotic analysis) +
+
+
+
+
+ + ♻ ☆ Revisiting Softmax for Uncertainty Approximation in Text Classification + + +
+ Uncertainty approximation in text classification is an important area with +applications in domain adaptation and interpretability. One of the most widely +used uncertainty approximation methods is Monte Carlo (MC) Dropout, which is +computationally expensive as it requires multiple forward passes through the +model. A cheaper alternative is to simply use the softmax based on a single +forward pass without dropout to estimate model uncertainty. However, prior work +has indicated that these predictions tend to be overconfident. In this paper, +we perform a thorough empirical analysis of these methods on five datasets with +two base neural architectures in order to identify the trade-offs between the +two. We compare both softmax and an efficient version of MC Dropout on their +uncertainty approximations and downstream text classification performance, +while weighing their runtime (cost) against performance (benefit). We find +that, while MC dropout produces the best uncertainty approximations, using a +simple softmax leads to competitive and in some cases better uncertainty +estimation for text classification at a much lower computational cost, +suggesting that softmax can in fact be a sufficient uncertainty estimate when +computational resources are a concern. + +
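+ The comparison itself is easy to reproduce on any dropout-equipped classifier;
+ the sketch below (a toy PyTorch model, not the paper's setup) contrasts the
+ single-pass softmax confidence with the Monte Carlo Dropout estimate that
+ averages several stochastic forward passes.
+
+ import torch
+ import torch.nn as nn
+
+ model = nn.Sequential(nn.Linear(100, 64), nn.ReLU(), nn.Dropout(0.5), nn.Linear(64, 4))
+ x = torch.rand(8, 100)
+
+ # (1) Single-pass softmax confidence: one deterministic forward pass.
+ model.eval()
+ with torch.no_grad():
+     probs = torch.softmax(model(x), dim=-1)
+ softmax_uncertainty = 1.0 - probs.max(dim=-1).values
+
+ # (2) MC Dropout: keep dropout active and average several stochastic passes.
+ model.train()    # leaves dropout enabled at "inference" time
+ with torch.no_grad():
+     mc = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(20)])
+ mc_uncertainty = 1.0 - mc.mean(dim=0).max(dim=-1).values
+ print(softmax_uncertainty, mc_uncertainty)   # the MC estimate costs ~20x more compute
+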
+
+
+
+
+ + ♻ ☆ BOF-UCB: A Bayesian-Optimistic Frequentist Algorithm for Non-Stationary + Contextual Bandits + + +
+ We propose a novel Bayesian-Optimistic Frequentist Upper Confidence Bound +(BOF-UCB) algorithm for stochastic contextual linear bandits in non-stationary +environments. This unique combination of Bayesian and frequentist principles +enhances adaptability and performance in dynamic settings. The BOF-UCB +algorithm utilizes sequential Bayesian updates to infer the posterior +distribution of the unknown regression parameter, and subsequently employs a +frequentist approach to compute the Upper Confidence Bound (UCB) by maximizing +the expected reward over the posterior distribution. We provide theoretical +guarantees of BOF-UCB's performance and demonstrate its effectiveness in +balancing exploration and exploitation on synthetic datasets and classical +control tasks in a reinforcement learning setting. Our results show that +BOF-UCB outperforms existing methods, making it a promising solution for +sequential decision-making in non-stationary environments. + +
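+ A rough sketch of the Bayesian-then-frequentist pattern described above (this
+ is not the BOF-UCB pseudocode; the UCB width `beta` and the linear-Gaussian
+ model are assumptions): a Gaussian posterior over the regression parameter is
+ updated sequentially, and each arm is scored by an upper confidence bound
+ computed from that posterior.
+
+ import numpy as np
+
+ d, beta, noise = 5, 2.0, 1.0            # context dim, UCB width, noise variance
+ mu, Sigma = np.zeros(d), np.eye(d)      # Gaussian prior on the parameter
+
+ def update(mu, Sigma, x, reward):
+     """Sequential Bayesian update for a linear-Gaussian reward model."""
+     Sigma_new = np.linalg.inv(np.linalg.inv(Sigma) + np.outer(x, x) / noise)
+     mu_new = Sigma_new @ (np.linalg.inv(Sigma) @ mu + x * reward / noise)
+     return mu_new, Sigma_new
+
+ def ucb(mu, Sigma, x):
+     return x @ mu + beta * np.sqrt(x @ Sigma @ x)
+
+ contexts = np.random.rand(3, d)         # one context vector per arm
+ arm = int(np.argmax([ucb(mu, Sigma, x) for x in contexts]))
+ mu, Sigma = update(mu, Sigma, contexts[arm], reward=1.0)
+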
+
+
+
+
+ + ♻ ☆ AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for + Survival Outcome Prediction from PET/CT Images + + +
+ Survival prediction is a major concern for cancer management. Deep survival +models based on deep learning have been widely adopted to perform end-to-end +survival prediction from medical images. Recent deep survival models achieved +promising performance by jointly performing tumor segmentation with survival +prediction, where the models were guided to extract tumor-related information +through Multi-Task Learning (MTL). However, these deep survival models have +difficulties in exploring out-of-tumor prognostic information. In addition, +existing deep survival models are unable to effectively leverage multi-modality +images. Empirically-designed fusion strategies were commonly adopted to fuse +multi-modality information via task-specific manually-designed networks, thus +limiting the adaptability to different scenarios. In this study, we propose an +Adaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival +prediction from PET/CT images. Instead of adopting MTL, we propose a novel +Segmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained +for tumor segmentation and survival prediction sequentially in two stages. This +strategy enables the AdaMSS to focus on tumor regions in the first stage and +gradually expand its focus to include other prognosis-related regions in the +second stage. We also propose a data-driven strategy to fuse multi-modality +information, which realizes adaptive optimization of fusion strategies based on +training data during training. With the SSL and data-driven fusion strategies, +our AdaMSS is designed as an adaptive model that can self-adapt its focus +regions and fusion strategy for different training stages. Extensive +experiments with two large clinical datasets show that our AdaMSS outperforms +state-of-the-art survival prediction methods. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Alpha-divergence Variational Inference Meets Importance Weighted + Auto-Encoders: Methodology and Asymptotics + + +
+ Several algorithms involving the Variational R\'enyi (VR) bound have been +proposed to minimize an alpha-divergence between a target posterior +distribution and a variational distribution. Despite promising empirical +results, those algorithms resort to biased stochastic gradient descent +procedures and thus lack theoretical guarantees. In this paper, we formalize +and study the VR-IWAE bound, a generalization of the Importance Weighted +Auto-Encoder (IWAE) bound. We show that the VR-IWAE bound enjoys several +desirable properties and notably leads to the same stochastic gradient descent +procedure as the VR bound in the reparameterized case, but this time by relying +on unbiased gradient estimators. We then provide two complementary theoretical +analyses of the VR-IWAE bound and thus of the standard IWAE bound. Those +analyses shed light on the benefits or lack thereof of these bounds. Lastly, we +illustrate our theoretical claims over toy and real-data examples. + +
+
+
+
+
+ + ♻ ☆ MixPath: A Unified Approach for One-shot Neural Architecture Search ICCV2023 + + +
+ Blending multiple convolutional kernels has proved advantageous in neural
+ architecture design. However, current two-stage neural architecture search
+ methods are mainly limited to single-path search spaces. How to efficiently
+ search models with multi-path structures remains a difficult problem. In this
+ paper, we are motivated to train a one-shot multi-path supernet to accurately
+ evaluate the candidate architectures. Specifically, we discover that in the
+ studied search spaces, feature vectors summed from multiple paths are nearly
+ multiples of those from a single path. Such disparity perturbs the supernet
+ training and its ranking ability. Therefore, we propose a novel mechanism
+ called Shadow Batch Normalization (SBN) to regularize the disparate feature
+ statistics. Extensive experiments prove that SBNs are capable of stabilizing
+ the optimization and improving ranking performance. We call our unified
+ multi-path one-shot approach MixPath; it generates a series of models that
+ achieve state-of-the-art results on ImageNet.
+
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ LongNet: Scaling Transformers to 1,000,000,000 Tokens + + +
+ Scaling sequence length has become a critical demand in the era of large
+ language models. However, existing methods struggle with either computational
+ complexity or model expressivity, leaving the maximum sequence length
+ restricted. To address this issue, we introduce LongNet, a Transformer variant
+ that can scale sequence length to more than 1 billion tokens, without
+ sacrificing performance on shorter sequences. Specifically, we propose dilated
+ attention, which expands the attentive field exponentially as the distance
+ grows. LongNet has significant advantages: 1) it has linear computation
+ complexity and a logarithmic dependency between any two tokens in a sequence;
+ 2) it can serve as a distributed trainer for extremely long sequences; 3) its
+ dilated attention is a drop-in replacement for standard attention and can be
+ seamlessly integrated with existing Transformer-based optimization.
+ Experimental results demonstrate that LongNet yields strong performance on
+ both long-sequence modeling and general language tasks. Our work opens up new
+ possibilities for modeling very long sequences, e.g., treating a whole corpus
+ or even the entire Internet as a sequence.
+
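+ Dilated attention is the one mechanism the abstract names, so a deliberately
+ simplified sketch is given below (single head, one segment length, one
+ dilation rate; the actual LongNet mixes several segment lengths and dilation
+ rates and shards segments across devices):
+
+ import torch
+
+ def dilated_attention(q, k, v, segment_len=8, dilation=2):
+     """q, k, v: (batch, seq_len, dim); seq_len divisible by segment_len."""
+     b, n, d = q.shape
+     out = torch.zeros_like(q)
+     for start in range(0, n, segment_len):
+         idx = torch.arange(start, start + segment_len, dilation)  # sparse slots
+         qs, ks, vs = q[:, idx], k[:, idx], v[:, idx]
+         attn = torch.softmax(qs @ ks.transpose(-2, -1) / d ** 0.5, dim=-1)
+         out[:, idx] = attn @ vs
+     return out   # positions skipped by the dilation stay zero in this toy version
+
+ x = torch.rand(2, 32, 16)
+ print(dilated_attention(x, x, x).shape)   # torch.Size([2, 32, 16])
+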
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ The Meta-Evaluation Problem in Explainable AI: Identifying Reliable + Estimators with MetaQuantus + + +
+ One of the unsolved challenges in the field of Explainable AI (XAI) is +determining how to most reliably estimate the quality of an explanation method +in the absence of ground truth explanation labels. Resolving this issue is of +utmost importance as the evaluation outcomes generated by competing evaluation +methods (or ''quality estimators''), which aim at measuring the same property +of an explanation method, frequently present conflicting rankings. Such +disagreements can be challenging for practitioners to interpret, thereby +complicating their ability to select the best-performing explanation method. We +address this problem through a meta-evaluation of different quality estimators +in XAI, which we define as ''the process of evaluating the evaluation method''. +Our novel framework, MetaQuantus, analyses two complementary performance +characteristics of a quality estimator: its resilience to noise and reactivity +to randomness, thus circumventing the need for ground truth labels. We +demonstrate the effectiveness of our framework through a series of experiments, +targeting various open questions in XAI such as the selection and +hyperparameter optimisation of quality estimators. Our work is released under +an open-source license (https://github.com/annahedstroem/MetaQuantus) to serve +as a development tool for XAI- and Machine Learning (ML) practitioners to +verify and benchmark newly constructed quality estimators in a given +explainability context. With this work, we provide the community with clear and +theoretically-grounded guidance for identifying reliable evaluation methods, +thus facilitating reproducibility in the field. + +
+
+ comment: 35 pages, 15 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Multimodal brain age estimation using interpretable adaptive + population-graph learning MICCAI 2023 + + +
+ Brain age estimation is clinically important as it can provide valuable +information in the context of neurodegenerative diseases such as Alzheimer's. +Population graphs, which include multimodal imaging information of the subjects +along with the relationships among the population, have been used in literature +along with Graph Convolutional Networks (GCNs) and have proved beneficial for a +variety of medical imaging tasks. A population graph is usually static and +constructed manually using non-imaging information. However, graph construction +is not a trivial task and might significantly affect the performance of the +GCN, which is inherently very sensitive to the graph structure. In this work, +we propose a framework that learns a population graph structure optimized for +the downstream task. An attention mechanism assigns weights to a set of imaging +and non-imaging features (phenotypes), which are then used for edge extraction. +The resulting graph is used to train the GCN. The entire pipeline can be +trained end-to-end. Additionally, by visualizing the attention weights that +were the most important for the graph construction, we increase the +interpretability of the graph. We use the UK Biobank, which provides a large +variety of neuroimaging and non-imaging phenotypes, to evaluate our method on +brain age regression and classification. The proposed method outperforms +competing static graph approaches and other state-of-the-art adaptive methods. +We further show that the assigned attention scores indicate that there are both +imaging and non-imaging phenotypes that are informative for brain age +estimation and are in agreement with the relevant literature. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ IntelliGraphs: Datasets for Benchmarking Knowledge Graph Generation + + +
+ Knowledge Graph Embedding (KGE) models are used to learn continuous +representations of entities and relations. A key task in the literature is +predicting missing links between entities. However, Knowledge Graphs are not +just sets of links but also have semantics underlying their structure. +Semantics is crucial in several downstream tasks, such as query answering or +reasoning. We introduce the subgraph inference task, where a model has to +generate likely and semantically valid subgraphs. We propose IntelliGraphs, a +set of five new Knowledge Graph datasets. The IntelliGraphs datasets contain +subgraphs with semantics expressed in logical rules for evaluating subgraph +inference. We also present the dataset generator that produced the synthetic +datasets. We designed four novel baseline models, which include three models +based on traditional KGEs. We evaluate their expressiveness and show that these +models cannot capture the semantics. We believe this benchmark will encourage +the development of machine learning models that emphasize semantic +understanding. + +
+
+
+
+
+ + ♻ ☆ CB-HVTNet: A channel-boosted hybrid vision transformer network for + lymphocyte assessment in histopathological images + + +
+ Transformers, due to their ability to learn long range dependencies, have +overcome the shortcomings of convolutional neural networks (CNNs) for global +perspective learning. Therefore, they have gained the focus of researchers for +several vision related tasks including medical diagnosis. However, their +multi-head attention module only captures global level feature representations, +which is insufficient for medical images. To address this issue, we propose a +Channel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning +to generate boosted channels and employs both transformers and CNNs to analyse +lymphocytes in histopathological images. The proposed CB HVT comprises five +modules, including a channel generation module, channel exploitation module, +channel merging module, region-aware module, and a detection and segmentation +head, which work together to effectively identify lymphocytes. The channel +generation module uses the idea of channel boosting through transfer learning +to extract diverse channels from different auxiliary learners. In the CB HVT, +these boosted channels are first concatenated and ranked using an attention +mechanism in the channel exploitation module. A fusion block is then utilized +in the channel merging module for a gradual and systematic merging of the +diverse boosted channels to improve the network's learning representations. The +CB HVT also employs a proposal network in its region aware module and a head to +effectively identify objects, even in overlapping regions and with artifacts. +We evaluated the proposed CB HVT on two publicly available datasets for +lymphocyte assessment in histopathological images. The results show that CB HVT +outperformed other state of the art detection models, and has good +generalization ability, demonstrating its value as a tool for pathologists. + +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding Adversarial + Transferability From Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21 + pages, 12 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Does Circuit Analysis Interpretability Scale? Evidence from Multiple + Choice Capabilities in Chinchilla + + +
+ \emph{Circuit analysis} is a promising technique for understanding the +internal mechanisms of language models. However, existing analyses are done in +small models far from the state of the art. To address this, we present a case +study of circuit analysis in the 70B Chinchilla model, aiming to test the +scalability of circuit analysis. In particular, we study multiple-choice +question answering, and investigate Chinchilla's capability to identify the +correct answer \emph{label} given knowledge of the correct answer \emph{text}. +We find that the existing techniques of logit attribution, attention pattern +visualization, and activation patching naturally scale to Chinchilla, allowing +us to identify and categorize a small set of `output nodes' (attention heads +and MLPs). + We further study the `correct letter' category of attention heads aiming to +understand the semantics of their features, with mixed results. For normal +multiple-choice question answers, we significantly compress the query, key and +value subspaces of the head without loss of performance when operating on the +answer labels for multiple-choice questions, and we show that the query and key +subspaces represent an `Nth item in an enumeration' feature to at least some +extent. However, when we attempt to use this explanation to understand the +heads' behaviour on a more general distribution including randomized answer +labels, we find that it is only a partial explanation, suggesting there is more +to learn about the operation of `correct letter' heads on multiple choice +question answering. + +
+
+
+
+
+ + ♻ ☆ The RL Perceptron: Generalisation Dynamics of Policy Learning in High + Dimensions + + +
+ Reinforcement learning (RL) algorithms have proven transformative in a range +of domains. To tackle real-world domains, these systems often use neural +networks to learn policies directly from pixels or other high-dimensional +sensory input. By contrast, much theory of RL has focused on discrete state +spaces or worst-case analysis, and fundamental questions remain about the +dynamics of policy learning in high-dimensional settings. Here, we propose a +solvable high-dimensional model of RL that can capture a variety of learning +protocols, and derive its typical dynamics as a set of closed-form ordinary +differential equations (ODEs). We derive optimal schedules for the learning +rates and task difficulty - analogous to annealing schemes and curricula during +training in RL - and show that the model exhibits rich behaviour, including +delayed learning under sparse rewards; a variety of learning regimes depending +on reward baselines; and a speed-accuracy trade-off driven by reward +stringency. Experiments on variants of the Procgen game "Bossfight" and Arcade +Learning Environment game "Pong" also show such a speed-accuracy trade-off in +practice. Together, these results take a step towards closing the gap between +theory and practice in high-dimensional RL. + +
+
+ comment: 10 pages, 7 figures, Preprint +
+
+
+
+
+ + ♻ ☆ Network-GIANT: Fully distributed Newton-type optimization via harmonic + Hessian consensus + + +
+ This paper considers the problem of distributed multi-agent learning, where
+the global aim is to minimize a sum of local objective (empirical loss)
+functions through local optimization and information exchange between
+neighbouring nodes. We introduce a Newton-type fully distributed optimization
+algorithm, Network-GIANT, which is based on GIANT, a federated learning
+algorithm that relies on a centralized parameter server. The Network-GIANT
+algorithm is designed via a combination of gradient tracking and a Newton-type
+iterative algorithm at each node, with consensus-based averaging of local
+gradient and Newton updates. We prove that our algorithm guarantees semi-global
+and exponential convergence to the exact solution over the network, assuming
+strongly convex and smooth loss functions. We provide empirical evidence of the
+superior convergence performance of Network-GIANT over other state-of-the-art
+distributed learning algorithms such as Network-DANE and Newton-Raphson
+Consensus.
+
+
+
+
+
+ + ♻ ☆ Improving Automated Hemorrhage Detection in Sparse-view Computed + Tomography via Deep Convolutional Neural Network based Artifact Reduction + + +
+ Purpose: Sparse-view computed tomography (CT) is an effective way to reduce
+dose by lowering the total number of views acquired, albeit at the expense of
+image quality, which, in turn, can impact the ability to detect diseases. We
+explore deep learning-based artifact reduction in sparse-view cranial CT scans
+and its impact on automated hemorrhage detection. Methods: We trained a U-Net
+for artifact reduction on simulated sparse-view cranial CT scans from 3000
+patients obtained from a public dataset and reconstructed with varying levels
+of sub-sampling. Additionally, we trained a convolutional neural network on
+fully sampled CT data from 17,545 patients for automated hemorrhage detection.
+We evaluated the classification performance using the area under the receiver
+operating characteristic curves (AUC-ROCs) with corresponding 95% confidence
+intervals (CIs) and the DeLong test, along with confusion matrices. The
+performance of the U-Net was compared to an analytical approach based on total
+variation (TV). Results: The U-Net was superior to unprocessed and TV-processed
+images with respect to image quality and automated hemorrhage diagnosis. With
+U-Net post-processing, the number of views can be reduced from 4096 views
+(AUC-ROC: 0.974; 95% CI: 0.972-0.976) to 512 views (0.973; 0.971-0.975) with
+minimal decrease in hemorrhage detection (P<.001) and to 256 views (0.967;
+0.964-0.969) with a slight performance decrease (P<.001). Conclusion: The
+results suggest that U-Net based artifact reduction substantially enhances
+automated hemorrhage detection in sparse-view cranial CTs. Our findings
+highlight that appropriate post-processing is crucial for optimal image quality
+and diagnostic accuracy while minimizing radiation dose.
+
+
+ comment: 11 pages, 6 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Making Substitute Models More Bayesian Can Enhance Transferability of + Adversarial Examples ICLR 2023 + + +
+ The transferability of adversarial examples across deep neural networks +(DNNs) is the crux of many black-box attacks. Many prior efforts have been +devoted to improving the transferability via increasing the diversity in inputs +of some substitute models. In this paper, by contrast, we opt for the diversity +in substitute models and advocate to attack a Bayesian model for achieving +desirable transferability. Deriving from the Bayesian formulation, we develop a +principled strategy for possible finetuning, which can be combined with many +off-the-shelf Gaussian posterior approximations over DNN parameters. Extensive +experiments have been conducted to verify the effectiveness of our method, on +common benchmark datasets, and the results demonstrate that our method +outperforms recent state-of-the-arts by large margins (roughly 19% absolute +increase in average attack success rate on ImageNet), and, by combining with +these recent methods, further performance gain can be obtained. Our code: +https://github.com/qizhangli/MoreBayesian-attack. + +
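+ The idea of attacking a Bayesian substitute can be sketched roughly as
+averaging input gradients over several weight samples before taking an
+FGSM-style step; the isotropic Gaussian posterior and all parameter values
+below are illustrative assumptions rather than the paper's exact procedure.
+
+import copy
+import torch
+
+def bayesian_fgsm(model, x, y, loss_fn, eps=8 / 255, n_samples=10, sigma=0.01):
+    """Average the input gradient over models drawn from a simple Gaussian
+    posterior centred at the substitute's weights, then take one signed step."""
+    grad_sum = torch.zeros_like(x)
+    for _ in range(n_samples):
+        sampled = copy.deepcopy(model)
+        with torch.no_grad():
+            for p in sampled.parameters():
+                p.add_(sigma * torch.randn_like(p))   # one posterior sample
+        x_adv = x.clone().requires_grad_(True)
+        loss_fn(sampled(x_adv), y).backward()
+        grad_sum += x_adv.grad.detach()
+    return (x + eps * grad_sum.sign()).clamp(0, 1)    # assumes [0, 1] images
+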
+
+ comment: Accepted by ICLR 2023, fix typos +
+
+
+
+
+ + ♻ ☆ Can In-context Learners Learn a Reasoning Concept from Demonstrations? ACL 2023 + + +
+ Language models exhibit an emergent ability to learn a new task from a small +number of input-output demonstrations. However, recent work shows that +in-context learners largely rely on their pre-trained knowledge, such as the +sentiment of the labels, instead of learning new associations from the input. +We argue that the commonly-used few-shot evaluation using a random selection of +in-context demonstrations can not disentangle models' reliance on such biases, +as most of the randomly-selected demonstrations do not present relations +informative for prediction beyond exposing the task's input-output +distribution. + Therefore, to evaluate models' in-context learning ability independent of +models' memory, we introduce a Concept-sharing few-shot learning method +choosing the demonstrations that share an underlying concept with the predicted +sample. We extract a set of such concepts from available human explanations and +measure how much models can benefit from presenting these concepts in few-shot +demonstrations. + We find that most of the recent in-context learners can not consistently +benefit from the demonstrated concepts, irrespective of the model size. +However, we note that T0 models are more sensitive to exhibited concepts, +benefiting from concept-sharing demonstrations in 7 out of 8 evaluation +scenarios. + +
+
+ comment: Awarded Best Paper at ACL 2023 Natural Language Reasoning and + Structured Explanations (NLRSE) workshop +
+
+
+
+
+ + ♻ ☆ Alternately Optimized Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have greatly advanced the semi-supervised node +classification task on graphs. The majority of existing GNNs are trained in an +end-to-end manner that can be viewed as tackling a bi-level optimization +problem. This process is often inefficient in computation and memory usage. In +this work, we propose a new optimization framework for semi-supervised learning +on graphs. The proposed framework can be conveniently solved by the alternating +optimization algorithms, resulting in significantly improved efficiency. +Extensive experiments demonstrate that the proposed method can achieve +comparable or better performance with state-of-the-art baselines while it has +significantly better computation and memory efficiency. + +
+
+
+
+
+ + ♻ ☆ Non-linear Embeddings in Hilbert Simplex Geometry + + +
+ A key technique of machine learning and computer vision is to embed discrete
+weighted graphs into continuous spaces for further downstream processing.
+Embedding discrete hierarchical structures in hyperbolic geometry has proven
+very successful since it was shown that any weighted tree can be embedded in
+that geometry with arbitrarily low distortion. Various optimization methods for
+hyperbolic embeddings based on common models of hyperbolic geometry have been
+studied. In this paper, we consider Hilbert geometry for the standard simplex,
+which is isometric to a vector space equipped with the variation polytope norm.
+We study the representation power of this Hilbert simplex geometry by embedding
+distance matrices of graphs. Our findings demonstrate that Hilbert simplex
+geometry is competitive with alternative geometries such as the Poincar\'e
+hyperbolic ball or Euclidean geometry for embedding tasks while being fast and
+numerically robust.
+
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large + Language Models + + +
+ Large Language Models (LLMs), despite their recent impressive +accomplishments, are notably cost-prohibitive to deploy, particularly for +applications involving long-content generation, such as dialogue systems and +story writing. Often, a large amount of transient state information, referred +to as the KV cache, is stored in GPU memory in addition to model parameters, +scaling linearly with the sequence length and batch size. In this paper, we +introduce a novel approach for implementing the KV cache which significantly +reduces its memory footprint. Our approach is based on the noteworthy +observation that a small portion of tokens contributes most of the value when +computing attention scores. We call these tokens Heavy Hitters (H$_2$). Through +a comprehensive investigation, we find that (i) the emergence of H$_2$ is +natural and strongly correlates with the frequent co-occurrence of tokens in +the text, and (ii) removing them results in significant performance +degradation. Based on these insights, we propose Heavy Hitter Oracle (H$_2$O), +a KV cache eviction policy that dynamically retains a balance of recent and +H$_2$ tokens. We formulate the KV cache eviction as a dynamic submodular +problem and prove (under mild assumptions) a theoretical guarantee for our +novel eviction algorithm which could help guide future work. We validate the +accuracy of our algorithm with OPT, LLaMA, and GPT-NeoX across a wide range of +tasks. Our implementation of H$_2$O with 20% heavy hitters improves the +throughput over three leading inference systems DeepSpeed Zero-Inference, +Hugging Face Accelerate, and FlexGen by up to 29$\times$, 29$\times$, and +3$\times$ on OPT-6.7B and OPT-30B. With the same batch size, H2O can reduce the +latency by up to 1.9$\times$. The code is available at +https://github.com/FMInference/H2O. + +
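+ A minimal sketch of the eviction decision described above, assuming the
+accumulated attention score is used to rank heavy hitters and a fixed number
+of recent tokens is always kept (parameter names and values are illustrative,
+not the released implementation):
+
+import numpy as np
+
+def h2o_keep_indices(attn_scores, budget=8, recent=4):
+    """attn_scores: (n_queries, n_cached) attention weights seen so far.
+    Returns sorted indices of cached tokens to retain: the most recent ones
+    plus the heaviest hitters, up to `budget` entries (budget > recent)."""
+    n_cached = attn_scores.shape[1]
+    acc = attn_scores.sum(axis=0)                     # accumulated attention
+    keep = set(range(max(0, n_cached - recent), n_cached))
+    for i in np.argsort(-acc):                        # heaviest hitters first
+        if len(keep) >= budget:
+            break
+        keep.add(int(i))
+    return sorted(keep)
+
+# usage: all other KV entries would be evicted from the cache.
+print(h2o_keep_indices(np.random.rand(4, 16)))
+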
+
+
+
+
+ + ♻ ☆ Retentive Network: A Successor to Transformer for Large Language Models + + +
+ In this work, we propose Retentive Network (RetNet) as a foundation
+architecture for large language models, simultaneously achieving training
+parallelism, low-cost inference, and good performance. We theoretically derive
+the connection between recurrence and attention. Then we propose the retention
+mechanism for sequence modeling, which supports three computation paradigms,
+i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel
+representation allows for training parallelism. The recurrent representation
+enables low-cost $O(1)$ inference, which improves decoding throughput, latency,
+and GPU memory without sacrificing performance. The chunkwise recurrent
+representation facilitates efficient long-sequence modeling with linear
+complexity, where each chunk is encoded in parallel while the chunks are
+summarized recurrently. Experimental results on language modeling show that
+RetNet achieves favorable scaling results, parallel training, low-cost
+deployment, and efficient inference. These intriguing properties make RetNet a
+strong successor to Transformer for large language models. Code will be
+available at https://aka.ms/retnet.
+
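+ As a rough illustration of the recurrent paradigm mentioned above, retention
+can be sketched as a constant-size state that is decayed and updated once per
+token. The decay value and dimensions below are illustrative assumptions, and
+the actual RetNet layer adds rotation and normalization details omitted here.
+
+import numpy as np
+
+def retention_recurrent(Q, K, V, gamma=0.97):
+    """Minimal sketch of recurrent retention:
+    S_n = gamma * S_{n-1} + K_n^T V_n,   o_n = Q_n S_n."""
+    seq_len, d_k = Q.shape
+    d_v = V.shape[1]
+    S = np.zeros((d_k, d_v))               # constant-size recurrent state
+    outputs = []
+    for n in range(seq_len):
+        S = gamma * S + np.outer(K[n], V[n])
+        outputs.append(Q[n] @ S)            # O(1) work per decoded token
+    return np.stack(outputs)
+
+# This recurrence matches a parallel form in which attention scores are masked
+# by decaying weights gamma^(n-m), which is what lets the same layer be trained
+# in parallel and decoded cheaply.
+Q = np.random.randn(8, 4); K = np.random.randn(8, 4); V = np.random.randn(8, 4)
+print(retention_recurrent(Q, K, V).shape)   # (8, 4)
+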
+
+
+
+
+ + ♻ ☆ Graph Positional Encoding via Random Feature Propagation ICML 2023 + + +
+ Two main families of node feature augmentation schemes have been explored for +enhancing GNNs: random features and spectral positional encoding. Surprisingly, +however, there is still no clear understanding of the relation between these +two augmentation schemes. Here we propose a novel family of positional encoding +schemes which draws a link between the above two approaches and improves over +both. The new approach, named Random Feature Propagation (RFP), is inspired by +the power iteration method and its generalizations. It concatenates several +intermediate steps of an iterative algorithm for computing the dominant +eigenvectors of a propagation matrix, starting from random node features. +Notably, these propagation steps are based on graph-dependent propagation +operators that can be either predefined or learned. We explore the theoretical +and empirical benefits of RFP. First, we provide theoretical justifications for +using random features, for incorporating early propagation steps, and for using +multiple random initializations. Then, we empirically demonstrate that RFP +significantly outperforms both spectral PE and random features in multiple node +classification and graph classification benchmarks. + +
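+ A minimal sketch of an RFP-style positional encoding, assuming a predefined
+symmetric-normalized adjacency as the propagation operator (the paper also
+allows learned operators); it starts from random node features and
+concatenates a few normalized propagation steps:
+
+import numpy as np
+
+def random_feature_propagation(adj, n_feats=8, n_steps=4, seed=0):
+    """Concatenate intermediate power-iteration-style propagation steps of
+    random node features into a positional encoding of size n_feats * n_steps."""
+    rng = np.random.default_rng(seed)
+    deg = adj.sum(axis=1)
+    d = np.where(deg > 0, deg ** -0.5, 0.0)
+    P = d[:, None] * adj * d[None, :]                 # D^{-1/2} A D^{-1/2}
+    X = rng.standard_normal((adj.shape[0], n_feats))
+    steps = []
+    for _ in range(n_steps):
+        X = P @ X
+        X = X / (np.linalg.norm(X, axis=0, keepdims=True) + 1e-12)
+        steps.append(X)
+    return np.concatenate(steps, axis=1)
+
+# usage: append the result to the node feature matrix before the GNN.
+A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)
+print(random_feature_propagation(A).shape)   # (3, 32)
+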
+
+ comment: ICML 2023 +
+
+
+
+
+ + ♻ ☆ Solvent: A Framework for Protein Folding + + +
+ Consistency and reliability are crucial for conducting AI research. Many
+well-known research fields, such as object detection, have been compared and
+validated with solid benchmark frameworks. After AlphaFold2, the protein
+folding task has entered a new phase, and many methods have been proposed based
+on components of AlphaFold2. A unified research framework for protein folding
+should provide implementations and benchmarks to consistently and fairly
+compare the various approaches. To achieve this, we present Solvent, a protein
+folding framework that supports significant components of state-of-the-art
+models through an off-the-shelf interface. Solvent contains different models
+implemented in a unified codebase and supports training and evaluation of the
+defined models on the same dataset. We benchmark well-known algorithms and
+their components and provide experiments that give helpful insights into the
+protein structure modeling field. We hope that Solvent will increase the
+reliability and consistency of proposed models and improve efficiency in both
+speed and cost, accelerating research on protein folding modeling. The code is
+available at https://github.com/kakaobrain/solvent, and the project will
+continue to be developed.
+
+
+ comment: preprint, 8pages +
+
+
+
+
+ + ♻ ☆ Off-Policy Average Reward Actor-Critic with Deterministic Policy Search ICML 2023 + + +
+ The average reward criterion is relatively less studied as most existing +works in the Reinforcement Learning literature consider the discounted reward +criterion. There are few recent works that present on-policy average reward +actor-critic algorithms, but average reward off-policy actor-critic is +relatively less explored. In this work, we present both on-policy and +off-policy deterministic policy gradient theorems for the average reward +performance criterion. Using these theorems, we also present an Average Reward +Off-Policy Deep Deterministic Policy Gradient (ARO-DDPG) Algorithm. We first +show asymptotic convergence analysis using the ODE-based method. Subsequently, +we provide a finite time analysis of the resulting stochastic approximation +scheme with linear function approximator and obtain an $\epsilon$-optimal +stationary policy with a sample complexity of $\Omega(\epsilon^{-2.5})$. We +compare the average reward performance of our proposed ARO-DDPG algorithm and +observe better empirical performance compared to state-of-the-art on-policy +average reward actor-critic algorithms over MuJoCo-based environments. + +
+
+ comment: Accepted at ICML 2023 +
+
+
+
+
+ + ♻ ☆ Trustworthy Recommender Systems + + +
+ Recommender systems (RSs) aim to help users effectively retrieve items of
+interest from a large catalogue. For a long time, researchers and practitioners
+have been focusing on developing accurate RSs. Recent years have witnessed an
+increasing number of threats to RSs, coming from attacks, system- and
+user-generated noise, and system bias. As a result, it has become clear that a
+strict focus on RS accuracy is limited and the research must consider other
+important factors, e.g., trustworthiness. For end users, a trustworthy RS (TRS)
+should not only be accurate, but also transparent, unbiased and fair as well as
+robust to noise or attacks. These observations have led to a paradigm shift in
+research on RSs: from accuracy-oriented RSs to TRSs. However, researchers lack
+a systematic overview and discussion of the literature in this novel and
+fast-developing field of TRSs. To this end, in this paper, we provide an
+overview of TRSs, including a discussion of the motivation and basic concepts
+of TRSs, a presentation of the challenges in building TRSs, and a perspective
+on the future directions in this area. We also provide a novel conceptual
+framework to support the construction of TRSs.
+
+
+
+
+
+ + ♻ ☆ SwinGNN: Rethinking Permutation Invariance in Diffusion Models for Graph + Generation + + +
+ Diffusion models based on permutation-equivariant networks can learn +permutation-invariant distributions for graph data. However, in comparison to +their non-invariant counterparts, we have found that these invariant models +encounter greater learning challenges since 1) their effective target +distributions exhibit more modes; 2) their optimal one-step denoising scores +are the score functions of Gaussian mixtures with more components. Motivated by +this analysis, we propose a non-invariant diffusion model, called +$\textit{SwinGNN}$, which employs an efficient edge-to-edge 2-WL message +passing network and utilizes shifted window based self-attention inspired by +SwinTransformers. Further, through systematic ablations, we identify several +critical training and sampling techniques that significantly improve the sample +quality of graph generation. At last, we introduce a simple post-processing +trick, $\textit{i.e.}$, randomly permuting the generated graphs, which provably +converts any graph generative model to a permutation-invariant one. Extensive +experiments on synthetic and real-world protein and molecule datasets show that +our SwinGNN achieves state-of-the-art performances. Our code is released at +https://github.com/qiyan98/SwinGNN. + +
+
+
+
+
+ + ♻ ☆ Meta-Learning Parameterized Skills + + +
+ We propose a novel parameterized skill-learning algorithm that aims to learn +transferable parameterized skills and synthesize them into a new action space +that supports efficient learning in long-horizon tasks. We propose to leverage +off-policy Meta-RL combined with a trajectory-centric smoothness term to learn +a set of parameterized skills. Our agent can use these learned skills to +construct a three-level hierarchical framework that models a +Temporally-extended Parameterized Action Markov Decision Process. We +empirically demonstrate that the proposed algorithms enable an agent to solve a +set of difficult long-horizon (obstacle-course and robot manipulation) tasks. + +
+
+
+
+
+ + ♻ ☆ AlpaServe: Statistical Multiplexing with Model Parallelism for Deep + Learning Serving OSDI 2023 + + +
+ Model parallelism is conventionally viewed as a method to scale a single +large deep learning model beyond the memory limits of a single device. In this +paper, we demonstrate that model parallelism can be additionally used for the +statistical multiplexing of multiple devices when serving multiple models, even +when a single model can fit into a single device. Our work reveals a +fundamental trade-off between the overhead introduced by model parallelism and +the opportunity to exploit statistical multiplexing to reduce serving latency +in the presence of bursty workloads. We explore the new trade-off space and +present a novel serving system, AlpaServe, that determines an efficient +strategy for placing and parallelizing collections of large deep learning +models across a distributed cluster. Evaluation results on production workloads +show that AlpaServe can process requests at up to 10x higher rates or 6x more +burstiness while staying within latency constraints for more than 99% of +requests. + +
+
+ comment: OSDI 2023 +
+
+
+
+
+ + ♻ ☆ A Conceptual Model for End-to-End Causal Discovery in Knowledge Tracing + + +
+ In this paper, we take a preliminary step towards solving the problem of +causal discovery in knowledge tracing, i.e., finding the underlying causal +relationship among different skills from real-world student response data. This +problem is important since it can potentially help us understand the causal +relationship between different skills without extensive A/B testing, which can +potentially help educators to design better curricula according to skill +prerequisite information. Specifically, we propose a conceptual solution, a +novel causal gated recurrent unit (GRU) module in a modified deep knowledge +tracing model, which uses i) a learnable permutation matrix for causal ordering +among skills and ii) an optionally learnable lower-triangular matrix for causal +structure among skills. We also detail how to learn the model parameters in an +end-to-end, differentiable way. Our solution placed among the top entries in +Task 3 of the NeurIPS 2022 Challenge on Causal Insights for Learning Paths in +Education. We detail preliminary experiments as evaluated on the challenge's +public leaderboard since the ground truth causal structure has not been +publicly released, making detailed local evaluation impossible. + +
+
+ comment: 16th International Conference on Educational Data Mining (EDM 2023) +
+
+
+
+
+ + ♻ ☆ Outline, Then Details: Syntactically Guided Coarse-To-Fine Code + Generation ICML 2023 + + +
+ For a complicated algorithm, its implementation by a human programmer usually
+starts with outlining a rough control flow, followed by iterative enrichments,
+eventually yielding carefully generated syntactic structures and variables in a
+hierarchy. However, state-of-the-art large language models generate code in a
+single pass, without intermediate warm-ups to reflect the structured thought
+process of "outline-then-detail". Inspired by the recent success of
+chain-of-thought prompting, we propose ChainCoder, a program synthesis language
+model that generates Python code progressively, i.e., from coarse to fine in
+multiple passes. We first decompose source code into layout frame components
+and accessory components via abstract syntax tree parsing to construct a
+hierarchical representation. We then reformulate our prediction target as a
+multi-pass objective: each pass generates a subsequence, which is concatenated
+in the hierarchy. Finally, a tailored transformer architecture is leveraged to
+jointly encode the natural language descriptions and syntactically aligned I/O
+data samples. Extensive evaluations show that ChainCoder outperforms
+state-of-the-art methods, demonstrating that our progressive generation eases
+the reasoning procedure and guides the language model to generate
+higher-quality solutions. Our code is available at:
+https://github.com/VITA-Group/ChainCoder.
+
+
+ comment: Accepted in ICML 2023 +
+
+
+
+
+ + ♻ ☆ Distilling Large Vision-Language Model with Out-of-Distribution + Generalizability ICCV + + +
+ Large vision-language models have achieved outstanding performance, but their
+size and computational requirements make their deployment on
+resource-constrained devices and in time-sensitive tasks impractical. Model
+distillation, the process of creating smaller, faster models that maintain the
+performance of larger models, is a promising direction towards the solution.
+This paper investigates the distillation of visual representations in large
+teacher vision-language models into lightweight student models using a small-
+or mid-scale dataset. Notably, this study focuses on open-vocabulary
+out-of-distribution (OOD) generalization, a challenging problem that has been
+overlooked in previous model distillation literature. We propose two principles
+from the vision and language modality perspectives to enhance the student's OOD
+generalization: (1) better imitating the teacher's visual representation space
+and carefully promoting coherence in vision-language alignment with the
+teacher; (2) enriching the teacher's language representations with informative
+and fine-grained semantic attributes to effectively distinguish between
+different labels. We propose several metrics and conduct extensive experiments
+to investigate these techniques. The results demonstrate significant
+improvements in zero-shot and few-shot student performance on open-vocabulary
+out-of-distribution classification, highlighting the effectiveness of our
+proposed approaches. Code is released at
+https://github.com/xuanlinli17/large_vlm_distillation_ood
+
+
+ comment: Published at International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Invariant Aggregator for Defending against Federated Backdoor Attacks + + +
+ Federated learning is gaining popularity as it enables training high-utility
+models across several clients without directly sharing their private data. As a
+downside, the federated setting makes the model vulnerable to various
+adversarial attacks in the presence of malicious clients. Despite the
+theoretical and empirical success in defending against attacks that aim to
+degrade models' utility, defense against backdoor attacks that increase model
+accuracy on backdoor samples exclusively, without hurting the utility on other
+samples, remains challenging. To this end, we first analyze the vulnerability
+of federated learning to backdoor attacks over a flat loss landscape, which is
+common for well-designed neural networks such as ResNet [He et al., 2015] but
+is often overlooked by previous works. Over a flat loss landscape, misleading
+federated learning models to exclusively benefit malicious clients with
+backdoor samples does not require a significant difference between malicious
+and benign client-wise updates, making existing defenses insufficient. In
+contrast, we propose an invariant aggregator that redirects the aggregated
+update to invariant directions that are generally useful, by selectively
+masking out the gradient elements that favor a few, possibly malicious, clients
+regardless of the difference magnitude. Theoretical results suggest that our
+approach provably mitigates backdoor attacks over both flat and sharp loss
+landscapes. Empirical results on three datasets with different modalities and
+varying numbers of clients further demonstrate that our approach mitigates a
+broad class of backdoor attacks with a negligible cost on the model utility.
+
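+ A minimal sketch of the masking idea described above, assuming a simple
+sign-agreement rule as the criterion for elements that favor only a few
+clients (the concrete rule and threshold here are assumptions, not the
+paper's exact procedure):
+
+import numpy as np
+
+def invariant_aggregate(client_updates, min_agreement=0.8):
+    """client_updates: (n_clients, n_params) array of client-wise updates.
+    Elements whose sign is not shared by at least `min_agreement` of the
+    clients are zeroed out regardless of their magnitude."""
+    mean_update = client_updates.mean(axis=0)
+    signs = np.sign(client_updates)
+    agreement = np.abs(signs.sum(axis=0)) / client_updates.shape[0]
+    mask = (agreement >= min_agreement).astype(mean_update.dtype)
+    return mean_update * mask
+
+# usage: a backdoor direction pushed hard by two clients but not supported by
+# the rest is masked out even though its magnitude dominates the average.
+updates = np.vstack([np.ones((8, 4)), [[-100.0, 1, 1, 1], [-100.0, 1, 1, 1]]])
+print(invariant_aggregate(updates))
+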
+
+
+
+
+ + ♻ ☆ Representing Random Utility Choice Models with Neural Networks + + +
+ Motivated by the successes of deep learning, we propose a class of neural +network-based discrete choice models, called RUMnets, inspired by the random +utility maximization (RUM) framework. This model formulates the agents' random +utility function using a sample average approximation. We show that RUMnets +sharply approximate the class of RUM discrete choice models: any model derived +from random utility maximization has choice probabilities that can be +approximated arbitrarily closely by a RUMnet. Reciprocally, any RUMnet is +consistent with the RUM principle. We derive an upper bound on the +generalization error of RUMnets fitted on choice data, and gain theoretical +insights on their ability to predict choices on new, unseen data depending on +critical parameters of the dataset and architecture. By leveraging open-source +libraries for neural networks, we find that RUMnets are competitive against +several choice modeling and machine learning methods in terms of predictive +accuracy on two real-world datasets. + +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ TbExplain: A Text-based Explanation Method for Scene Classification + Models with the Statistical Prediction Correction + + +
+ The field of Explainable Artificial Intelligence (XAI) aims to improve the
+interpretability of black-box machine learning models. Building a heatmap based
+on the importance values of input features is a popular method for explaining
+the underlying functions of such models in producing their predictions.
+Heatmaps are generally understandable to humans, yet they are not without
+flaws. Non-expert users, for example, may not fully understand the logic of
+heatmaps (the logic by which pixels relevant to the model's prediction are
+highlighted with different intensities or colors). Additionally, objects and
+regions of the input image that are relevant to the model prediction are
+frequently not entirely differentiated by heatmaps. In this paper, we propose a
+framework called TbExplain that employs XAI techniques and a pre-trained object
+detector to present text-based explanations of scene classification models.
+Moreover, TbExplain incorporates a novel method to correct predictions and
+textually explain them based on the statistics of objects in the input image
+when the initial prediction is unreliable. To assess the trustworthiness and
+validity of the text-based explanations, we conducted a qualitative experiment,
+and the findings indicated that these explanations are sufficiently reliable.
+Furthermore, our quantitative and qualitative experiments on TbExplain with
+scene classification datasets reveal an improvement in classification accuracy
+over ResNet variants.
+
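+ The statistics-based correction step might look roughly like the following
+sketch; the confidence threshold, the overlap score and all names are
+assumptions for illustration rather than the exact TbExplain procedure.
+
+def correct_prediction(probs, detected_objects, class_object_stats,
+                       conf_threshold=0.6):
+    """probs: dict class -> softmax probability from the scene classifier.
+    detected_objects: set of object labels found by the object detector.
+    class_object_stats: dict class -> dict object -> frequency in that class.
+    If the classifier is unsure, fall back to the scene class whose typical
+    objects best overlap what the detector actually found."""
+    top_class = max(probs, key=probs.get)
+    if probs[top_class] >= conf_threshold:
+        return top_class                       # initial prediction is trusted
+    def overlap(cls):
+        stats = class_object_stats.get(cls, {})
+        return sum(stats.get(obj, 0.0) for obj in detected_objects)
+    return max(class_object_stats, key=overlap)
+
+# usage: an unsure "kitchen" prediction is corrected to "bathroom" because the
+# detected objects (sink, toilet, towel) are far more typical of bathrooms.
+print(correct_prediction(
+    {"kitchen": 0.4, "bathroom": 0.35},
+    {"sink", "toilet", "towel"},
+    {"kitchen": {"sink": 0.7, "oven": 0.9},
+     "bathroom": {"sink": 0.9, "toilet": 0.95, "towel": 0.8}}))
+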
+
+
+
+
+ + ☆ AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point + Clouds of Deformable Objects + + +
+ This paper focuses on motion prediction for point cloud sequences in the
+challenging case of deformable 3D objects, such as human body motion. First, we
+investigate the challenges caused by deformable shapes and complex motions
+present in this type of representation, with the ultimate goal of understanding
+the technical limitations of state-of-the-art models. From this understanding,
+we propose an improved architecture for point cloud prediction of deformable 3D
+objects. Specifically, to handle deformable shapes, we propose a graph-based
+approach that learns and exploits the spatial structure of point clouds to
+extract more representative features. Then we propose a module able to combine
+the learned features in an adaptive manner according to the point cloud
+movements. The proposed adaptive module controls the composition of local and
+global motions for each point, enabling the network to model complex motions in
+deformable 3D objects more effectively. We tested the proposed method on the
+following datasets: MNIST moving digits, the Mixamo human body motions, and the
+JPEG and CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate
+that our method outperforms the current baseline methods given its improved
+ability to model complex movements as well as preserve point cloud shape.
+Furthermore, we demonstrate the generalizability of the proposed framework for
+dynamic feature learning by testing the framework for action recognition on the
+MSRAction3D dataset and achieving results on par with state-of-the-art methods.
+
+
+
+
+
+ + ☆ Embedded Heterogeneous Attention Transformer for Cross-lingual Image + Captioning + + +
+ Cross-lingual image captioning is confronted with both cross-lingual and
+cross-modal challenges for multimedia analysis. The crucial issue in this task
+is to model the global and local matching between the image and different
+languages. Existing cross-modal embedding methods based on the Transformer
+architecture overlook the local matching between image regions and monolingual
+words, let alone across a variety of differentiated languages. Due to the
+heterogeneous nature of this cross-modal and cross-lingual task, we utilize a
+heterogeneous network to establish cross-domain relationships and local
+correspondences between the image and different languages. In this paper, we
+propose an Embedded Heterogeneous Attention Transformer (EHAT) that builds
+cross-domain reasoning paths for cross-lingual image captioning and integrates
+them into the Transformer. The proposed EHAT consists of Masked Heterogeneous
+Cross-attention (MHCA), a Heterogeneous Attention Reasoning Network (HARN) and
+Heterogeneous Co-attention (HCA). HARN, as the core network, models and infers
+cross-domain relationships anchored by visual bounding-box representation
+features to connect the word features of the two languages and learn the
+heterogeneous maps. MHCA and HCA implement cross-domain integration in the
+encoder through the special heterogeneous attention and enable a single model
+to generate captions in two languages. We test on the MSCOCO dataset,
+generating English and Chinese captions, two widely used languages with an
+obvious difference between their language families. Our experiments show that
+our method even performs better than advanced monolingual methods.
+
+
+
+
+
+ + ☆ Hierarchical Semantic Perceptual Listener Head Video Generation: A + High-performance Pipeline ACM MM 2023 + + +
+ In dyadic speaker-listener interactions, the listener's head reactions along +with the speaker's head movements, constitute an important non-verbal semantic +expression together. The listener Head generation task aims to synthesize +responsive listener's head videos based on audios of the speaker and reference +images of the listener. Compared to the Talking-head generation, it is more +challenging to capture the correlation clues from the speaker's audio and +visual information. Following the ViCo baseline scheme, we propose a +high-performance solution by enhancing the hierarchical semantic extraction +capability of the audio encoder module and improving the decoder part, renderer +and post-processing modules. Our solution gets the first place on the official +leaderboard for the track of listening head generation. This paper is a +technical report of ViCo@2023 Conversational Head Generation Challenge in ACM +Multimedia 2023 conference. + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ NTIRE 2023 Quality Assessment of Video Enhancement Challenge + + +
+ This paper reports on the NTIRE 2023 Quality Assessment of Video Enhancement +Challenge, which will be held in conjunction with the New Trends in Image +Restoration and Enhancement Workshop (NTIRE) at CVPR 2023. This challenge is to +address a major challenge in the field of video processing, namely, video +quality assessment (VQA) for enhanced videos. The challenge uses the VQA +Dataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211 +enhanced videos, including 600 videos with color, brightness, and contrast +enhancements, 310 videos with deblurring, and 301 deshaked videos. The +challenge has a total of 167 registered participants. 61 participating teams +submitted their prediction results during the development phase, with a total +of 3168 submissions. A total of 176 submissions were submitted by 37 +participating teams during the final testing phase. Finally, 19 participating +teams submitted their models and fact sheets, and detailed the methods they +used. Some methods have achieved better results than baseline methods, and the +winning methods have demonstrated superior prediction performance. + +
+
+
+
+
+ + ☆ Estudio de la Experiencia de Usuario mediante un Sistema de Dashboards + de Análisis de Aprendizaje Multimodal + + +
+ In this article, we present a Web-based system called M2LADS, which supports
+the integration and visualization of multimodal data recorded in user
+experiences (UX) in a Learning Analytics (LA) system in the form of Web-based
+dashboards. Based on the edBB platform, the multimodal data gathered contain
+biometric and behavioral signals, including electroencephalogram data to
+measure learners' cognitive attention, heart rate for affective measures, and
+visual attention from the video recordings. Additionally, learners' static
+background data and their learning performance measures are tracked using the
+LOGGE tool. M2LADS provides opportunities to capture learners' holistic
+experience during their interactions with the learning analytics system in
+order to improve the system and the user experience of the learners.
+ --
+ In this article, we present M2LADS, a system that enables the integration and
+visualization of multimodal data in the form of Web dashboards. These data come
+from user experience sessions on a Learning Analytics (LA) system carried out
+by MOOC learners. The multimodal data include biometric and behavioral signals
+monitored by the edBB platform, such as 5-channel electroencephalograms (EEG),
+heart rate, visual attention, and video in the visible and NIR spectra, among
+others. In addition, data on the learners' interaction with the LA system are
+included via the LOGGE tool. All of this information provides a complete
+understanding of the user experience when using the LA system, which has made
+it possible to improve both the LA system and the learning experience of the
+MOOC learners.
+
+
+ comment: Accepted in "XXIII CONGRESO INTERNACIONAL DE INTERACCI\'ON + PERSONA-ORDENADOR 2023". Article in Spanish language. The abstract in English + and Spanish. There is an extended abstract of 2 pages in English +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Videos: A Survey + + +
+ The remarkable success of deep learning in various domains relies on the
+availability of large-scale annotated datasets. However, obtaining annotations
+is expensive and requires great effort, which is especially challenging for
+videos. Moreover, the use of human-generated annotations leads to models with
+biased learning and poor domain generalization and robustness. As an
+alternative, self-supervised learning provides a way of learning
+representations that does not require annotations and has shown promise in both
+the image and video domains. Different from the image domain, learning video
+representations is more challenging due to the temporal dimension, which brings
+in motion and other environmental dynamics. This also provides opportunities
+for video-exclusive ideas that advance self-supervised learning in the video
+and multimodal domain. In this survey, we provide a review of existing
+approaches to self-supervised learning, focusing on the video domain. We
+summarize these methods into four different categories based on their learning
+objectives: 1) pretext tasks, 2) generative learning, 3) contrastive learning,
+and 4) cross-modal agreement. We further introduce the commonly used datasets,
+downstream evaluation tasks, insights into the limitations of existing works,
+and the potential future directions in this area.
+
+
+ comment: ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Speech Emotion Recognition + + +
+ Contrastive learning based cross-modality pretraining methods have recently +exhibited impressive success in diverse fields. In this paper, we propose +GEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio +pretraining (CLAP) method for speech emotion recognition. Specifically, a novel +emotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised +pre-trained models. Second, considering the importance of gender attribute in +speech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and +multi-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to +integrate the emotion and gender information of speech signals, forming more +reasonable objectives. Extensive experiments on IEMOCAP show that our proposed +two GEmo-CLAP models consistently outperform the baseline Emo-CLAP with +different pre-trained models, while also achieving the best recognition +performance compared with recent state-of-the-art methods. Noticeably, the +proposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\% and WAR of +82.06\%. + +
+
+ comment: 5 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 46 + +
+
+
+ + ☆ Overthinking the Truth: Understanding how Language Models Process False + Demonstrations + + +
+ Modern language models can imitate complex patterns through few-shot +learning, enabling them to complete challenging tasks without fine-tuning. +However, imitation can also lead models to reproduce inaccuracies or harmful +content if present in the context. We study harmful imitation through the lens +of a model's internal representations, and identify two related phenomena: +overthinking and false induction heads. The first phenomenon, overthinking, +appears when we decode predictions from intermediate layers, given correct vs. +incorrect few-shot demonstrations. At early layers, both demonstrations induce +similar model behavior, but the behavior diverges sharply at some "critical +layer", after which the accuracy given incorrect demonstrations progressively +decreases. The second phenomenon, false induction heads, are a possible +mechanistic cause of overthinking: these are heads in late layers that attend +to and copy false information from previous demonstrations, and whose ablation +reduces overthinking. Beyond scientific understanding, our results suggest that +studying intermediate model computations could be a promising avenue for +understanding and guarding against harmful model behaviors. + +
+
+
+
+
+ + ☆ ChatSpot: Bootstrapping Multimodal LLMs via Precise Referring + Instruction Tuning + + +
+ Human-AI interactivity is a critical aspect that reflects the usability of
+multimodal large language models (MLLMs). However, existing end-to-end MLLMs
+only allow users to interact with them through language instructions, which
+limits interactive accuracy and efficiency. In this study, we present precise
+referring instructions that utilize diverse reference representations, such as
+points and boxes, as referring prompts to refer to specific regions. This
+enables MLLMs to focus on the region of interest and achieve finer-grained
+interaction. Based on precise referring instructions, we propose ChatSpot, a
+unified end-to-end multimodal large language model that supports diverse forms
+of interactivity, including mouse clicks, drag-and-drop, and drawing boxes,
+which provides a more flexible and seamless interactive experience. We also
+construct a multi-grained vision-language instruction-following dataset based
+on existing datasets and GPT-4 generation. Furthermore, we design a series of
+evaluation tasks to assess the effectiveness of region recognition and
+interaction. Experimental results showcase ChatSpot's promising performance.
+
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ☆ A comparative analysis of SR-GAN models + + +
+ In this study, we evaluate the performance of multiple state-of-the-art SR GAN
+(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN
+and EDSR, on a benchmark dataset of real-world images which are degraded using
+a pipeline. Our results show that some models seem to significantly increase
+the resolution of the input images while preserving their visual quality; this
+is assessed using the Tesseract OCR engine. We observe that the EDSR-BASE model
+from Hugging Face outperforms the remaining candidate models in terms of both
+quantitative metrics and subjective visual quality assessments, with the least
+compute overhead. Specifically, EDSR generates images with higher peak
+signal-to-noise ratio (PSNR) and structural similarity index (SSIM) values and
+is seen to return high-quality OCR results with the Tesseract OCR engine. These
+findings suggest that EDSR is a robust and effective approach for single-image
+super-resolution and may be particularly well-suited for applications where
+high-quality visual fidelity and optimized compute are critical.
+
+
+ comment: 9 pages, 6 tables, 2 figures +
+
+
+
+
+ + ☆ Pseudo Outlier Exposure for Out-of-Distribution Detection using + Pretrained Transformers + + +
+ For real-world language applications, detecting an out-of-distribution (OOD) +sample is helpful to alert users or reject such unreliable samples. However, +modern over-parameterized language models often produce overconfident +predictions for both in-distribution (ID) and OOD samples. In particular, +language models suffer from OOD samples with a similar semantic representation +to ID samples since these OOD samples lie near the ID manifold. A rejection +network can be trained with ID and diverse outlier samples to detect test OOD +samples, but explicitly collecting auxiliary OOD datasets brings an additional +burden for data collection. In this paper, we propose a simple but effective +method called Pseudo Outlier Exposure (POE) that constructs a surrogate OOD +dataset by sequentially masking tokens related to ID classes. The surrogate OOD +sample introduced by POE shows a similar representation to ID data, which is +most effective in training a rejection network. Our method does not require any +external OOD data and can be easily implemented within off-the-shelf +Transformers. A comprehensive comparison with state-of-the-art algorithms +demonstrates POE's competitiveness on several text classification benchmarks. + +
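+ A toy sketch of the surrogate-OOD construction described above; here the
+tokens "related to ID classes" are approximated by a fixed keyword set, which
+is an assumption (the method derives this signal from the ID model itself):
+
+def make_pseudo_outlier(tokens, class_keywords, mask_token="[MASK]", max_masks=3):
+    """Sequentially mask tokens that reveal the in-distribution class, producing
+    a sample that stays close to the ID manifold but is harder to classify."""
+    masked = list(tokens)
+    n_masked = 0
+    for i, tok in enumerate(masked):
+        if n_masked >= max_masks:
+            break
+        if tok.lower() in class_keywords:
+            masked[i] = mask_token
+            n_masked += 1
+    return masked
+
+# usage: the masked sentence can then serve as a surrogate OOD example when
+# training a rejection network alongside the original ID data.
+print(make_pseudo_outlier(["The", "movie", "was", "wonderful"],
+                          class_keywords={"wonderful", "terrible"}))
+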
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation + Evaluation + + +
+ Research in Image Generation has recently made significant progress, +particularly boosted by the introduction of Vision-Language models which are +able to produce high-quality visual content based on textual inputs. Despite +ongoing advancements in terms of generation quality and realism, no methodical +frameworks have been defined yet to quantitatively measure the quality of the +generated content and the adherence with the prompted requests: so far, only +human-based evaluations have been adopted for quality satisfaction and for +comparing different generative methods. We introduce a novel automated method +for Visual Concept Evaluation (ViCE), i.e. to assess consistency between a +generated/edited image and the corresponding prompt/instructions, with a +process inspired by the human cognitive behaviour. ViCE combines the strengths +of Large Language Models (LLMs) and Visual Question Answering (VQA) into a +unified pipeline, aiming to replicate the human cognitive process in quality +assessment. This method outlines visual concepts, formulates image-specific +verification questions, utilizes the Q&A system to investigate the image, and +scores the combined outcome. Although this brave new hypothesis of mimicking +humans in the image evaluation process is in its preliminary assessment stage, +results are promising and open the door to a new form of automatic evaluation +which could have significant impact as the image generation or the image target +editing tasks become more and more sophisticated. + +
+
+
+
+
+ + ☆ Zero-shot Query Reformulation for Conversational Search ICTIR 2023 + + +
+ As the popularity of voice assistants continues to surge, conversational +search has gained increased attention in Information Retrieval. However, data +sparsity issues in conversational search significantly hinder the progress of +supervised conversational search methods. Consequently, researchers are +focusing more on zero-shot conversational search approaches. Nevertheless, +existing zero-shot methods face three primary limitations: they are not +universally applicable to all retrievers, their effectiveness lacks sufficient +explainability, and they struggle to resolve common conversational ambiguities +caused by omission. To address these limitations, we introduce a novel +Zero-shot Query Reformulation (ZeQR) framework that reformulates queries based +on previous dialogue contexts without requiring supervision from conversational +search data. Specifically, our framework utilizes language models designed for +machine reading comprehension tasks to explicitly resolve two common +ambiguities: coreference and omission, in raw queries. In comparison to +existing zero-shot methods, our approach is universally applicable to any +retriever without additional adaptation or indexing. It also provides greater +explainability and effectively enhances query intent understanding because +ambiguities are explicitly and proactively resolved. Through extensive +experiments on four TREC conversational datasets, we demonstrate the +effectiveness of our method, which consistently outperforms state-of-the-art +baselines. + +
+
+ comment: Accepted by ICTIR 2023 +
+
+
+
+
+ + ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks. In contrast to traditional text-only methods, our approach to +labelling a comment as hate speech centers around the holistic analysis of text +and images. This is done by leveraging graph transformers to capture the +contextual relationships in the entire discussion that surrounds a comment, +with interwoven fusion layers to combine text and image embeddings instead of +processing different modalities separately. We compare the performance of our +model to baselines that only process text; we also conduct extensive ablation +studies. We conclude with future work for multimodal solutions to deliver +social value in online contexts, arguing that capturing a holistic view of a +conversation greatly advances the effort to detect anti-social behavior. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ☆ Llama 2: Open Foundation and Fine-Tuned Chat Models + + +
+ In this work, we develop and release Llama 2, a collection of pretrained and +fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 +billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for +dialogue use cases. Our models outperform open-source chat models on most +benchmarks we tested, and based on our human evaluations for helpfulness and +safety, may be a suitable substitute for closed-source models. We provide a +detailed description of our approach to fine-tuning and safety improvements of +Llama 2-Chat in order to enable the community to build on our work and +contribute to the responsible development of LLMs. + +
+
+
+
+
+ + ☆ Improving Text Semantic Similarity Modeling through a 3D Siamese Network + + +
+ Siamese networks have gained popularity as a method for modeling text semantic
+similarity. Traditional methods rely on a pooling operation to compress the
+semantic representations from Transformer blocks during encoding, resulting in
+two-dimensional semantic vectors and the loss of hierarchical semantic
+information from the Transformer blocks. Moreover, this limited structure of
+semantic vectors is akin to a flattened landscape, which restricts the methods
+that can be applied in downstream modeling, as they can only navigate this flat
+terrain. To address this issue, we propose a novel 3D Siamese network for text
+semantic similarity modeling, which maps semantic information to a
+higher-dimensional space. The three-dimensional semantic tensors not only
+retain more precise spatial and feature domain information but also provide the
+necessary structural conditions for comprehensive downstream modeling
+strategies to capture them. Leveraging this structural advantage, we introduce
+several modules to reinforce this 3D framework, focusing on three aspects:
+feature extraction, attention, and feature fusion. Our extensive experiments on
+four text semantic similarity benchmarks demonstrate the effectiveness and
+efficiency of our 3D Siamese Network.
+
+
+
+
+
+ + ☆ Linearized Relative Positional Encoding + + +
+ Relative positional encoding is widely used in vanilla and linear transformers
+to represent positional information. However, existing encoding methods of a
+vanilla transformer are not always directly applicable to a linear transformer,
+because the latter requires a decomposition of the query and key
+representations into separate kernel functions. Nevertheless, principles for
+designing encoding methods suitable for linear transformers remain
+understudied. In this work, we put together a variety of existing linear
+relative positional encoding approaches under a canonical form and further
+propose a family of linear relative positional encoding algorithms via unitary
+transformation. Our formulation leads to a principled framework that can be
+used to develop new relative positional encoding methods that preserve linear
+space-time complexity. Equipped with different models, the proposed linearized
+relative positional encoding (LRPE) family derives effective encodings for
+various applications. Experiments show that, compared with existing methods,
+LRPE achieves state-of-the-art performance in language modeling, text
+classification, and image classification. Meanwhile, it provides a general
+paradigm for designing a broader range of relative positional encoding methods
+that are applicable to linear transformers. The code is available at
+https://github.com/OpenNLPLab/Lrpe.
+
+
+ comment: Reviewed by TMLR, decision pending. Yiran Zhong is the corresponding + author. Code is available at https://github.com/OpenNLPLab/Lrpe +
+
+
+
+
+ + ☆ Text vectorization via transformer-based language models and n-gram + perplexities + + +
+ As the probability (and thus perplexity) of a text is calculated based on the +product of the probabilities of individual tokens, it may happen that one +unlikely token significantly reduces the probability (i.e., increases the +perplexity) of some otherwise highly probable input, while potentially +representing a simple typographical error. Also, given that perplexity is a +scalar value that refers to the entire input, information about the probability +distribution within it is lost in the calculation (a relatively good text that +has one unlikely token and a text in which each token is equally likely +can have the same perplexity value), especially for longer texts. As an +alternative to scalar perplexity, this research proposes a simple algorithm used +to calculate vector values based on n-gram perplexities within the input. Such +representations consider the previously mentioned aspects, and instead of a +single value, the relative perplexity of each text token is calculated, and +these values are combined into a single vector representing the input. + 
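A small sketch of the idea, assuming per-token log-probabilities are already available from some language model; the window size and aggregation below are illustrative choices, not necessarily the paper's exact algorithm.

```python
# Sketch: turn per-token log-probabilities into a vector of local n-gram
# perplexities instead of one scalar perplexity for the whole input.
# `token_logprobs` is assumed to come from any LM that can score tokens.
import math
from typing import List

def ngram_perplexity_vector(token_logprobs: List[float], n: int = 3) -> List[float]:
    """Perplexity of each sliding window of n consecutive tokens."""
    if len(token_logprobs) < n:
        n = max(len(token_logprobs), 1)
    vector = []
    for i in range(len(token_logprobs) - n + 1):
        window = token_logprobs[i:i + n]
        avg_nll = -sum(window) / n          # average negative log-likelihood
        vector.append(math.exp(avg_nll))    # local perplexity for this n-gram
    return vector

# One unlikely token (log-prob -9.0) inflates only nearby entries,
# instead of dominating a single scalar perplexity for the whole text.
logprobs = [-1.2, -0.8, -9.0, -1.0, -0.9, -1.1]
print(ngram_perplexity_vector(logprobs, n=3))
```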
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ PAC Neural Prediction Set Learning to Quantify the Uncertainty of + Generative Language Models + + +
+ Uncertainty learning and quantification of models are crucial tasks to +enhance the trustworthiness of the models. Importantly, the recent surge of +generative language models (GLMs) emphasizes the need for reliable uncertainty +quantification due to concerns about generating hallucinated facts. In this +paper, we propose to learn neural prediction set models that come with the +probably approximately correct (PAC) guarantee for quantifying the uncertainty +of GLMs. Unlike existing prediction set models, which are parameterized by a +scalar value, we propose to parameterize prediction sets via neural networks, +which achieves more precise uncertainty quantification but still satisfies the +PAC guarantee. We demonstrate the efficacy of our method on four types of +language datasets and six types of models by showing that our method improves +the quantified uncertainty by $63\%$ on average, compared to a standard +baseline method. + 
+
+
+
+
+ + ☆ UniTabE: Pretraining a Unified Tabular Encoder for Heterogeneous Tabular + Data + + +
+ Recent advancements in Natural Language Processing (NLP) have witnessed the +groundbreaking impact of pretrained models, yielding impressive outcomes across +various tasks. This study seeks to extend the power of pretraining +methodologies to tabular data, a domain traditionally overlooked, yet +inherently challenging due to the plethora of table schemas intrinsic to +different tasks. The primary research questions underpinning this work revolve +around the adaptation to heterogeneous table structures, the establishment of a +universal pretraining protocol for tabular data, the generalizability and +transferability of learned knowledge across tasks, the adaptation to diverse +downstream applications, and the incorporation of incremental columns over +time. In response to these challenges, we introduce UniTabE, a pioneering +method designed to process tables in a uniform manner, devoid of constraints +imposed by specific table structures. UniTabE's core concept relies on +representing each basic table element with a module, termed TabUnit. This is +subsequently followed by a Transformer encoder to refine the representation. +Moreover, our model is designed to facilitate pretraining and finetuning +through the utilization of free-form prompts. In order to implement the +pretraining phase, we curated an expansive tabular dataset comprising +approximately 13 billion samples, meticulously gathered from the Kaggle +platform. Rigorous experimental testing and analyses were performed under a +myriad of scenarios to validate the effectiveness of our methodology. The +experimental results demonstrate UniTabE's superior performance against several +baseline models across a multitude of benchmark datasets. This, therefore, +underscores UniTabE's potential to significantly enhance the semantic +representation of tabular data, thereby marking a significant stride in the +field of tabular data analysis. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Automated Ableism: An Exploration of Explicit Disability Biases in + Sentiment and Toxicity Analysis Models ACL 2023 + + +
+ We analyze sentiment analysis and toxicity detection models to detect the +presence of explicit bias against people with disability (PWD). We employ the +bias identification framework of Perturbation Sensitivity Analysis to examine +conversations related to PWD on social media platforms, specifically Twitter +and Reddit, in order to gain insight into how disability bias is disseminated +in real-world social settings. We then create the \textit{Bias Identification +Test in Sentiment} (BITS) corpus to quantify explicit disability bias in +sentiment analysis and toxicity detection models. Our study utilizes BITS to +uncover significant biases in four open AIaaS (AI as a Service) sentiment +analysis tools, namely TextBlob, VADER, Google Cloud Natural Language API, and +DistilBERT, and in two toxicity detection models, namely two versions of +Toxic-BERT. Our findings indicate that all of these models exhibit +statistically significant explicit bias against PWD. + 
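As an illustration of the perturbation-sensitivity idea, the sketch below swaps identity terms into otherwise identical sentences and measures the shift in a VADER compound sentiment score; the templates and term list are invented placeholders, not the BITS corpus, and the vaderSentiment package is assumed to be installed.

```python
# Sketch of a perturbation-sensitivity check: substitute a disability-related
# term into otherwise identical sentences and measure the shift in sentiment.
# Requires `pip install vaderSentiment`; templates/terms are illustrative only.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

templates = [
    "My neighbor is a {} person and we talk every day.",
    "A {} person applied for the job yesterday.",
]
terms = ["tall", "deaf", "blind", "wheelchair-using"]

for template in templates:
    scores = {t: analyzer.polarity_scores(template.format(t))["compound"] for t in terms}
    baseline = scores["tall"]                       # neutral reference term
    shifts = {t: round(s - baseline, 3) for t, s in scores.items()}
    print(template, shifts)                         # nonzero shifts suggest bias
```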
+
+ comment: TrustNLP at ACL 2023 +
+
+
+
+
+ + ☆ Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and + Addressing Sociological Implications + + +
+ Gender bias in artificial intelligence (AI) and natural language processing +has garnered significant attention due to its potential impact on societal +perceptions and biases. This research paper aims to analyze gender bias in +Large Language Models (LLMs) with a focus on multiple comparisons between GPT-2 +and GPT-3.5, two prominent language models, to better understand its +implications. Through a comprehensive literature review, the study examines +existing research on gender bias in AI language models and identifies gaps in +the current knowledge. The methodology involves collecting and preprocessing +data from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis +techniques to evaluate gender bias in the generated text. The findings shed +light on gendered word associations, language usage, and biased narratives +present in the outputs of these Large Language Models. The discussion explores +the ethical implications of gender bias and its potential consequences on +social perceptions and marginalized communities. Additionally, the paper +presents strategies for reducing gender bias in LLMs, including algorithmic +approaches and data augmentation techniques. The research highlights the +importance of interdisciplinary collaborations and the role of sociological +studies in mitigating gender bias in AI models. By addressing these issues, we +can pave the way for more inclusive and unbiased AI systems that have a +positive impact on society. + 
+
+
+
+
+ + ☆ Attention over pre-trained Sentence Embeddings for Long Document + Classification + + +
+ Despite being the current de-facto models in most NLP tasks, transformers are +often limited to short sequences due to their quadratic attention complexity in +the number of tokens. Several attempts to address this issue have been studied, +either by reducing the cost of the self-attention computation or by modeling +smaller sequences and combining them through a recurrence mechanism or a +new transformer model. In this paper, we propose taking advantage of +pre-trained sentence transformers to start from semantically meaningful +embeddings of the individual sentences, and then combine them through a small +attention layer that scales linearly with the document length. We report the +results obtained by this simple architecture on three standard document +classification datasets. When compared with the current state-of-the-art models +using standard fine-tuning, the studied method obtains competitive results +(even if there is no clear best model in this configuration). We also showcase +that the studied architecture obtains better results when freezing the +underlying transformers, a configuration that is useful when complete +fine-tuning must be avoided (e.g., when the same frozen transformer is shared by +different applications). Finally, two additional experiments are provided to +further evaluate the relevancy of the studied architecture over simpler +baselines. + 
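A rough sketch of such an architecture, assuming sentence embeddings from a frozen sentence transformer are already computed; the single learned query and linear classifier are illustrative design choices rather than the paper's exact head.

```python
# Sketch: a lightweight attention layer that pools pre-computed sentence
# embeddings into one document vector, scaling linearly with document length.
import torch
import torch.nn as nn

class SentenceAttentionClassifier(nn.Module):
    def __init__(self, embed_dim: int, num_classes: int):
        super().__init__()
        self.query = nn.Parameter(torch.randn(embed_dim))   # one learned query
        self.scale = embed_dim ** -0.5
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, sent_embs: torch.Tensor) -> torch.Tensor:
        # sent_embs: (num_sentences, embed_dim), e.g. from a frozen sentence transformer
        attn = torch.softmax((sent_embs @ self.query) * self.scale, dim=0)  # (num_sentences,)
        doc_vector = attn @ sent_embs                                       # (embed_dim,)
        return self.classifier(doc_vector)                                  # (num_classes,)

model = SentenceAttentionClassifier(embed_dim=384, num_classes=3)
doc = torch.randn(120, 384)   # 120 frozen sentence embeddings for one document
print(model(doc).shape)       # torch.Size([3])
```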
+
+
+
+
+ + ☆ Unleashing the Imagination of Text: A Novel Framework for Text-to-image + Person Retrieval via Exploring the Power of Words + + +
+ The goal of Text-to-image person retrieval is to retrieve person images from +a large gallery that match the given textual descriptions. The main challenge +of this task lies in the significant differences in information representation +between the visual and textual modalities. The textual modality conveys +abstract and precise information through vocabulary and grammatical structures, +while the visual modality conveys concrete and intuitive information through +images. To fully leverage the expressive power of textual representations, it +is essential to accurately map abstract textual descriptions to specific +images. + To address this issue, we propose a novel framework to Unleash the +Imagination of Text (UIT) in text-to-image person retrieval, aiming to fully +explore the power of words in sentences. Specifically, the framework employs +the pre-trained full CLIP model as a dual encoder for the images and texts, +taking advantage of prior cross-modal alignment knowledge. The Text-guided +Image Restoration auxiliary task is proposed with the aim of implicitly mapping +abstract textual entities to specific image regions, facilitating alignment +between textual and visual embeddings. Additionally, we introduce a cross-modal +triplet loss tailored for handling hard samples, enhancing the model's ability +to distinguish minor differences. + To focus the model on the key components within sentences, we propose a novel +text data augmentation technique. Our proposed methods achieve state-of-the-art +results on three popular benchmark datasets, and the source code will be made +publicly available shortly. + 
+
+
+
+
+ + ☆ Towards a Neural Era in Dialogue Management for Collaboration: A + Literature Survey + + +
+ Dialogue-based human-AI collaboration can revolutionize collaborative +problem-solving, creative exploration, and social support. To realize this +goal, the development of automated agents proficient in skills such as +negotiating, following instructions, establishing common ground, and +progressing shared tasks is essential. This survey begins by reviewing the +evolution of dialogue management paradigms in collaborative dialogue systems, +from traditional handcrafted and information-state based methods to AI +planning-inspired approaches. It then shifts focus to contemporary data-driven +dialogue management techniques, which seek to transfer deep learning successes +from form-filling and open-domain settings to collaborative contexts. The paper +proceeds to analyze a selected set of recent works that apply neural approaches +to collaborative dialogue management, spotlighting prevailing trends in the +field. This survey hopes to provide foundational background for future +advancements in collaborative dialogue management, particularly as the dialogue +systems community continues to embrace the potential of large language models. + +
+
+
+
+
+ + ☆ Exploring acceptance of autonomous vehicle policies using KeyBERT and + SNA: Targeting engineering students + + +
+ This study aims to explore user acceptance of Autonomous Vehicle (AV) +policies with improved text-mining methods. Recently, South Korean policymakers +have viewed Autonomous Driving Car (ADC) and Autonomous Driving Robot (ADR) as +next-generation means of transportation that will reduce the cost of +transporting passengers and goods. They support the construction of V2I and V2V +communication infrastructures for ADC and recognize that ADR is equivalent to +pedestrians to promote its deployment into sidewalks. To fill the gap where +end-user acceptance of these policies is not well considered, this study +applied two text-mining methods to the comments of graduate students in the +fields of Industrial, Mechanical, and Electronics-Electrical-Computer engineering. One is +the Co-occurrence Network Analysis (CNA) based on TF-IWF and the Dice coefficient, +and the other is the Contextual Semantic Network Analysis (C-SNA) based on both +KeyBERT, which extracts keywords that contextually represent the comments, and +double cosine similarity. The reason for comparing these approaches is to +balance interest not only in the implications for the AV policies but also in +the need to apply quality text mining to this research domain. Significantly, +the limitation of frequency-based text mining, which does not reflect textual +context, and the trade-off of adjusting thresholds in Semantic Network Analysis +(SNA) were considered. Comparing the two approaches, the +C-SNA provided the information necessary to understand users' voices using +fewer nodes and features than the CNA. The users who pre-emptively understood +the AV policies based on their engineering literacy and the given texts +revealed potential risks of the AV accident policies. This study adds +suggestions to manage these risks to support the successful deployment of AVs +on public roads. + 
+
+ comment: 29 pages with 11 figures +
+
+
+
+
+ + ☆ How is ChatGPT's behavior changing over time? + + +
+ GPT-3.5 and GPT-4 are the two most widely used large language model (LLM) +services. However, when and how these models are updated over time is opaque. +Here, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4 on +four diverse tasks: 1) solving math problems, 2) answering sensitive/dangerous +questions, 3) generating code, and 4) visual reasoning. We find that the +performance and behavior of both GPT-3.5 and GPT-4 can vary greatly over time. +For example, GPT-4 (March 2023) was very good at identifying prime numbers +(accuracy 97.6%) but GPT-4 (June 2023) was very poor on these same questions +(accuracy 2.4%). Interestingly, GPT-3.5 (June 2023) was much better than GPT-3.5 +(March 2023) on this task. GPT-4 was less willing to answer sensitive questions +in June than in March, and both GPT-4 and GPT-3.5 had more formatting mistakes +in code generation in June than in March. Overall, our findings show that the +behavior of the same LLM service can change substantially in a relatively short +amount of time, highlighting the need for continuous monitoring of LLM quality. + 
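A minimal sketch of the kind of longitudinal monitoring this motivates: score stored responses from two dated snapshots of the same service on an objectively checkable task (here, primality, using sympy for ground truth). The response dictionaries below are invented placeholders, not actual model outputs.

```python
# Sketch of a drift-monitoring harness: the same objective questions are scored
# against ground truth for two dated snapshots, so behavior drift appears as an
# accuracy gap. The "responses" below are placeholder logs, not real outputs.
from sympy import isprime

def accuracy(responses):
    correct = sum(
        answer.strip().lower().startswith("yes") == isprime(number)
        for number, answer in responses.items()
    )
    return correct / len(responses)

march_responses = {17077: "Yes", 17079: "No", 20011: "Yes"}   # placeholder logs
june_responses = {17077: "No", 17079: "No", 20011: "No"}      # placeholder logs

print("March accuracy:", accuracy(march_responses))
print("June accuracy:", accuracy(june_responses))
```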
+
+
+
+
+ + ☆ On the (In)Effectiveness of Large Language Models for Chinese Text + Correction + + +
+ Recently, the development and progress of Large Language Models (LLMs) have +amazed the entire Artificial Intelligence community. As an outstanding +representative of LLMs and the foundation model that set off this wave of +research on LLMs, ChatGPT has attracted more and more researchers to study its +capabilities and performance on various downstream Natural Language Processing +(NLP) tasks. While marveling at ChatGPT's incredible performance on all kinds of +tasks, we notice that ChatGPT also has excellent multilingual processing +capabilities, including in Chinese. To explore the Chinese processing ability of +ChatGPT, we focus on Chinese Text Correction, a fundamental and challenging +Chinese NLP task. Specifically, we evaluate ChatGPT on the Chinese Grammatical +Error Correction (CGEC) and Chinese Spelling Check (CSC) tasks, which are two +main Chinese Text Correction scenarios. From extensive analyses and comparisons +with previous state-of-the-art fine-tuned models, we empirically find that +ChatGPT currently exhibits both amazing performance and unsatisfactory behavior for +Chinese Text Correction. We believe our findings will promote the adoption and +application of LLMs in the Chinese NLP community. + 
+
+ comment: Work in progress! +
+
+
+
+
+ + ☆ Mitigating Label Bias via Decoupled Confident Learning ICML + + +
+ Growing concerns regarding algorithmic fairness have led to a surge in +methodologies to mitigate algorithmic bias. However, such methodologies largely +assume that observed labels in training data are correct. This is problematic +because bias in labels is pervasive across important domains, including +healthcare, hiring, and content moderation. In particular, human-generated +labels are prone to encoding societal biases. While the presence of labeling +bias has been discussed conceptually, there is a lack of methodologies to +address this problem. We propose a pruning method -- Decoupled Confident +Learning (DeCoLe) -- specifically designed to mitigate label bias. After +illustrating its performance on a synthetic dataset, we apply DeCoLe in the +context of hate speech detection, where label bias has been recognized as an +important challenge, and show that it successfully identifies biased labels and +outperforms competing approaches. + +
+
+ comment: AI & HCI Workshop at the 40th International Conference on Machine + Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ☆ NTK-approximating MLP Fusion for Efficient Language Model Fine-tuning ICML 2023 + + +
+ Fine-tuning a pre-trained language model (PLM) emerges as the predominant +strategy in many natural language processing applications. However, even +fine-tuning PLMs and running inference with them is expensive, especially on edge +devices with low computing power. Some general approaches (e.g. quantization +and distillation) have been widely studied to reduce the compute/memory of PLM +fine-tuning, while very few one-shot compression techniques have been explored. In +this paper, we investigate the neural tangent kernel (NTK)--which reveals the +gradient descent dynamics of neural networks--of the multilayer perceptron +(MLP) modules in a PLM and propose to derive a lightweight PLM through +NTK-approximating MLP fusion. To achieve this, we reconsider the MLP as a +bundle of sub-MLPs and cluster them into a given number of centroids, which +can then be restored as a compressed MLP that is, surprisingly, shown to +approximate the NTK of the original PLM well. Extensive experiments of PLM +fine-tuning on both natural language understanding (NLU) and generation (NLG) +tasks are provided to verify the effectiveness of the proposed MLP +fusion method. Our code is available at https://github.com/weitianxin/MLP_Fusion. + 
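A highly simplified sketch of the fusion idea on a plain two-layer MLP in NumPy: each hidden unit (its incoming weights, bias, and outgoing weights) is treated as a sub-MLP, units are clustered with k-means, and a smaller MLP is rebuilt from the clusters. The dimensions are illustrative and the paper's NTK-matching construction is not reproduced here.

```python
# Simplified sketch of "MLP fusion": cluster the hidden units of a 2-layer MLP
# and rebuild a smaller MLP from the cluster centroids (compression idea only).
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
d_in, d_hidden, d_out, k = 16, 64, 8, 8          # k = number of fused units

W1, b1 = rng.normal(size=(d_hidden, d_in)), rng.normal(size=d_hidden)
W2 = rng.normal(size=(d_out, d_hidden))

# Each hidden unit = (incoming weights, bias, outgoing weights)
units = np.concatenate([W1, b1[:, None], W2.T], axis=1)     # (d_hidden, d_in+1+d_out)
labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(units)

# Fuse: average incoming weights/bias per cluster, sum outgoing weights per cluster
W1_small = np.stack([W1[labels == c].mean(axis=0) for c in range(k)])
b1_small = np.array([b1[labels == c].mean() for c in range(k)])
W2_small = np.stack([W2[:, labels == c].sum(axis=1) for c in range(k)], axis=1)

def mlp(x, W1, b1, W2):
    return W2 @ np.tanh(W1 @ x + b1)

x = rng.normal(size=d_in)
print("original:", mlp(x, W1, b1, W2)[:3])
print("fused   :", mlp(x, W1_small, b1_small, W2_small)[:3])
```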
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ Teach model to answer questions after comprehending the document + + +
+ Multi-choice Machine Reading Comprehension (MRC) is a challenging extension +of Natural Language Processing (NLP) that requires the ability to comprehend +the semantics and logical relationships between entities in a given text. The +MRC task has traditionally been viewed as a process of answering questions +based on the given text. This single-stage approach has often led the network +to concentrate on generating the correct answer, potentially neglecting the +comprehension of the text itself. As a result, many prevalent models have faced +challenges in performing well on this task when dealing with longer texts. In +this paper, we propose a two-stage knowledge distillation method that teaches +the model to better comprehend the document by dividing the MRC task into two +separate stages. Our experimental results show that the student model, when +equipped with our method, achieves significant improvements, demonstrating the +effectiveness of our method. + +
+
+
+
+
+ + ☆ Federated Large Language Model: A Position Paper + + +
+ Large-scale language models (LLMs) have received significant attention and +found diverse applications across various domains, but their development +encounters challenges in real-world scenarios. These challenges arise from +the scarcity of public-domain data and the need to maintain +privacy with respect to private-domain data. To address these issues, federated +learning (FL) has emerged as a promising technology that enables collaborative +training of shared models while preserving decentralized data. We propose the +concept of federated LLM, which comprises three key components, i.e., federated +LLM pre-training, federated LLM fine-tuning, and federated LLM prompt +engineering. For each component, we discuss its advantage over traditional LLM +training methods and propose specific engineering strategies for +implementation. Furthermore, we explore the novel challenges introduced by the +integration of FL and LLM. We analyze existing solutions and identify potential +obstacles faced by these solutions within the context of federated LLM. + 
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Large Language Models Perform Diagnostic Reasoning ICLR 2023 + + +
+ We explore the extension of chain-of-thought (CoT) prompting to medical +reasoning for the task of automatic diagnosis. Motivated by doctors' underlying +reasoning process, we present Diagnostic-Reasoning CoT (DR-CoT). Empirical +results demonstrate that by simply prompting large language models trained only +on general text corpora with two DR-CoT exemplars, the diagnostic accuracy +improves by 15% compared to standard prompting. Moreover, the gap reaches a +pronounced 18% in out-of-domain settings. Our findings suggest expert-knowledge +reasoning in large language models can be elicited through proper prompting. + 
+
+ comment: Accepted as a Tiny Paper at ICLR 2023 (10 pages, 5 figures) +
+
+
+
+
+ + ☆ Can Model Fusing Help Transformers in Long Document Classification? An + Empirical Study + + +
+ Text classification is an area of research which has been studied for +years in Natural Language Processing (NLP). Adapting NLP to multiple domains +has introduced many new challenges for text classification, and one of them is +long document classification. While state-of-the-art transformer models provide +excellent results in text classification, most of them have limitations in the +maximum sequence length of the input sequence. The majority of the transformer +models are limited to 512 tokens, and therefore, they struggle with long +document classification problems. In this research, we explore employing +Model Fusing for long document classification while comparing the results with +the well-known BERT and Longformer architectures. + 
+
+ comment: Accepted in RANLP 2023 +
+
+
+
+
+ + ☆ Analyzing sports commentary in order to automatically recognize events + and extract insights + + +
+ In this paper, we carefully investigate how we can use multiple different +Natural Language Processing techniques and methods in order to automatically +recognize the main actions in sports events. We aim to extract insights by +analyzing live sport commentaries from different sources and by classifying +these major actions into different categories. We also study if sentiment +analysis could help detect these main actions. + +
+
+
+
+
+ + ☆ The Language Labyrinth: Constructive Critique on the Terminology Used in + the AI Discourse + + +
+ In the interdisciplinary field of artificial intelligence (AI), the problem of +clear terminology is especially momentous. This paper claims that AI debates +are still characterised by a lack of critical distance to metaphors like +'training', 'learning' or 'deciding'. As a consequence, reflections regarding +responsibility or potential use cases are greatly distorted. Yet, if relevant +decision-makers are convinced that AI can develop an 'understanding' or +properly 'interpret' issues, its regular use for sensitive tasks like deciding +about social benefits or judging court cases looms. The chapter argues its +claim by analysing central notions of the AI debate and contributes by +proposing more fitting terminology, thereby enabling more fruitful debates. +It is a conceptual work at the intersection of critical computer science and +philosophy of language. + 
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Mutual Reinforcement Effects in Japanese Sentence Classification and + Named Entity Recognition Tasks + + +
+ Information extraction (IE) is a crucial subfield within natural language +processing. However, for the traditionally segmented approach to sentence +classification and Named Entity Recognition, the intricate interactions between +these individual subtasks remain largely uninvestigated. In this study, we +propose an integrative analysis, converging sentence classification with Named +Entity Recognition, with the objective of unveiling and comprehending the mutual +reinforcement effect within these two information extraction subtasks. To +achieve this, we introduce a Sentence Classification and Named Entity +Recognition Multi-task (SCNM) approach that combines Sentence Classification +(SC) and Named Entity Recognition (NER). We develop a Sentence-to-Label +Generation (SLG) framework for SCNM and construct a Wikipedia dataset +containing both SC and NER. Using a format converter, we unify input formats +and employ a generative model to generate SC-labels, NER-labels, and associated +text segments. We propose a Constraint Mechanism (CM) to improve generated +format accuracy. Our results show SC accuracy increased by 1.13 points and NER +by 1.06 points in SCNM compared to standalone tasks, with CM raising format +accuracy from 63.61 to 100. The findings indicate mutual reinforcement effects +between SC and NER, and that integration enhances both tasks' performance. We +additionally implemented the SLG framework on a single SC task. It yielded +superior accuracies compared to the baseline on two distinct Japanese SC +datasets. Notably, in few-shot learning experiments, the SLG framework shows +much better performance than the fine-tuning method. These empirical findings +contribute additional evidence to affirm the efficacy of the SLG framework. + 
+
+ comment: 25 pages, 12 figures, 19 tables. arXiv admin note: substantial text + overlap with arXiv:2306.15978 +
+
+
+
+
+ + ☆ Zero-shot Domain-sensitive Speech Recognition with Prompt-conditioning + Fine-tuning + + +
+ In this work, we propose a method to create domain-sensitive speech +recognition models that utilize textual domain information by conditioning their +generation on a given text prompt. This is accomplished by fine-tuning a +pre-trained, end-to-end model (Whisper) to learn from demonstrations with +prompt examples. We show that this ability can be generalized to different +domains and even various prompt contexts, with our model gaining a Word Error +Rate (WER) reduction of up to 33% on unseen datasets from various domains, such +as medical conversation, air traffic control communication, and financial +meetings. Considering the limited availability of audio-transcript pair data, +we further extend our method to text-only fine-tuning to achieve domain +sensitivity as well as domain adaptation. We demonstrate that our text-only +fine-tuned model can also attend to various prompt contexts, with the model +reaching its largest WER reduction of 29% on the medical conversation dataset. + 
+
+ comment: F-T Liao and Y-C Chan contributed equally +
+
+
+
+
+ + ♻ ☆ SparseOptimizer: Sparsify Language Models through Moreau-Yosida + Regularization and Accelerate via Compiler Co-design + + +
+ This paper introduces SparseOptimizer, a novel deep learning optimizer that +exploits Moreau-Yosida regularization to naturally induce sparsity in large +language models such as BERT, ALBERT, and GPT. Key to the design of +SparseOptimizer is an embedded shrinkage operator, which imparts sparsity +directly within the optimization process. This operator, backed by a sound +theoretical framework, includes an analytical solution, thereby reinforcing the +optimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play +functionality eradicates the need for code modifications, making it a +universally adaptable tool for a wide array of large language models. Empirical +evaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2 +confirm that SparseBERT and SparseALBERT, when sparsified using +SparseOptimizer, achieve performance comparable to their dense counterparts, +BERT and ALBERT, while significantly reducing their parameter count. Further, +this work proposes an innovative optimizer-compiler co-design strategy, +demonstrating the potential of inference acceleration (\textbf{3.37x}, +\textbf{6.30x}, and \textbf{7.15x} in comparison with PyTorch, TensorFlow, and +generic LLVM compilation, respectively) in SparseBERT when paired with an +appropriately designed compiler. This study represents a significant step +forward in the evolution of efficient, scalable, and high-performing large +language models, setting a precedent for future exploration and optimization in +this domain. The SparseOptimizer code and SparseALBERT model will be publicly +available upon paper acceptance. + 
+
+
+
+
+ + ♻ ☆ Execution-based Code Generation using Deep Reinforcement Learning + + +
+ The utilization of programming language (PL) models, pre-trained on +large-scale code corpora, as a means of automating software engineering +processes has demonstrated considerable potential in streamlining various code +generation tasks such as code completion, code translation, and program +synthesis. However, current approaches mainly rely on supervised fine-tuning +objectives borrowed from text generation, neglecting unique sequence-level +characteristics of code, including but not limited to compilability as well as +syntactic and functional correctness. To address this limitation, we propose +PPOCoder, a new framework for code generation that synergistically combines +pre-trained PL models with Proximal Policy Optimization (PPO) which is a widely +used deep reinforcement learning technique. By utilizing non-differentiable +feedback from code execution and structure alignment, PPOCoder seamlessly +integrates external code-specific knowledge into the model optimization +process. It's important to note that PPOCoder is a task-agnostic and +model-agnostic framework that can be used across different code generation +tasks and PLs. Extensive experiments on three code generation tasks demonstrate +the effectiveness of our proposed approach compared to SOTA methods, achieving +significant improvements in compilation success rates and functional +correctness across different PLs. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR), 2023 +
+
+
+
+
+ + ♻ ☆ On the Interpretability and Significance of Bias Metrics in Texts: a + PMI-based Approach ACL 2023 + + +
+ In recent years, word embeddings have been widely used to measure biases in +texts. Even if they have proven to be effective in detecting a wide variety of +biases, metrics based on word embeddings lack transparency and +interpretability. We analyze an alternative PMI-based metric to quantify biases +in texts. It can be expressed as a function of conditional probabilities, which +provides a simple interpretation in terms of word co-occurrences. We also prove +that it can be approximated by an odds ratio, which allows estimating +confidence intervals and statistical significance of textual biases. This +approach produces similar results to metrics based on word embeddings when +capturing gender gaps of the real world embedded in large corpora. + +
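A small sketch of the odds-ratio view of such a co-occurrence-based bias score, with a normal-approximation 95% confidence interval on the log odds ratio; the counts below are placeholders, and the exact PMI-based metric in the paper may differ in detail.

```python
# Sketch: a co-occurrence-based bias score for a context word, expressed as an
# odds ratio between two target groups, with an approximate 95% CI.
import math

def odds_ratio_bias(c_ctx_a, c_notctx_a, c_ctx_b, c_notctx_b):
    """Odds that the context word co-occurs with group A vs. group B."""
    # add-0.5 (Haldane) correction keeps the estimate finite with zero cells
    a, b, c, d = (c_ctx_a + 0.5, c_notctx_a + 0.5, c_ctx_b + 0.5, c_notctx_b + 0.5)
    log_or = math.log((a * d) / (b * c))
    se = math.sqrt(1 / a + 1 / b + 1 / c + 1 / d)    # SE of the log odds ratio
    ci = (math.exp(log_or - 1.96 * se), math.exp(log_or + 1.96 * se))
    return math.exp(log_or), ci

# e.g. placeholder counts of a context word co-occurring (or not) with
# group-A vs. group-B target words in a corpus
print(odds_ratio_bias(c_ctx_a=120, c_notctx_a=880, c_ctx_b=60, c_notctx_b=940))
```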
+
+ comment: Camera Ready for ACL 2023 (main conference) +
+
+
+
+
+ + ♻ ☆ Evaluating Open-QA Evaluation + + +
+ This study focuses on the evaluation of the Open Question Answering (Open-QA) +task, which can directly estimate the factuality of large language models +(LLMs). Current automatic evaluation methods have shown limitations, indicating +that human evaluation still remains the most reliable approach. We introduce a +new task, Evaluating QA Evaluation (QA-Eval), and the corresponding dataset +EVOUNA, designed to assess the accuracy of AI-generated answers in relation to +standard answers within Open-QA. Our evaluation of these methods utilizes +human-annotated results to measure their performance. Specifically, the work +investigates methods that show high correlation with human evaluations, deeming +them more reliable. We also discuss the pitfalls of current methods and ways +to improve LLM-based evaluators. We believe this new QA-Eval task and +corresponding dataset EVOUNA will facilitate the development of more effective +automatic evaluation tools and prove valuable for future research in this area. +All resources are available at \url{https://github.com/wangcunxiang/QA-Eval} +and are released under the Apache-2.0 License. + 
+
+
+
+
+ + ♻ ☆ Synthetic Text Generation with Differential Privacy: A Simple and + Practical Recipe ACL 2023 + + +
+ Privacy concerns have attracted increasing attention in data-driven products +due to the tendency of machine learning models to memorize sensitive training +data. Generating synthetic versions of such data with a formal privacy +guarantee, such as differential privacy (DP), provides a promising path to +mitigating these privacy concerns, but previous approaches in this direction +have typically failed to produce synthetic data of high quality. In this work, +we show that a simple and practical recipe in the text domain is effective: +simply fine-tuning a pretrained generative language model with DP enables the +model to generate useful synthetic text with strong privacy protection. Through +extensive empirical analyses on both benchmark and private customer data, we +demonstrate that our method produces synthetic text that is competitive in +terms of utility with its non-private counterpart, meanwhile providing strong +protection against potential privacy leakages. + +
+
+ comment: ACL 2023 Main Conference (Honorable Mention) +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-3.5 and GPT-4 on Grammatical Error Correction for + Brazilian Portuguese ICML 2023 + + +
+ We investigate the effectiveness of GPT-3.5 and GPT-4, two large language +models, as Grammatical Error Correction (GEC) tools for Brazilian Portuguese +and compare their performance against Microsoft Word and Google Docs. We +introduce a GEC dataset for Brazilian Portuguese with four categories: Grammar, +Spelling, Internet, and Fast typing. Our results show that while GPT-4 has +higher recall than other methods, LLMs tend to have lower precision, leading to +overcorrection. This study demonstrates the potential of LLMs as practical GEC +tools for Brazilian Portuguese and encourages further exploration of LLMs for +non-English languages and other educational settings. + +
+
+ comment: Download the full source to access the dataset. Accepted to LatinX in + AI (LXAI) Research at ICML 2023 +
+
+
+
+
+ + ♻ ☆ A Human Word Association based model for topic detection in social + networks + + +
+ With the widespread use of social networks, detecting the topics discussed in +these networks has become a significant challenge. Current works are mainly +based on frequent pattern mining or semantic relations, and language +structure is not considered. Language-structural methods aim to +discover the relationships between words and how humans understand them. +Therefore, this paper uses the concept of imitating the mental ability +of word association to propose a topic detection framework for social networks. +This framework is based on the Human Word Association method. A special +extraction algorithm has also been designed for this purpose. The performance +of this method is evaluated on the FA-CUP dataset, a benchmark dataset in +the field of topic detection. The results show that the proposed method is a +clear improvement over other methods in terms of Topic-recall and the +keyword F1 measure. Also, most previous works in the field of topic +detection are limited to the English language, while Persian, +especially microblogs written in this language, is considered a low-resource +language. Therefore, a dataset of Telegram posts in the Farsi language has +been collected. Applying the proposed method to this dataset also shows that +this method works better than other topic detection methods. + 
+
+
+
+
+ + ♻ ☆ Persian topic detection based on Human Word association and graph + embedding + + +
+ In this paper, we propose a framework to detect topics in social media based +on Human Word Association. Identifying topics discussed in these media has +become a critical and significant challenge. Most of the work done in this area +is in English, but little has been done for the Persian language, especially for +microblogs written in Persian. Also, existing works have focused more on +exploring frequent patterns or semantic relationships and have ignored the +structural methods of language. In this paper, a topic detection framework +using HWA, a method for Human Word Association, is proposed. This method uses +the concept of imitating the mental ability of word association. It +also calculates the Associative Gravity Force, which shows how words are related. +Using this parameter, a graph can be generated, and topics can be extracted by +embedding this graph and applying clustering methods. This approach has been +applied to a Persian-language dataset collected from Telegram. Several +experimental studies have been performed to evaluate the proposed framework's +performance. Experimental results show that this approach works better than +other topic detection methods. + 
+
+
+
+
+ + ♻ ☆ Communicative Agents for Software Development + + +
+ Software engineering is a domain characterized by intricate decision-making +processes, often relying on nuanced intuition and consultation. Recent +advancements in deep learning have started to revolutionize software +engineering practices through elaborate designs implemented at various stages +of software development. In this paper, we present an innovative paradigm that +leverages large language models (LLMs) throughout the entire software +development process, streamlining and unifying key processes through natural +language communication, thereby eliminating the need for specialized models at +each phase. At the core of this paradigm lies ChatDev, a virtual chat-powered +software development company that mirrors the established waterfall model, +meticulously dividing the development process into four distinct chronological +stages: designing, coding, testing, and documenting. Each stage engages a team +of agents, such as programmers, code reviewers, and test engineers, fostering +collaborative dialogue and facilitating a seamless workflow. The chat chain +acts as a facilitator, breaking down each stage into atomic subtasks. This +enables dual roles, allowing for proposing and validating solutions through +context-aware communication, leading to efficient resolution of specific +subtasks. The instrumental analysis of ChatDev highlights its remarkable +efficacy in software generation, enabling the completion of the entire software +development process in under seven minutes at a cost of less than one dollar. +It not only identifies and alleviates potential vulnerabilities but also +rectifies potential hallucinations while maintaining commendable efficiency and +cost-effectiveness. The potential of ChatDev unveils fresh possibilities for +integrating LLMs into the realm of software development. + +
+
+ comment: 25 pages, 9 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Secrets of RLHF in Large Language Models Part I: PPO + + +
+ Large language models (LLMs) have formulated a blueprint for the advancement +of artificial general intelligence. Their primary objective is to function as a +human-centric (helpful, honest, and harmless) assistant. Alignment with humans +assumes paramount significance, and reinforcement learning with human feedback +(RLHF) emerges as the pivotal technological paradigm underpinning this pursuit. +Current technical routes usually include \textbf{reward models} to measure +human preferences, \textbf{Proximal Policy Optimization} (PPO) to optimize +policy model outputs, and \textbf{process supervision} to improve step-by-step +reasoning capabilities. However, due to the challenges of reward design, +environment interaction, and agent training, coupled with the huge trial-and-error +cost of large language models, there is a significant barrier preventing AI +researchers from advancing technical alignment and the safe deployment +of LLMs. The stable training of RLHF remains a puzzle. In this first +report, we dissect the framework of RLHF, re-evaluate the inner workings of +PPO, and explore how the parts comprising PPO algorithms impact policy agent +training. We identify policy constraints as the key factor for the effective +implementation of the PPO algorithm. Therefore, we explore PPO-max, an +advanced version of the PPO algorithm, to efficiently improve the training +stability of the policy model. Based on our main results, we perform a +comprehensive analysis of RLHF abilities compared with SFT models and ChatGPT. +The absence of open-source implementations has posed significant challenges to +the investigation of LLM alignment. Therefore, we are eager to release +technical reports, reward models, and PPO code, aiming to make modest +contributions to the advancement of LLMs. + 
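For reference, the policy constraint at the heart of PPO is the clipped surrogate objective, in which the probability ratio between the new and old policies is clipped so a single update cannot move the policy too far from the model that generated the samples. A minimal PyTorch sketch follows; the additional stabilizations of PPO-max described in the report are not shown.

```python
# Sketch of the PPO clipped surrogate loss (the basic "policy constraint").
import torch

def ppo_clip_loss(logp_new: torch.Tensor,
                  logp_old: torch.Tensor,
                  advantages: torch.Tensor,
                  clip_eps: float = 0.2) -> torch.Tensor:
    ratio = torch.exp(logp_new - logp_old)               # pi_new / pi_old per sample
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()         # negated: we minimize

logp_old = torch.tensor([-1.0, -2.0, -0.5])
logp_new = torch.tensor([-0.8, -2.5, -0.4], requires_grad=True)
adv = torch.tensor([1.0, -0.5, 2.0])
loss = ppo_clip_loss(logp_new, logp_old, adv)
loss.backward()
print(loss.item())
```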
+
+
+
+
+ + ♻ ☆ A Survey on Evaluation of Large Language Models + + +
+ Large language models (LLMs) are gaining increasing popularity in both +academia and industry, owing to their unprecedented performance in various +applications. As LLMs continue to play a vital role in both research and daily +use, their evaluation becomes increasingly critical, not only at the task +level, but also at the society level for a better understanding of their +potential risks. Over the past years, significant efforts have been made to +examine LLMs from various perspectives. This paper presents a comprehensive +review of these evaluation methods for LLMs, focusing on three key dimensions: +what to evaluate, where to evaluate, and how to evaluate. Firstly, we provide +an overview from the perspective of evaluation tasks, encompassing general +natural language processing tasks, reasoning, medical usage, ethics, +education, natural and social sciences, agent applications, and other areas. +Secondly, we answer the `where' and `how' questions by diving into the +evaluation methods and benchmarks, which serve as crucial components in +assessing the performance of LLMs. Then, we summarize the success and failure cases +of LLMs in different tasks. Finally, we shed light on several future challenges +that lie ahead in LLM evaluation. Our aim is to offer invaluable insights to +researchers in the realm of LLM evaluation, thereby aiding the development of +more proficient LLMs. Our key point is that evaluation should be treated as an +essential discipline to better assist the development of LLMs. We consistently +maintain the related open-source materials at: +https://github.com/MLGroupJLU/LLM-eval-survey. + 
+
+ comment: 25 pages; more work is at: https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ InitialGAN: A Language GAN with Completely Random Initialization + + +
+ Text generative models trained via Maximum Likelihood Estimation (MLE) suffer +from the notorious exposure bias problem, and Generative Adversarial Networks +(GANs) are shown to have potential to tackle this problem. Existing language +GANs adopt estimators like REINFORCE or continuous relaxations to model word +probabilities. The inherent limitations of such estimators lead current models +to rely on pre-training techniques (MLE pre-training or pre-trained +embeddings). Representation modeling methods which are free from those +limitations, however, are seldom explored because of their poor performance +in previous attempts. Our analyses reveal that invalid sampling methods and +unhealthy gradients are the main contributors to such unsatisfactory +performance. In this work, we present two techniques to tackle these problems: +dropout sampling and fully normalized LSTM. Based on these two techniques, we +propose InitialGAN, whose parameters are randomly initialized in full. Besides, +we introduce a new evaluation metric, Least Coverage Rate, to better evaluate +the quality of generated samples. The experimental results demonstrate that +InitialGAN outperforms both MLE and other compared models. To the best of our +knowledge, this is the first time a language GAN outperforms MLE without using +any pre-training techniques. + 
+
+
+
+
+ + ♻ ☆ GIFT: Graph-Induced Fine-Tuning for Multi-Party Conversation + Understanding ACL 2023 + + +
+ Addressing the issues of who says what to whom in multi-party conversations +(MPCs) has recently attracted a lot of research attention. However, existing +methods for MPC understanding typically embed interlocutors and utterances into +sequential information flows, or make only superficial use of the inherent graph +structures in MPCs. To this end, we present a plug-and-play and lightweight +method named graph-induced fine-tuning (GIFT) which can adapt various +Transformer-based pre-trained language models (PLMs) for universal MPC +understanding. In detail, the full and equivalent connections among utterances +in a regular Transformer ignore the sparse but distinctive dependencies between +utterances in MPCs. To distinguish different relationships between +utterances, four types of edges are designed to integrate graph-induced signals +into attention mechanisms to refine PLMs originally designed for processing +sequential texts. We evaluate GIFT by implementing it into three PLMs, and test +the performance on three downstream tasks including addressee recognition, +speaker identification, and response selection. Experimental results show that +GIFT can significantly improve the performance of three PLMs on three +downstream tasks and two benchmarks with only 4 additional parameters per +encoding layer, achieving new state-of-the-art performance on MPC +understanding. + 
+
+ comment: Accepted by ACL 2023. arXiv admin note: substantial text overlap with + arXiv:2106.01541 +
+
+
+
+
+ + ♻ ☆ Jointly Extracting Interventions, Outcomes, and Findings from RCT + Reports with LLMs + + +
+ Results from Randomized Controlled Trials (RCTs) establish the comparative +effectiveness of interventions, and are in turn critical inputs for +evidence-based care. However, results from RCTs are presented in (often +unstructured) natural language articles describing the design, execution, and +outcomes of trials; clinicians must manually extract findings pertaining to +interventions and outcomes of interest from such articles. This onerous manual +process has motivated work on (semi-)automating extraction of structured +evidence from trial reports. In this work we propose and evaluate a +text-to-text model built on instruction-tuned Large Language Models (LLMs) to +jointly extract Interventions, Outcomes, and Comparators (ICO elements) from +clinical abstracts, and infer the associated results reported. Manual (expert) +and automated evaluations indicate that framing evidence extraction as a +conditional generation task and fine-tuning LLMs for this purpose realizes +considerable ($\sim$20 point absolute F1 score) gains over the previous SOTA. +We perform ablations and error analyses to assess aspects that contribute to +model performance, and to highlight potential directions for further +improvements. We apply our model to a collection of published RCTs through +mid-2022, and release a searchable database of structured findings: +http://ico-relations.ebm-nlp.com + +
+
+ comment: Accepted to MLHC 2023 +
+
+
+
+
+ + ♻ ☆ ForecastTKGQuestions: A Benchmark for Temporal Question Answering and + Forecasting over Temporal Knowledge Graphs ISWC 2023 + + +
+ Question answering over temporal knowledge graphs (TKGQA) has recently found +increasing interest. TKGQA requires temporal reasoning techniques to extract +the relevant information from temporal knowledge bases. The only existing TKGQA +dataset, i.e., CronQuestions, consists of temporal questions based on the facts +from a fixed time period, where a temporal knowledge graph (TKG) spanning the +same period can be fully used for answer inference, allowing the TKGQA models +to use even the future knowledge to answer the questions based on the past +facts. In real-world scenarios, however, it is also common that given the +knowledge until now, we wish the TKGQA systems to answer the questions asking +about the future. As humans constantly seek plans for the future, building +TKGQA systems for answering such forecasting questions is important. +Nevertheless, this has still been unexplored in previous research. In this +paper, we propose a novel task: forecasting question answering over temporal +knowledge graphs. We also propose a large-scale TKGQA benchmark dataset, i.e., +ForecastTKGQuestions, for this task. It includes three types of questions, +i.e., entity prediction, yes-no, and fact reasoning questions. For every +forecasting question in our dataset, QA models can only have access to the TKG +information before the timestamp annotated in the given question for answer +inference. We find that the state-of-the-art TKGQA methods perform poorly on +forecasting questions, and they are unable to answer yes-no questions and fact +reasoning questions. To this end, we propose ForecastTKGQA, a TKGQA model that +employs a TKG forecasting module for future inference, to answer all three +types of questions. Experimental results show that ForecastTKGQA outperforms +recent TKGQA methods on the entity prediction questions, and it also shows +great effectiveness in answering the other two types of questions. + +
+
+ comment: Accepted to ISWC 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 127 + +
+
+
+ + ☆ AnyDoor: Zero-shot Object-level Image Customization + + +
+ This work presents AnyDoor, a diffusion-based image generator with the power +to teleport target objects to new scenes at user-specified locations in a +harmonious way. Instead of tuning parameters for each object, our model is +trained only once and effortlessly generalizes to diverse object-scene +combinations at the inference stage. Such a challenging zero-shot setting +requires an adequate characterization of a certain object. To this end, we +complement the commonly used identity feature with detail features, which are +carefully designed to maintain texture details yet allow versatile local +variations (e.g., lighting, orientation, posture, etc.), supporting the object +in favorably blending with different surroundings. We further propose to borrow +knowledge from video datasets, where we can observe various forms (i.e., along +the time axis) of a single object, leading to stronger model generalizability +and robustness. Extensive experiments demonstrate the superiority of our +approach over existing alternatives as well as its great potential in +real-world applications, such as virtual try-on and object moving. Project page +is https://damo-vilab.github.io/AnyDoor-Page/. + +
+
+
+
+
+ + ☆ FACTS: Facial Animation Creation using the Transfer of Styles + + +
+ The ability to accurately capture and express emotions is a critical aspect +of creating believable characters in video games and other forms of +entertainment. Traditionally, this animation has been achieved with artistic +effort or performance capture, both requiring costs in time and labor. More +recently, audio-driven models have seen success, however, these often lack +expressiveness in areas not correlated to the audio signal. In this paper, we +present a novel approach to facial animation by taking existing animations and +allowing for the modification of style characteristics. Specifically, we +explore the use of a StarGAN to enable the conversion of 3D facial animations +into different emotions and person-specific styles. We are able to maintain the +lip-sync of the animations with this method thanks to the use of a novel +viseme-preserving loss. + +
+
+
+
+
+ + ☆ ChatSpot: Bootstrapping Multimodal LLMs via Precise Referring + Instruction Tuning + + +
+ Human-AI interactivity is a critical aspect that reflects the usability of +multimodal large language models (MLLMs). However, existing end-to-end MLLMs +only allow users to interact with them through language instructions, limiting +interactive accuracy and efficiency. In this study, we +present precise referring instructions that utilize diverse reference +representations such as points and boxes as referring prompts to refer to +specific regions. This enables MLLMs to focus on the region of interest and +achieve finer-grained interaction. Based on precise referring instructions, we +propose ChatSpot, a unified end-to-end multimodal large language model that +supports diverse forms of interactivity including mouse clicks, drag-and-drop, +and drawing boxes, which provides a more flexible and seamless interactive +experience. We also construct a multi-grained vision-language +instruction-following dataset based on existing datasets and GPT-4 generation. +Furthermore, we design a series of evaluation tasks to assess the effectiveness +of region recognition and interaction. Experimental results showcase ChatSpot's +promising performance. + 
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ☆ GroupLane: End-to-End 3D Lane Detection with Channel-wise Grouping + + +
+ Efficiency is quite important for 3D lane detection due to practical +deployment demands. In this work, we propose a simple, fast, and end-to-end +detector that still maintains high detection precision. Specifically, we devise +a set of fully convolutional heads based on row-wise classification. In +contrast to previous counterparts, ours supports recognizing both vertical and +horizontal lanes. Besides, our method is the first one to perform row-wise +classification in bird's-eye view. In the heads, we split the features into multiple +groups, and every feature group corresponds to a lane instance. During +training, the predictions are associated with lane labels using the proposed +single-win one-to-one matching to compute loss, and no post-processing +operation is required for inference. In this way, our proposed fully +convolutional detector, GroupLane, realizes end-to-end detection like DETR. +Evaluated on three real-world 3D lane benchmarks, OpenLane, Once-3DLanes, and +OpenLane-Huawei, GroupLane adopting ConvNext-Base as the backbone outperforms +the published state-of-the-art PersFormer by 13.6% F1 score on the OpenLane +validation set. Besides, GroupLane with ResNet18 still surpasses PersFormer by +4.9% F1 score, while its inference speed is nearly 7x faster and its FLOPs are +only 13.3% of PersFormer's. + 
+
+
+
+
+ + ☆ Occlusion Aware Student Emotion Recognition based on Facial Action Unit + Detection + + +
+ Given that approximately half of science, technology, engineering, and +mathematics (STEM) undergraduate students in U.S. colleges and universities +leave by the end of the first year [15], it is crucial to improve the quality +of classroom environments. This study focuses on monitoring students' emotions +in the classroom as an indicator of their engagement and proposes an approach +to address this issue. The impact of different facial parts on the performance +of an emotional recognition model is evaluated through experimentation. To test +the proposed model under partial occlusion, an artificially occluded dataset is +introduced. The novelty of this work lies in the proposal of an occlusion-aware +architecture for facial action units (AUs) extraction, which employs attention +mechanism and adaptive feature learning. The AUs can be used later to classify +facial expressions in classroom settings. + This research paper's findings provide valuable insights into handling +occlusion in analyzing facial images for emotional engagement analysis. The +proposed experiments demonstrate the significance of considering occlusion and +enhancing the reliability of facial analysis models in classroom environments. +These findings can also be extended to other settings where occlusions are +prevalent. + +
+
+
+
+
+ + ☆ A comparative analysis of SR-GAN models + + +
+ In this study, we evaluate the performance of multiple state-of-the-art SR +GAN (Super Resolution Generative Adversarial Network) models, ESRGAN, +Real-ESRGAN and EDSR, on a benchmark dataset of real-world images which undergo +degradation using a pipeline. Our results show that some models seem to +significantly increase the resolution of the input images while preserving +their visual quality; this is assessed using the Tesseract OCR engine. We observe +that the EDSR-BASE model from Hugging Face outperforms the remaining candidate +models in terms of both quantitative metrics and subjective visual quality +assessments, with the least compute overhead. Specifically, EDSR generates images +with higher peak signal-to-noise ratio (PSNR) and structural similarity index +(SSIM) values and is seen to return high-quality OCR results with the Tesseract +OCR engine. These findings suggest that EDSR is a robust and effective approach +for single-image super-resolution and may be particularly well-suited for +applications where high visual fidelity is critical and compute must be +optimized. + 
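A hedged sketch of one step of such an evaluation loop, assuming scikit-image, pytesseract (with a local Tesseract install), and placeholder file paths: PSNR and SSIM compare the restored image against a reference, and OCR output length gives a rough readability signal.

```python
# Sketch of the evaluation step: compare a super-resolved image to a reference
# with PSNR/SSIM and check OCR readability with Tesseract. Paths are placeholders.
from skimage import io
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import pytesseract

reference = io.imread("reference.png")        # ground-truth high-resolution image
restored = io.imread("sr_output.png")         # output of an SR model (same size)

psnr = peak_signal_noise_ratio(reference, restored)
ssim = structural_similarity(reference, restored, channel_axis=-1)  # color images
ocr_text = pytesseract.image_to_string(restored)

print(f"PSNR: {psnr:.2f} dB  SSIM: {ssim:.4f}  OCR chars: {len(ocr_text)}")
```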
+
+ comment: 9 pages, 6 tables, 2 figures +
+
+
+
+
+ + ☆ Unsupervised Conditional Slot Attention for Object Centric Learning + + +
+  Extracting object-level representations for downstream reasoning tasks is an emerging
+area in AI. Learning object-centric representations in an unsupervised setting presents
+multiple challenges, a key one being binding an arbitrary number of object instances to
+a specialized object slot. Recent object-centric representation methods like Slot
+Attention utilize iterative attention to learn composable representations with dynamic
+inference-level binding but fail to achieve specialized slot-level binding. To address
+this, in this paper we propose Unsupervised Conditional Slot Attention using a novel
+Probabilistic Slot Dictionary (PSD). We define the PSD with (i) abstract object-level
+property vectors as keys and (ii) parametric Gaussian distributions as the
+corresponding values. We demonstrate the benefits of the learnt specific object-level
+conditioning distributions in multiple downstream tasks, namely object discovery,
+compositional scene generation, and compositional visual reasoning. We show that our
+method provides scene composition capabilities and a significant boost on few-shot
+adaptability tasks for compositional visual reasoning, while performing similarly to or
+better than Slot Attention on object discovery tasks.
+
+
+
+
+
+ + ☆ Measuring Student Behavioral Engagement using Histogram of Actions + + +
+  In this paper, we propose a novel technique for measuring behavioral engagement
+through students' action recognition. The proposed approach recognizes student actions
+and then predicts the student's behavioral engagement level. For student action
+recognition, we use human skeletons to model student postures and upper body movements.
+To learn the dynamics of the student's upper body, a 3D-CNN model is used. The trained
+3D-CNN model is used to recognize actions within every 2-minute video segment; these
+actions are then used to build a histogram of actions which encodes the student actions
+and their frequencies. This histogram is utilized as input to an SVM classifier to
+classify whether the student is engaged or disengaged. To evaluate the proposed
+framework, we build a dataset consisting of 1414 2-minute video segments annotated with
+13 actions and 112 video segments annotated with two engagement levels. Experimental
+results indicate that student actions can be recognized with a top-1 accuracy of 83.63%
+and that the proposed framework can capture the average engagement of the class.
+
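+  The histogram-plus-SVM stage maps directly onto standard tooling. A minimal sketch
+(illustrative only; the action ids and labels below are made up, and whether the
+histogram is normalised is an assumption here):
+
+import numpy as np
+from sklearn.svm import SVC
+
+NUM_ACTIONS = 13  # the dataset described above annotates 13 actions
+
+def histogram_of_actions(action_ids, num_actions=NUM_ACTIONS):
+    """Encode the actions recognised in one video segment as a normalised frequency histogram."""
+    hist = np.bincount(np.asarray(action_ids), minlength=num_actions).astype(float)
+    return hist / max(hist.sum(), 1.0)
+
+# Toy example: two segments with made-up per-clip action ids and engagement labels.
+segments = [[0, 0, 3, 5, 3], [7, 7, 7, 12, 1]]
+labels = [1, 0]  # 1 = engaged, 0 = disengaged
+X = np.stack([histogram_of_actions(s) for s in segments])
+clf = SVC(kernel="rbf").fit(X, np.array(labels))
+print(clf.predict(X))
+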
+
+
+
+
+ + ☆ Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation + Evaluation + + +
+  Research in image generation has recently made significant progress, particularly
+boosted by the introduction of vision-language models which are able to produce
+high-quality visual content based on textual inputs. Despite ongoing advancements in
+generation quality and realism, no methodical frameworks have yet been defined to
+quantitatively measure the quality of the generated content and its adherence to the
+prompted requests: so far, only human-based evaluations have been adopted for assessing
+quality and for comparing different generative methods. We introduce a novel automated
+method for Visual Concept Evaluation (ViCE), i.e., to assess consistency between a
+generated/edited image and the corresponding prompt/instructions, with a process
+inspired by human cognitive behavior. ViCE combines the strengths of Large Language
+Models (LLMs) and Visual Question Answering (VQA) into a unified pipeline, aiming to
+replicate the human cognitive process in quality assessment. This method outlines
+visual concepts, formulates image-specific verification questions, uses the Q&A system
+to investigate the image, and scores the combined outcome. Although this new hypothesis
+of mimicking humans in the image evaluation process is at a preliminary assessment
+stage, the results are promising and open the door to a new form of automatic
+evaluation which could have significant impact as image generation and targeted image
+editing tasks become more and more sophisticated.
+
+
+
+
+
+ + ☆ Plug the Leaks: Advancing Audio-driven Talking Face Generation by + Preventing Unintended Information Flow ICCV 2023 + + +
+  Audio-driven talking face generation is the task of creating a lip-synchronized,
+realistic face video from given audio and reference frames. This involves two major
+challenges: the overall visual quality of the generated images on the one hand, and
+audio-visual synchronization of the mouth region on the other. In this paper, we start
+by identifying several problematic aspects of synchronization methods in recent
+audio-driven talking face generation approaches. Specifically, this involves unintended
+flow of lip and pose information from the reference to the generated image, as well as
+instabilities during model training. Subsequently, we propose various techniques for
+obviating these issues: First, a silent-lip reference image generator prevents leaking
+of lips from the reference to the generated image. Second, an adaptive triplet loss
+handles the pose leaking problem. Finally, we propose a stabilized formulation of the
+synchronization loss, circumventing the aforementioned training instabilities while
+further alleviating the lip leaking issue. Combining the individual improvements, we
+present state-of-the-art performance on LRS2 and LRW in both synchronization and visual
+quality. We further validate our design in various ablation experiments, confirming the
+individual contributions as well as their complementary effects.
+
+
+ comment: Submitted to ICCV 2023 +
+
+
+
+
+ + ☆ An Evaluation of Zero-Cost Proxies -- from Neural Architecture + Performance to Model Robustness + + +
+  Zero-cost proxies are nowadays frequently studied and used to search for neural
+architectures. They show an impressive ability to predict the performance of
+architectures by making use of their untrained weights. These techniques allow for
+immense search speed-ups. So far, the joint search for well-performing and robust
+architectures has received much less attention in the field of NAS. Consequently, the
+main focus of zero-cost proxies has been the clean accuracy of architectures, whereas
+model robustness should play an equally important part. In this paper, we analyze the
+ability of common zero-cost proxies to serve as performance predictors for robustness
+in the popular NAS-Bench-201 search space. We are interested in the single prediction
+task for robustness and in the joint multi-objective of clean and robust accuracy. We
+further analyze the feature importance of the proxies and show that predicting
+robustness makes the prediction task of existing zero-cost proxies more challenging. As
+a result, the joint consideration of several proxies becomes necessary to predict a
+model's robustness, while the clean accuracy can be regressed from a single such
+feature.
+
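+  As a generic illustration of this kind of prediction task (not the paper's
+experimental code; the proxy values and accuracies below are synthetic), robustness can
+be regressed from a per-architecture vector of zero-cost proxy values and compared
+against the clean-accuracy case:
+
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import cross_val_score
+
+rng = np.random.default_rng(0)
+proxies = rng.normal(size=(500, 6))   # per-architecture zero-cost proxy values (synthetic)
+clean_acc = 2.0 * proxies[:, 0] + rng.normal(scale=0.5, size=500)
+robust_acc = proxies[:, :3].sum(axis=1) + rng.normal(scale=1.0, size=500)
+
+for name, target in [("clean", clean_acc), ("robust", robust_acc)]:
+    # Mean cross-validated R^2 of a regressor predicting accuracy from the proxies.
+    r2 = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0),
+                         proxies, target, cv=5).mean()
+    print(name, round(r2, 3))
+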
+
+ comment: Accepted at DAGM GCPR 2023 +
+
+
+
+
+ + ☆ Disentangle then Parse:Night-time Semantic Segmentation with + Illumination Disentanglement ICCV2023 + + +
+ Most prior semantic segmentation methods have been developed for day-time +scenes, while typically underperforming in night-time scenes due to +insufficient and complicated lighting conditions. In this work, we tackle this +challenge by proposing a novel night-time semantic segmentation paradigm, i.e., +disentangle then parse (DTP). DTP explicitly disentangles night-time images +into light-invariant reflectance and light-specific illumination components and +then recognizes semantics based on their adaptive fusion. Concretely, the +proposed DTP comprises two key components: 1) Instead of processing +lighting-entangled features as in prior works, our Semantic-Oriented +Disentanglement (SOD) framework enables the extraction of reflectance component +without being impeded by lighting, allowing the network to consistently +recognize the semantics under cover of varying and complicated lighting +conditions. 2) Based on the observation that the illumination component can +serve as a cue for some semantically confused regions, we further introduce an +Illumination-Aware Parser (IAParser) to explicitly learn the correlation +between semantics and lighting, and aggregate the illumination features to +yield more precise predictions. Extensive experiments on the night-time +segmentation task with various settings demonstrate that DTP significantly +outperforms state-of-the-art methods. Furthermore, with negligible additional +parameters, DTP can be directly used to benefit existing day-time methods for +night-time segmentation. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ MOCA: Self-supervised Representation Learning by Predicting Masked + Online Codebook Assignments + + +
+ Self-supervised learning can be used for mitigating the greedy needs of +Vision Transformer networks for very large fully-annotated datasets. Different +classes of self-supervised learning offer representations with either good +contextual reasoning properties, e.g., using masked image modeling strategies, +or invariance to image perturbations, e.g., with contrastive methods. In this +work, we propose a single-stage and standalone method, MOCA, which unifies both +desired properties using novel mask-and-predict objectives defined with +high-level features (instead of pixel-level details). Moreover, we show how to +effectively employ both learning paradigms in a synergistic and +computation-efficient way. Doing so, we achieve new state-of-the-art results on +low-shot settings and strong experimental results in various evaluation +protocols with a training that is at least 3 times faster than prior methods. + +
+
+
+
+
+ + ☆ OnlineRefer: A Simple Online Baseline for Referring Video Object + Segmentation ICCV2023 + + +
+  Referring video object segmentation (RVOS) aims at segmenting an object in a video
+following human instruction. Current state-of-the-art methods fall into an offline
+pattern, in which each clip independently interacts with text embedding for cross-modal
+understanding. They usually argue that the offline pattern is necessary for RVOS, yet
+they model only limited temporal association within each clip. In this work, we break
+with the previous offline belief and propose a simple yet effective online model using
+explicit query propagation, named OnlineRefer. Specifically, our approach leverages
+target cues that gather semantic information and position priors to improve the
+accuracy and ease of referring predictions for the current frame. Furthermore, we
+generalize our online model into a semi-online framework to be compatible with
+video-based backbones. To show the effectiveness of our method, we evaluate it on four
+benchmarks, i.e., Refer-Youtube-VOS, Refer-DAVIS17, A2D-Sentences, and JHMDB-Sentences.
+Without bells and whistles, our OnlineRefer with a Swin-L backbone achieves 63.5 J&F
+and 64.8 J&F on Refer-Youtube-VOS and Refer-DAVIS17, outperforming all other offline
+methods.
+
+
+ comment: Accepted by ICCV2023. The code is at + https://github.com/wudongming97/OnlineRefer +
+
+
+
+
+ + ☆ SphereNet: Learning a Noise-Robust and General Descriptor for Point + Cloud Registration + + +
+  Point cloud registration aims to estimate a transformation that aligns point clouds
+collected from different perspectives. In learning-based point cloud registration, a
+robust descriptor is vital for high-accuracy registration. However, most methods are
+susceptible to noise and have poor generalization ability on unseen datasets. Motivated
+by this, we introduce SphereNet to learn a noise-robust and unseen-general descriptor
+for point cloud registration. In our method, first, the spheroid generator builds a
+geometric domain based on spherical voxelization to encode initial features. Then,
+spherical interpolation of the sphere is introduced to achieve robustness against
+noise. Finally, a new spherical convolutional neural network with spherical integrity
+padding completes the extraction of descriptors, which reduces the loss of features and
+fully captures the geometric features. To evaluate our method, a new benchmark
+3DMatch-noise with strong noise is introduced. Extensive experiments are carried out on
+both indoor and outdoor datasets. Under high-intensity noise, SphereNet increases the
+feature matching recall by more than 25 percentage points on 3DMatch-noise. In
+addition, it sets a new state-of-the-art performance for the 3DMatch and 3DLoMatch
+benchmarks with 93.5% and 75.6% registration recall, and also has the best
+generalization ability on unseen datasets.
+
+
+ comment: 15 pages, under review for IEEE Transactions on Circuits and Systems + for Video Technology +
+
+
+
+
+ + ☆ Visual Validation versus Visual Estimation: A Study on the Average Value + in Scatterplots + + +
+  We investigate the ability of individuals to visually validate statistical models in
+terms of their fit to the data. While visual model estimation has been studied
+extensively, visual model validation remains under-investigated. It is unknown how well
+people are able to visually validate models, and how their performance compares to
+visual and computational estimation. As a starting point, we conducted a study across
+two populations (crowdsourced and volunteers). Participants had to both visually
+estimate (i.e., draw) and visually validate (i.e., accept or reject) the frequently
+studied model of averages. Across both populations, the level of accuracy of the models
+that were considered valid was lower than the accuracy of the estimated models. We find
+that participants' validation and estimation were unbiased. Moreover, their natural
+critical point between accepting and rejecting a given mean value is close to the
+boundary of its 95% confidence interval, indicating that the visually perceived
+confidence interval corresponds to a common statistical standard. Our work contributes
+to the understanding of visual model validation and opens new research opportunities.
+
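+  For reference (generic statistics code, not the study's analysis script), the 95%
+confidence-interval boundary referred to above can be computed for a sample mean as
+follows:
+
+import numpy as np
+from scipy import stats
+
+def mean_confidence_interval(values, confidence=0.95):
+    """t-based confidence interval for the mean of a small sample."""
+    values = np.asarray(values, dtype=float)
+    mean = values.mean()
+    half_width = stats.sem(values) * stats.t.ppf((1 + confidence) / 2.0, len(values) - 1)
+    return mean - half_width, mean + half_width
+
+samples = np.array([3.1, 2.8, 3.4, 2.9, 3.3, 3.0, 2.7, 3.2])
+print(mean_confidence_interval(samples))
+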
+
+ comment: Preprint and Author Version of a Short Paper, accepted to the 2023 + IEEE Visualization Conference (VIS) +
+
+
+
+
+ + ☆ Towards a performance analysis on pre-trained Visual Question Answering + models for autonomous driving + + +
+ This short paper presents a preliminary analysis of three popular Visual +Question Answering (VQA) models, namely ViLBERT, ViLT, and LXMERT, in the +context of answering questions relating to driving scenarios. The performance +of these models is evaluated by comparing the similarity of responses to +reference answers provided by computer vision experts. Model selection is +predicated on the analysis of transformer utilization in multimodal +architectures. The results indicate that models incorporating cross-modal +attention and late fusion techniques exhibit promising potential for generating +improved answers within a driving perspective. This initial analysis serves as +a launchpad for a forthcoming comprehensive comparative study involving nine +VQA models and sets the scene for further investigations into the effectiveness +of VQA model queries in self-driving scenarios. Supplementary material is +available at +https://github.com/KaavyaRekanar/Towards-a-performance-analysis-on-pre-trained-VQA-models-for-autonomous-driving. + +
+
+
+
+
+ + ☆ Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking + Portrait Synthesis ICCV 2023 + + +
+ This paper presents ER-NeRF, a novel conditional Neural Radiance Fields +(NeRF) based architecture for talking portrait synthesis that can concurrently +achieve fast convergence, real-time rendering, and state-of-the-art performance +with small model size. Our idea is to explicitly exploit the unequal +contribution of spatial regions to guide talking portrait modeling. +Specifically, to improve the accuracy of dynamic head reconstruction, a compact +and expressive NeRF-based Tri-Plane Hash Representation is introduced by +pruning empty spatial regions with three planar hash encoders. For speech +audio, we propose a Region Attention Module to generate region-aware condition +feature via an attention mechanism. Different from existing methods that +utilize an MLP-based encoder to learn the cross-modal relation implicitly, the +attention mechanism builds an explicit connection between audio features and +spatial regions to capture the priors of local motions. Moreover, a direct and +fast Adaptive Pose Encoding is introduced to optimize the head-torso separation +problem by mapping the complex transformation of the head pose into spatial +coordinates. Extensive experiments demonstrate that our method renders better +high-fidelity and audio-lips synchronized talking portrait videos, with +realistic details and high efficiency compared to previous methods. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MarS3D: A Plug-and-Play Motion-Aware Model for Semantic Segmentation on + Multi-Scan 3D Point Clouds + + +
+  3D semantic segmentation on multi-scan large-scale point clouds plays an important
+role in autonomous systems. Unlike the single-scan-based semantic segmentation task,
+this task requires distinguishing the motion states of points in addition to their
+semantic categories. However, methods designed for single-scan-based segmentation tasks
+perform poorly on the multi-scan task due to the lack of an effective way to integrate
+temporal information. We propose MarS3D, a plug-and-play motion-aware module for
+semantic segmentation on multi-scan 3D point clouds. This module can be flexibly
+combined with single-scan models to give them multi-scan perception abilities. The
+model encompasses two key designs: the Cross-Frame Feature Embedding module for
+enriching representation learning and the Motion-Aware Feature Learning module for
+enhancing motion awareness. Extensive experiments show that MarS3D can improve the
+performance of the baseline model by a large margin. The code is available at
+https://github.com/CVMI-Lab/MarS3D.
+
+
+
+
+
+ + ☆ EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory + Forecasting ICCV 2023 + + +
+  Capturing high-dimensional social interactions and feasible futures is essential for
+predicting trajectories. To address this complex nature, several attempts have been
+devoted to reducing the dimensionality of the output variables via parametric curve
+fitting such as the Bézier curve and B-spline function. However, these functions, which
+originate in computer graphics, are not well suited to accounting for socially
+acceptable human dynamics. In this paper, we present EigenTrajectory ($\mathbb{ET}$), a
+trajectory prediction approach that uses a novel trajectory descriptor to form a
+compact space, known here as $\mathbb{ET}$ space, in place of Euclidean space, for
+representing pedestrian movements. We first reduce the complexity of the trajectory
+descriptor via a low-rank approximation. We transform the pedestrians' history paths
+into our $\mathbb{ET}$ space, represented by spatio-temporal principal components, and
+feed them into off-the-shelf trajectory forecasting models. The inputs and outputs of
+the models as well as social interactions are all gathered and aggregated in the
+corresponding $\mathbb{ET}$ space. Lastly, we propose a trajectory anchor-based
+refinement method to cover all possible futures in the proposed $\mathbb{ET}$ space.
+Extensive experiments demonstrate that our EigenTrajectory predictor can significantly
+improve both the prediction accuracy and reliability of existing trajectory forecasting
+models on public benchmarks, indicating that the proposed descriptor is well suited to
+representing pedestrian behaviors. Code is publicly available at
+https://github.com/inhwanbae/EigenTrajectory .
+
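+  A minimal sketch of the low-rank descriptor idea (illustrative only; the actual
+$\mathbb{ET}$-space construction, anchors, and social-interaction handling are not
+reproduced): flatten history trajectories, keep the top principal directions, and
+describe each path by its coefficients in that subspace.
+
+import numpy as np
+
+def fit_trajectory_basis(trajectories, rank=4):
+    """trajectories: (N, T, 2) pedestrian paths; returns (mean, basis) of a rank-`rank` subspace."""
+    flat = trajectories.reshape(len(trajectories), -1)        # (N, T*2)
+    mean = flat.mean(axis=0)
+    _, _, vt = np.linalg.svd(flat - mean, full_matrices=False)
+    return mean, vt[:rank].T                                  # (T*2,), (T*2, rank)
+
+def to_coefficients(trajectories, mean, basis):
+    """Compact per-path descriptors: coordinates in the learned low-rank subspace."""
+    return (trajectories.reshape(len(trajectories), -1) - mean) @ basis
+
+rng = np.random.default_rng(0)
+paths = np.cumsum(rng.normal(size=(32, 8, 2)), axis=1)        # toy random-walk trajectories
+mean, basis = fit_trajectory_basis(paths, rank=4)
+print(to_coefficients(paths, mean, basis).shape)              # (32, 4)
+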
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Conformal prediction under ambiguous ground truth + + +
+  In safety-critical classification tasks, conformal prediction makes it possible to
+perform rigorous uncertainty quantification by providing confidence sets that include
+the true class with a user-specified probability. This generally assumes the
+availability of a held-out calibration set with access to ground truth labels.
+Unfortunately, in many domains, such labels are difficult to obtain and are usually
+approximated by aggregating expert opinions. In fact, this holds true for almost all
+datasets, including well-known ones such as CIFAR and ImageNet. Applying conformal
+prediction using such labels underestimates uncertainty. Indeed, when expert opinions
+are not resolvable, there is inherent ambiguity present in the labels. That is, we do
+not have "crisp", definitive ground truth labels, and this uncertainty should be taken
+into account during calibration. In this paper, we develop a conformal prediction
+framework for such ambiguous ground truth settings, which relies on an approximation of
+the underlying posterior distribution of labels given inputs. We demonstrate our
+methodology on synthetic and real datasets, including a case study of skin condition
+classification in dermatology.
+
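+  For context, the standard split conformal recipe that assumes crisp calibration labels
+looks like the sketch below (generic illustration, not the paper's framework); the
+point above is that when calibration labels are aggregated expert opinions rather than
+definitive ground truth, this recipe underestimates uncertainty.
+
+import numpy as np
+
+def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+    """Split conformal prediction with unambiguous calibration labels.
+
+    cal_probs:  (n, K) predicted class probabilities on the calibration set
+    cal_labels: (n,)   integer calibration labels
+    test_probs: (m, K) predicted probabilities on test inputs
+    Returns a boolean (m, K) matrix marking the classes kept in each prediction set.
+    """
+    n = len(cal_labels)
+    scores = 1.0 - cal_probs[np.arange(n), cal_labels]        # nonconformity scores
+    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)      # finite-sample correction
+    q = np.quantile(scores, level)
+    return (1.0 - test_probs) <= q
+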
+
+
+
+
+ + ☆ RepViT: Revisiting Mobile CNN From ViT Perspective + + +
+  Recently, lightweight Vision Transformers (ViTs) have demonstrated superior
+performance and lower latency compared with lightweight Convolutional Neural Networks
+(CNNs) on resource-constrained mobile devices. This improvement is usually attributed
+to the multi-head self-attention module, which enables the model to learn global
+representations. However, the architectural disparities between lightweight ViTs and
+lightweight CNNs have not been adequately examined. In this study, we revisit the
+efficient design of lightweight CNNs and emphasize their potential for mobile devices.
+We incrementally enhance the mobile-friendliness of a standard lightweight CNN,
+specifically MobileNetV3, by integrating the efficient architectural choices of
+lightweight ViTs. This results in a new family of pure lightweight CNNs, namely RepViT.
+Extensive experiments show that RepViT outperforms existing state-of-the-art
+lightweight ViTs and exhibits favorable latency in various vision tasks. On ImageNet,
+RepViT achieves over 80% top-1 accuracy with nearly 1ms latency on an iPhone 12, which,
+to the best of our knowledge, is a first for a lightweight model. Our largest model,
+RepViT-M3, obtains 81.4% accuracy with only 1.3ms latency. The code and trained models
+are available at https://github.com/jameslahm/RepViT.
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Regression-free Blind Image Quality Assessment + + +
+ Regression-based blind image quality assessment (IQA) models are susceptible +to biased training samples, leading to a biased estimation of model parameters. +To mitigate this issue, we propose a regression-free framework for image +quality evaluation, which is founded upon retrieving similar instances by +incorporating semantic and distortion features. The motivation behind this +approach is rooted in the observation that the human visual system (HVS) has +analogous visual responses to semantically similar image contents degraded by +the same distortion. The proposed framework comprises two classification-based +modules: semantic-based classification (SC) module and distortion-based +classification (DC) module. Given a test image and an IQA database, the SC +module retrieves multiple pristine images based on semantic similarity. The DC +module then retrieves instances based on distortion similarity from the +distorted images that correspond to each retrieved pristine image. Finally, the +predicted quality score is derived by aggregating the subjective quality scores +of multiple retrieved instances. Experimental results on four benchmark +databases validate that the proposed model can remarkably outperform the +state-of-the-art regression-based models. + +
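+  A stripped-down sketch of the retrieve-and-aggregate idea (the two classification-based
+modules are collapsed here into plain cosine similarities over precomputed features,
+which is a simplification for illustration, not the paper's pipeline):
+
+import numpy as np
+
+def retrieval_quality_score(test_sem, test_dist, db_sem, db_dist, db_mos, k=5):
+    """Predict quality by averaging the subjective scores of the k most similar instances."""
+    def cosine(query, bank):
+        return bank @ query / (np.linalg.norm(bank, axis=1) * np.linalg.norm(query) + 1e-8)
+    similarity = cosine(test_sem, db_sem) + cosine(test_dist, db_dist)
+    top = np.argsort(-similarity)[:k]
+    return float(db_mos[top].mean())
+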
+
+ comment: 11 pages, 7 figures, 50 conferences +
+
+
+
+
+ + ☆ Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly + Supervised 3D Visual Grounding ICCV2023 + + +
+ 3D visual grounding involves finding a target object in a 3D scene that +corresponds to a given sentence query. Although many approaches have been +proposed and achieved impressive performance, they all require dense +object-sentence pair annotations in 3D point clouds, which are both +time-consuming and expensive. To address the problem that fine-grained +annotated data is difficult to obtain, we propose to leverage weakly supervised +annotations to learn the 3D visual grounding model, i.e., only coarse +scene-sentence correspondences are used to learn object-sentence links. To +accomplish this, we design a novel semantic matching model that analyzes the +semantic similarity between object proposals and sentences in a coarse-to-fine +manner. Specifically, we first extract object proposals and coarsely select the +top-K candidates based on feature and class similarity matrices. Next, we +reconstruct the masked keywords of the sentence using each candidate one by +one, and the reconstructed accuracy finely reflects the semantic similarity of +each candidate to the query. Additionally, we distill the coarse-to-fine +semantic matching knowledge into a typical two-stage 3D visual grounding model, +which reduces inference costs and improves performance by taking full advantage +of the well-studied structure of the existing architectures. We conduct +extensive experiments on ScanRefer, Nr3D, and Sr3D, which demonstrate the +effectiveness of our proposed method. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Knowledge Distillation for Object Detection: from generic to remote + sensing datasets + + +
+ Knowledge distillation, a well-known model compression technique, is an +active research area in both computer vision and remote sensing communities. In +this paper, we evaluate in a remote sensing context various off-the-shelf +object detection knowledge distillation methods which have been originally +developed on generic computer vision datasets such as Pascal VOC. In +particular, methods covering both logit mimicking and feature imitation +approaches are applied for vehicle detection using the well-known benchmarks +such as xView and VEDAI datasets. Extensive experiments are performed to +compare the relative performance and interrelationships of the methods. +Experimental results show high variations and confirm the importance of result +aggregation and cross validation on remote sensing datasets. + +
+
+ comment: Accepted for publishing at IGARSS 2023 +
+
+
+
+
+ + ☆ Neuromorphic spintronics simulated using an unconventional data-driven + Thiele equation approach SC + + +
+  In this study, we developed a quantitative description of the dynamics of spin-torque
+vortex nano-oscillators (STVOs) through an unconventional model based on the
+combination of the Thiele equation approach (TEA) and data from micromagnetic
+simulations (MMS). Solving the STVO dynamics with our analytical model allows us to
+accelerate the simulations by 9 orders of magnitude compared to MMS while reaching the
+same level of accuracy. Here, we showcase our model by simulating an STVO-based neural
+network for solving a classification task. We assess its performance with respect to
+the input signal current intensity and the level of noise that might affect such a
+system. Our approach is promising for accelerating the design of STVO-based
+neuromorphic computing devices while drastically decreasing their computational cost.
+
+
+ comment: Presented in ISCS2023 +
+
+
+
+
+ + ☆ Adaptive Topological Feature via Persistent Homology: Filtration + Learning for Point Clouds + + +
+ Machine learning for point clouds has been attracting much attention, with +many applications in various fields, such as shape recognition and material +science. To enhance the accuracy of such machine learning methods, it is known +to be effective to incorporate global topological features, which are typically +extracted by persistent homology. In the calculation of persistent homology for +a point cloud, we need to choose a filtration for the point clouds, an +increasing sequence of spaces. Because the performance of machine learning +methods combined with persistent homology is highly affected by the choice of a +filtration, we need to tune it depending on data and tasks. In this paper, we +propose a framework that learns a filtration adaptively with the use of neural +networks. In order to make the resulting persistent homology +isometry-invariant, we develop a neural network architecture with such +invariance. Additionally, we theoretically show a finite-dimensional +approximation result that justifies our architecture. Experimental results +demonstrated the efficacy of our framework in several classification tasks. + +
+
+ comment: 17 pages with 4 figures +
+
+
+
+
+ + ☆ Generation of High Spatial Resolution Terrestrial Surface from Low + Spatial Resolution Elevation Contour Maps via Hierarchical Computation of + Median Elevation Regions + + +
+  We propose a simple yet effective morphological approach to convert a sparse Digital
+Elevation Model (DEM) into a dense one. The conversion is similar to generating a
+high-resolution DEM from its low-resolution counterpart and relies on the generation of
+median contours. It proceeds in sequential steps: I) decompose the existing sparse
+contour map into the maximum possible Threshold Elevation Regions (TERs); II)
+hierarchically compute all possible non-negative, non-weighted Median Elevation Regions
+(MERs) between successive TERs decomposed from the sparse contour map; III) compute the
+gradient of all TERs and MERs obtained in the previous steps, which yields the
+predicted intermediate elevation contours at a higher spatial resolution. We first
+present this approach on synthetic data to show how the contour prediction works, and
+then experiment with the available contour map of Washington, NH to demonstrate its
+usefulness. The approach considers the geometric information of existing contours and
+interpolates elevation contours in new spatial regions of a topographic surface until
+no further elevation contours need to be generated. It is also very low-cost and
+robust, as it uses elevation contours.
+
+
+ comment: 11 pages, 6 figures,1 table, 1 algorithm +
+
+
+
+
+ + ☆ Fusing Hand and Body Skeletons for Human Action Recognition in Assembly ICANN + + +
+ As collaborative robots (cobots) continue to gain popularity in industrial +manufacturing, effective human-robot collaboration becomes crucial. Cobots +should be able to recognize human actions to assist with assembly tasks and act +autonomously. To achieve this, skeleton-based approaches are often used due to +their ability to generalize across various people and environments. Although +body skeleton approaches are widely used for action recognition, they may not +be accurate enough for assembly actions where the worker's fingers and hands +play a significant role. To address this limitation, we propose a method in +which less detailed body skeletons are combined with highly detailed hand +skeletons. We investigate CNNs and transformers, the latter of which are +particularly adept at extracting and combining important information from both +skeleton types using attention. This paper demonstrates the effectiveness of +our proposed approach in enhancing action recognition in assembly scenarios. + +
+
+ comment: International Conference on Artificial Neural Networks (ICANN) 2023 +
+
+
+
+
+ + ☆ Augmenting CLIP with Improved Visio-Linguistic Reasoning + + +
+  Image-text contrastive models such as CLIP are useful for a variety of downstream
+applications including zero-shot classification, image-text retrieval and transfer
+learning. However, these contrastively trained vision-language models often fail on
+compositional visio-linguistic tasks such as Winoground, with performance equivalent to
+random chance. In our paper, we address this issue and propose a sample-efficient,
+lightweight method called SDS-CLIP to improve the compositional visio-linguistic
+reasoning capabilities of CLIP. The core idea of our method is to use differentiable
+image parameterizations to fine-tune CLIP with a distillation objective from large
+text-to-image generative models such as Stable-Diffusion, which are relatively good at
+visio-linguistic reasoning tasks. On the challenging Winoground compositional reasoning
+benchmark, our method improves the absolute visio-linguistic performance of different
+CLIP models by up to 7%, while on the ARO dataset it improves the visio-linguistic
+performance by up to 3%. As a byproduct of inducing visio-linguistic reasoning into
+CLIP, we also find that zero-shot performance improves marginally on a variety of
+downstream datasets. Our method reinforces that carefully designed distillation
+objectives from generative models can be leveraged to extend existing contrastive
+image-text models with improved visio-linguistic reasoning capabilities.
+
+
+
+
+
+ + ☆ A Survey on Open-Vocabulary Detection and Segmentation: Past, Present, + and Future + + +
+ As the most fundamental tasks of computer vision, object detection and +segmentation have made tremendous progress in the deep learning era. Due to the +expensive manual labeling, the annotated categories in existing datasets are +often small-scale and pre-defined, i.e., state-of-the-art detectors and +segmentors fail to generalize beyond the closed-vocabulary. To resolve this +limitation, the last few years have witnessed increasing attention toward +Open-Vocabulary Detection (OVD) and Segmentation (OVS). In this survey, we +provide a comprehensive review on the past and recent development of OVD and +OVS. To this end, we develop a taxonomy according to the type of task and +methodology. We find that the permission and usage of weak supervision signals +can well discriminate different methodologies, including: visual-semantic space +mapping, novel visual feature synthesis, region-aware training, +pseudo-labeling, knowledge distillation-based, and transfer learning-based. The +proposed taxonomy is universal across different tasks, covering object +detection, semantic/instance/panoptic segmentation, 3D scene and video +understanding. In each category, its main principles, key challenges, +development routes, strengths, and weaknesses are thoroughly discussed. In +addition, we benchmark each task along with the vital components of each +method. Finally, several promising directions are provided to stimulate future +research. + +
+
+
+
+
+ + ☆ You've Got Two Teachers: Co-evolutionary Image and Report Distillation + for Semi-supervised Anatomical Abnormality Detection in Chest X-ray + + +
+ Chest X-ray (CXR) anatomical abnormality detection aims at localizing and +characterising cardiopulmonary radiological findings in the radiographs, which +can expedite clinical workflow and reduce observational oversights. Most +existing methods attempted this task in either fully supervised settings which +demanded costly mass per-abnormality annotations, or weakly supervised settings +which still lagged badly behind fully supervised methods in performance. In +this work, we propose a co-evolutionary image and report distillation (CEIRD) +framework, which approaches semi-supervised abnormality detection in CXR by +grounding the visual detection results with text-classified abnormalities from +paired radiology reports, and vice versa. Concretely, based on the classical +teacher-student pseudo label distillation (TSD) paradigm, we additionally +introduce an auxiliary report classification model, whose prediction is used +for report-guided pseudo detection label refinement (RPDLR) in the primary +vision detection task. Inversely, we also use the prediction of the vision +detection model for abnormality-guided pseudo classification label refinement +(APCLR) in the auxiliary report classification task, and propose a co-evolution +strategy where the vision and report models mutually promote each other with +RPDLR and APCLR performed alternatively. To this end, we effectively +incorporate the weak supervision by reports into the semi-supervised TSD +pipeline. Besides the cross-modal pseudo label refinement, we further propose +an intra-image-modal self-adaptive non-maximum suppression, where the pseudo +detection labels generated by the teacher vision model are dynamically +rectified by high-confidence predictions by the student. Experimental results +on the public MIMIC-CXR benchmark demonstrate CEIRD's superior performance to +several up-to-date weakly and semi-supervised methods. + +
+
+
+
+
+ + ☆ Pixel-wise Graph Attention Networks for Person Re-identification + + +
+  Graph convolutional networks (GCNs) are widely used to handle irregular data, since
+they update node features using the structural information of the graph. With the help
+of iterated GCNs, high-order information can be obtained to further enhance node
+representations. However, how to apply GCNs to structured data (such as images) has not
+been studied in depth. In this paper, we explore the application of graph attention
+networks (GAT) to image feature extraction. First of all, we propose a novel graph
+generation algorithm that converts images into graphs through a matrix transformation;
+it is one order of magnitude faster than an algorithm based on K Nearest Neighbors
+(KNN). Then, GAT is used on the generated graph to update the node features, yielding a
+more robust representation. These two steps are combined into a module called the
+pixel-wise graph attention module (PGA). Since the graph obtained by our graph
+generation algorithm can still be transformed back into an image after processing, PGA
+can be readily combined with CNNs. Based on these two modules, we follow the design of
+ResNet and build a pixel-wise graph attention network (PGANet). PGANet is applied to
+person re-identification on the Market1501, DukeMTMC-reID and Occluded-DukeMTMC
+datasets, outperforming the state of the art by 0.8%, 1.1% and 11% in mAP,
+respectively, and achieving state-of-the-art performance. The code is available at
+https://github.com/wenyu1009/PGANet.
+
+
+
+
+
+ + ☆ Jean-Luc Picard at Touché 2023: Comparing Image Generation, Stance + Detection and Feature Matching for Image Retrieval for Arguments + + +
+  Participating in the shared task "Image Retrieval for Arguments", we used different
+pipelines for image retrieval comprising Image Generation, Stance Detection,
+Preselection and Feature Matching. We submitted four runs with different pipeline
+layouts and compare them to the given baseline. Our pipelines perform similarly to the
+baseline.
+
+
+ comment: 7 pages, 1 figure, 1 table, conference: CLEF +
+
+
+
+
+ + ☆ Towards Trustworthy Dataset Distillation + + +
+  Efficiency and trustworthiness are two eternal pursuits when applying deep learning in
+real-world applications. With regard to efficiency, dataset distillation (DD) endeavors
+to reduce training costs by distilling the large dataset into a tiny synthetic dataset.
+However, existing methods merely concentrate on in-distribution (InD) classification in
+a closed-world setting, disregarding out-of-distribution (OOD) samples. On the other
+hand, OOD detection aims to enhance models' trustworthiness, which is typically
+achieved inefficiently in full-data settings. For the first time, we simultaneously
+consider both issues and propose a novel paradigm called Trustworthy Dataset
+Distillation (TrustDD). By distilling both InD samples and outliers, the condensed
+datasets are capable of training models competent in both InD classification and OOD
+detection. To alleviate the requirement for real outlier data and make OOD detection
+more practical, we further propose to corrupt InD samples to generate pseudo-outliers
+and introduce Pseudo-Outlier Exposure (POE). Comprehensive experiments on various
+settings demonstrate the effectiveness of TrustDD, and the proposed POE surpasses the
+state-of-the-art method Outlier Exposure (OE). Compared with preceding DD, TrustDD is
+more trustworthy and applicable to real open-world scenarios. Our code will be publicly
+available.
+
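+  The Pseudo-Outlier Exposure idea can be sketched roughly as below (the corruption, the
+loss weighting, and the push-toward-uniform penalty follow the generic Outlier Exposure
+recipe and are placeholders here, not the paper's exact formulation):
+
+import torch
+import torch.nn.functional as F
+
+def make_pseudo_outliers(images):
+    """Hypothetical corruption: heavy Gaussian noise turns InD images into pseudo-outliers."""
+    return (images + 0.5 * torch.randn_like(images)).clamp(0.0, 1.0)
+
+def poe_loss(logits_ind, targets_ind, logits_out, lam=0.5):
+    """Cross-entropy on InD samples plus a uniform-prediction penalty on pseudo-outliers."""
+    ce = F.cross_entropy(logits_ind, targets_ind)
+    uniform_penalty = -F.log_softmax(logits_out, dim=1).mean()
+    return ce + lam * uniform_penalty
+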
+
+ comment: 20 pages, 20 figures +
+
+
+
+
+ + ☆ CG-fusion CAM: Online segmentation of laser-induced damage on + large-aperture optics + + +
+ Online segmentation of laser-induced damage on large-aperture optics in +high-power laser facilities is challenged by complicated damage morphology, +uneven illumination and stray light interference. Fully supervised semantic +segmentation algorithms have achieved state-of-the-art performance, but rely on +plenty of pixel-level labels, which are time-consuming and labor-consuming to +produce. LayerCAM, an advanced weakly supervised semantic segmentation +algorithm, can generate pixel-accurate results using only image-level labels, +but its scattered and partially under-activated class activation regions +degrade segmentation performance. In this paper, we propose a weakly supervised +semantic segmentation method with Continuous Gradient CAM and its nonlinear +multi-scale fusion (CG-fusion CAM). The method redesigns the way of +back-propagating gradients and non-linearly activates the multi-scale fused +heatmaps to generate more fine-grained class activation maps with appropriate +activation degree for different sizes of damage sites. Experiments on our +dataset show that the proposed method can achieve segmentation performance +comparable to that of fully supervised algorithms. + +
+
+
+
+
+ + ☆ Constraining Depth Map Geometry for Multi-View Stereo: A Dual-Depth + Approach with Saddle-shaped Depth Cells ICCV 2023 + + +
+  Learning-based multi-view stereo (MVS) methods deal with predicting accurate depth
+maps to achieve an accurate and complete 3D representation. Despite the excellent
+performance, existing methods ignore the fact that a suitable depth geometry is also
+critical in MVS. In this paper, we demonstrate that different depth geometries have
+significant performance gaps, even using the same depth prediction error. Therefore, we
+introduce an ideal depth geometry composed of Saddle-Shaped Cells, whose predicted
+depth map oscillates upward and downward around the ground-truth surface, rather than
+maintaining a continuous and smooth depth plane. To achieve it, we develop a
+coarse-to-fine framework called Dual-MVSNet (DMVSNet), which can produce an oscillating
+depth plane. Technically, we predict two depth values for each pixel (Dual-Depth), and
+propose a novel loss function and a checkerboard-shaped selecting strategy to constrain
+the predicted depth geometry. Compared to existing methods, DMVSNet achieves a high
+rank on the DTU benchmark and obtains the top performance on challenging scenes of
+Tanks and Temples, demonstrating its strong performance and generalization ability. Our
+method also points to a new research direction for considering depth geometry in MVS.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Class-relation Knowledge Distillation for Novel Class Discovery ICCV2023 + + +
+ We tackle the problem of novel class discovery, which aims to learn novel +classes without supervision based on labeled data from known classes. A key +challenge lies in transferring the knowledge in the known-class data to the +learning of novel classes. Previous methods mainly focus on building a shared +representation space for knowledge transfer and often ignore modeling class +relations. To address this, we introduce a class relation representation for +the novel classes based on the predicted class distribution of a model trained +on known classes. Empirically, we find that such class relation becomes less +informative during typical discovery training. To prevent such information +loss, we propose a novel knowledge distillation framework, which utilizes our +class-relation representation to regularize the learning of novel classes. In +addition, to enable a flexible knowledge distillation scheme for each data +point in novel classes, we develop a learnable weighting function for the +regularization, which adaptively promotes knowledge transfer based on the +semantic similarity between the novel and known classes. To validate the +effectiveness and generalization of our method, we conduct extensive +experiments on multiple benchmarks, including CIFAR100, Stanford Cars, CUB, and +FGVC-Aircraft datasets. Our results demonstrate that the proposed method +outperforms the previous state-of-the-art methods by a significant margin on +almost all benchmarks. Code is available at +\href{https://github.com/kleinzcy/Cr-KD-NCD}{here}. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection + + +
+ In this paper, we propose a novel and effective Multi-Level Fusion network, +named as MLF-DET, for high-performance cross-modal 3D object DETection, which +integrates both the feature-level fusion and decision-level fusion to fully +utilize the information in the image. For the feature-level fusion, we present +the Multi-scale Voxel Image fusion (MVI) module, which densely aligns +multi-scale voxel features with image features. For the decision-level fusion, +we propose the lightweight Feature-cued Confidence Rectification (FCR) module +which further exploits image semantics to rectify the confidence of detection +candidates. Besides, we design an effective data augmentation strategy termed +Occlusion-aware GT Sampling (OGS) to reserve more sampled objects in the +training scenes, so as to reduce overfitting. Extensive experiments on the +KITTI dataset demonstrate the effectiveness of our method. Notably, on the +extremely competitive KITTI car 3D object detection benchmark, our method +reaches 82.89% moderate AP and achieves state-of-the-art performance without +bells and whistles. + +
+
+
+
+
+ + ☆ OPHAvatars: One-shot Photo-realistic Head Avatars + + +
+  We propose a method for synthesizing photo-realistic digital avatars from only one
+portrait as the reference. Given a portrait, our method synthesizes a coarse talking
+head video using driving keypoint features. From the coarse video, our method then
+synthesizes a coarse talking head avatar with a deforming neural radiance field. With
+rendered images of the coarse avatar, our method updates the low-quality images with a
+blind face restoration model. With the updated images, we retrain the avatar for higher
+quality. After several iterations, our method can synthesize a photo-realistic,
+animatable 3D neural head avatar. The motivation of our method is that a deformable
+neural radiance field can eliminate the unnatural distortion caused by the
+image-to-video synthesis step. Our method outperforms state-of-the-art methods in
+quantitative and qualitative studies on various subjects.
+
+
+
+
+
+ + ☆ PRO-Face S: Privacy-preserving Reversible Obfuscation of Face Images via + Secure Flow + + +
+  This paper proposes a novel paradigm for facial privacy protection that unifies
+multiple characteristics, including anonymity, diversity, reversibility and security,
+within a single lightweight framework. We name it PRO-Face S, short for
+Privacy-preserving Reversible Obfuscation of Face images via Secure flow-based model.
+In the framework, an Invertible Neural Network (INN) is utilized to process the input
+image along with its pre-obfuscated form and generate a privacy-protected image that
+visually approximates the pre-obfuscated one, thus ensuring privacy. The
+pre-obfuscation applied can take diverse forms, with different strengths and styles
+specified by users. During protection, a secret key is injected into the network such
+that the original image can only be recovered from the protected image via the same
+model when the correct key is provided. Two modes of image recovery are devised to deal
+with malicious recovery attempts in different scenarios. Finally, extensive experiments
+conducted on three public image datasets demonstrate the superiority of the proposed
+framework over multiple state-of-the-art approaches.
+
+
+
+
+
+ + ☆ MVA2023 Small Object Detection Challenge for Spotting Birds: Dataset, + Methods, and Results + + +
+ Small Object Detection (SOD) is an important machine vision topic because (i) +a variety of real-world applications require object detection for distant +objects and (ii) SOD is a challenging task due to the noisy, blurred, and +less-informative image appearances of small objects. This paper proposes a new +SOD dataset consisting of 39,070 images including 137,121 bird instances, which +is called the Small Object Detection for Spotting Birds (SOD4SB) dataset. The +detail of the challenge with the SOD4SB dataset is introduced in this paper. In +total, 223 participants joined this challenge. This paper briefly introduces +the award-winning methods. The dataset, the baseline code, and the website for +evaluation on the public testset are publicly available. + +
+
+ comment: This paper is included in the proceedings of the 18th International + Conference on Machine Vision Applications (MVA2023). It will be officially + published at a later date. Project page : + https://www.mva-org.jp/mva2023/challenge +
+
+
+
+
+ + ☆ DropMix: Reducing Class Dependency in Mixed Sample Data Augmentation + + +
+ Mixed sample data augmentation (MSDA) is a widely used technique that has +been found to improve performance in a variety of tasks. However, in this +paper, we show that the effects of MSDA are class-dependent, with some classes +seeing an improvement in performance while others experience a decline. To +reduce class dependency, we propose the DropMix method, which excludes a +specific percentage of data from the MSDA computation. By training on a +combination of MSDA and non-MSDA data, the proposed method not only improves +the performance of classes that were previously degraded by MSDA, but also +increases overall average accuracy, as shown in experiments on two datasets +(CIFAR-100 and ImageNet) using three MSDA methods (Mixup, CutMix and +PuzzleMix). + +
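+  A minimal sketch of the exclusion idea for the Mixup case (illustrative only; CutMix
+and PuzzleMix, as well as the exact sampling scheme studied in the paper, are not
+reproduced, and the 25% drop rate below is an arbitrary example value):
+
+import torch
+
+def dropmix_batch(images, labels_onehot, drop_rate=0.25, alpha=1.0):
+    """Apply Mixup to only part of a batch; the excluded samples are kept unmixed."""
+    b = images.size(0)
+    keep = torch.rand(b, device=images.device) >= drop_rate      # samples that do get mixed
+    lam = torch.distributions.Beta(alpha, alpha).sample().item()
+    perm = torch.randperm(b, device=images.device)
+    mixed_images = images.clone()
+    mixed_labels = labels_onehot.clone().float()
+    mixed_images[keep] = lam * images[keep] + (1 - lam) * images[perm][keep]
+    mixed_labels[keep] = lam * mixed_labels[keep] + (1 - lam) * labels_onehot[perm][keep].float()
+    return mixed_images, mixed_labels
+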
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ Light-Weight Vision Transformer with Parallel Local and Global + Self-Attention SC + + +
+  While transformer architectures have dominated computer vision in recent years, these
+models cannot easily be deployed on hardware with limited resources for autonomous
+driving tasks that require real-time performance. Their computational complexity and
+memory requirements limit their use, especially for applications with high-resolution
+inputs. In our work, we redesign the powerful state-of-the-art Vision Transformer
+PLG-ViT into a much more compact and efficient architecture that is suitable for such
+tasks. We identify computationally expensive blocks in the original PLG-ViT
+architecture and propose several redesigns aimed at reducing the number of parameters
+and floating-point operations. As a result of our redesign, we are able to reduce
+PLG-ViT in size by a factor of 5, with a moderate drop in performance. We propose two
+variants, optimized for the best trade-offs between parameter count and runtime as well
+as parameter count and accuracy. With only 5 million parameters, we achieve 79.5% top-1
+accuracy on the ImageNet-1K classification benchmark. Our networks demonstrate strong
+performance on general vision benchmarks like COCO instance segmentation. In addition,
+we conduct a series of experiments demonstrating the potential of our approach in
+solving various tasks specifically tailored to the challenges of autonomous driving and
+transportation.
+
+
+ comment: This paper has been accepted at IEEE Intelligent Transportation + Systems Conference (ITSC), 2023 +
+
+
+
+
+ + ☆ NU-MCC: Multiview Compressive Coding with Neighborhood Decoder and + Repulsive UDF + + +
+  Remarkable progress has been made in 3D reconstruction from single-view RGB-D inputs.
+MCC is the current state-of-the-art method in this field, achieving unprecedented
+success by combining vision Transformers with large-scale training. However, we
+identified two key limitations of MCC: 1) the Transformer decoder is inefficient in
+handling a large number of query points; 2) the 3D representation struggles to recover
+high-fidelity details. In this paper, we propose a new approach called NU-MCC that
+addresses these limitations. NU-MCC includes two key innovations: a Neighborhood
+decoder and a Repulsive Unsigned Distance Function (Repulsive UDF). First, our
+Neighborhood decoder introduces center points as an efficient proxy of input visual
+features, allowing each query point to only attend to a small neighborhood. This design
+not only results in much faster inference speed but also enables the exploitation of
+finer-scale visual features for improved recovery of 3D textures. Second, our Repulsive
+UDF is a novel alternative to the occupancy field used in MCC, significantly improving
+the quality of 3D object reconstruction. Compared to standard UDFs that suffer from
+holes in the results, our proposed Repulsive UDF can achieve more complete surface
+reconstruction. Experimental results demonstrate that NU-MCC is able to learn a strong
+3D representation, significantly advancing the state of the art in single-view 3D
+reconstruction. In particular, it outperforms MCC by 9.7% in terms of the F1-score on
+the CO3D-v2 dataset with more than 5x faster running speed.
+
+
+ comment: Project page: https://numcc.github.io/ +
+
+
+
+
+ + ☆ Mining of Single-Class by Active Learning for Semantic Segmentation + + +
+  Several Active Learning (AL) policies require retraining a target model several times
+in order to identify the most informative samples, and rarely offer the option to focus
+on the acquisition of samples from underrepresented classes. Here the Mining of
+Single-Class by Active Learning (MiSiCAL) paradigm is introduced, where an AL policy is
+constructed through deep reinforcement learning and exploits quantity-accuracy
+correlations to build datasets on which high-performance models can be trained with
+regard to specific classes. MiSiCAL is especially helpful in the case of very large
+batch sizes since it does not require repeated model training sessions, as is common in
+other AL methods. This is thanks to its ability to exploit fixed representations of the
+candidate data points. We find that MiSiCAL is able to outperform a random policy on
+150 out of 171 COCO10k classes, while the strongest baseline only outperforms random on
+101 classes.
+
+
+ comment: 29 pages, 14 figures, 2 tables +
+
+
+
+
+ + ☆ Division Gets Better: Learning Brightness-Aware and Detail-Sensitive + Representations for Low-Light Image Enhancement + + +
+  Low-light image enhancement strives to improve contrast, adjust visibility, and
+restore distortions in color and texture. Existing methods usually pay more attention
+to improving visibility and contrast by increasing the lightness of low-light images,
+while disregarding the significance of color and texture restoration for high-quality
+images. To address this issue, we propose a novel luminance and chrominance dual-branch
+network, termed LCDBNet, for low-light image enhancement, which divides low-light image
+enhancement into two sub-tasks, i.e., luminance adjustment and chrominance restoration.
+Specifically, LCDBNet is composed of two branches, namely a luminance adjustment
+network (LAN) and a chrominance restoration network (CRN). LAN is responsible for
+learning brightness-aware features by leveraging long-range dependencies and local
+attention correlation, while CRN concentrates on learning detail-sensitive features via
+multi-level wavelet decomposition. Finally, a fusion network is designed to blend their
+learned features to produce visually impressive images. Extensive experiments conducted
+on seven benchmark datasets validate the effectiveness of our proposed LCDBNet, and the
+results show that LCDBNet achieves superior performance in terms of multiple
+reference/non-reference quality evaluators compared to other state-of-the-art
+competitors. Our code and pretrained model will be made available.
+
+
+ comment: 14 pages, 16 figures +
+
+
+
+
+ + ☆ A Survey on Multi-Objective Neural Architecture Search + + +
+  Recently, expert-crafted neural architectures have increasingly been overtaken by
+neural architecture search (NAS) and the automatic generation (and tuning) of network
+structures, which is closely related to Hyperparameter Optimization and Automated
+Machine Learning (AutoML). After earlier NAS attempts that optimized only prediction
+accuracy, Multi-Objective Neural Architecture Search (MONAS) has been attracting
+attention; it considers additional goals such as computational complexity, power
+consumption, and network size, reaching a trade-off between accuracy and other features
+such as computational cost. In this paper, we present an overview of the principal and
+state-of-the-art works in the field of MONAS. Starting from a well-categorized taxonomy
+and formulation for NAS, we address and correct some miscategorizations in previous
+surveys of the NAS field. We also provide a list of all known objectives used, add a
+number of new ones, and elaborate their specifications. We provide analyses of the most
+important objectives and show that the stochastic properties of some of them should be
+treated differently from deterministic ones in the multi-objective optimization
+procedure of NAS. We conclude this paper with a number of future directions and open
+topics in the field of MONAS.
+
+
+ comment: 22 pages, 10 figures, 9 tables +
+
+
+
+
+ + ☆ PixelHuman: Animatable Neural Radiance Fields from Few Images + + +
+ In this paper, we propose PixelHuman, a novel human rendering model that
+generates animatable human scenes from a few images of a person with unseen
+identity, views, and poses. Previous works have demonstrated reasonable
+performance in novel view and pose synthesis, but they rely on a large number
+of images for training and are trained per scene from videos, which requires a
+significant amount of time to produce animatable scenes from unseen human
+images. Our method differs from existing methods in that it can generalize to
+any input image for animatable human synthesis. Given a random pose sequence,
+our method synthesizes each target scene using a neural radiance field that is
+conditioned on a canonical representation and pose-aware pixel-aligned
+features, both of which can be obtained through deformation fields learned in
+a data-driven manner. Our experiments show that our method achieves
+state-of-the-art performance in multiview and novel pose synthesis from
+few-shot images.
+
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Evaluate Fine-tuning Strategies for Fetal Head Ultrasound Image + Segmentation with U-Net + + +
+ Fetal head segmentation is a crucial step in measuring the fetal head
+circumference (HC) during gestation, an important biometric in obstetrics for
+monitoring fetal growth. However, manual biometry generation is time-consuming
+and results in inconsistent accuracy. To address this issue, convolutional
+neural network (CNN) models have been utilized to improve the efficiency of
+medical biometry. However, training a CNN from scratch is challenging, so we
+propose a Transfer Learning (TL) method. Our approach involves fine-tuning
+(FT) a U-Net network with a lightweight MobileNet as the encoder to perform
+segmentation on a set of fetal head ultrasound (US) images with limited
+effort. Our results suggest that the proposed FT strategy yields comparable
+segmentation performance while reducing the number of trained parameters by
+85.8%, and that it outperforms other strategies whose trainable parameter
+sizes are below 4.4 million. Thus, we contend that it can serve as a
+dependable FT approach for reducing model size in medical image analysis. Our
+key findings highlight the importance of balancing model performance and size
+when developing Artificial Intelligence (AI) applications via TL methods. Code
+is available at
+https://github.com/13204942/FT_Methods_for_Fetal_Head_Segmentation.
+
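+ The fine-tuning recipe described above can be pictured with a minimal PyTorch
+ sketch: a MobileNetV2 encoder is frozen and only a small decoder is optimized.
+ This is a toy model under stated assumptions (no skip connections, random
+ weights, made-up decoder sizes), not the paper's released architecture.

```python
import torch
import torch.nn as nn
from torchvision.models import mobilenet_v2

class MobileNetSegLite(nn.Module):
    """Encoder-frozen segmentation sketch (illustrative; skip connections omitted)."""
    def __init__(self, n_classes: int = 1, freeze_encoder: bool = True):
        super().__init__()
        self.encoder = mobilenet_v2(weights=None).features  # 1280 channels at 1/32 scale
        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False
        self.decoder = nn.Sequential(
            nn.Conv2d(1280, 256, 3, padding=1), nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
            nn.Conv2d(256, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=8, mode="bilinear", align_corners=False),
            nn.Conv2d(64, n_classes, 1),
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

model = MobileNetSegLite()
# Only the decoder parameters are handed to the optimizer during fine-tuning.
optim = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)
out = model(torch.randn(1, 3, 256, 256))
print(out.shape)  # torch.Size([1, 1, 256, 256])
```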
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ PatchCT: Aligning Patch Set and Label Set with Conditional Transport for + Multi-Label Image Classification ICCV23 + + +
+ Multi-label image classification is a prediction task that aims to identify +more than one label from a given image. This paper considers the semantic +consistency of the latent space between the visual patch and linguistic label +domains and introduces the conditional transport (CT) theory to bridge the +acknowledged gap. While recent cross-modal attention-based studies have +attempted to align such two representations and achieved impressive +performance, they required carefully-designed alignment modules and extra +complex operations in the attention computation. We find that by formulating +the multi-label classification as a CT problem, we can exploit the interactions +between the image and label efficiently by minimizing the bidirectional CT +cost. Specifically, after feeding the images and textual labels into the +modality-specific encoders, we view each image as a mixture of patch embeddings +and a mixture of label embeddings, which capture the local region features and +the class prototypes, respectively. CT is then employed to learn and align +those two semantic sets by defining the forward and backward navigators. +Importantly, the defined navigators in CT distance model the similarities +between patches and labels, which provides an interpretable tool to visualize +the learned prototypes. Extensive experiments on three public image benchmarks +show that the proposed model consistently outperforms the previous methods. Our +code is available at https://github.com/keepgoingjkg/PatchCT. + +
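+ One plausible reading of the bidirectional conditional-transport cost is
+ sketched below: cosine distances between patch and label embeddings are
+ weighted by forward (patch-to-label) and backward (label-to-patch) navigators
+ implemented as softmaxes. The temperature and the exact navigator
+ parameterization are assumptions for illustration, not the authors'
+ implementation.

```python
import torch
import torch.nn.functional as F

def conditional_transport_cost(patches, labels, temperature=0.1):
    """Bidirectional conditional-transport cost between patch and label embeddings.

    patches: (N, d) image patch embeddings; labels: (M, d) textual label embeddings.
    """
    p = F.normalize(patches, dim=-1)
    l = F.normalize(labels, dim=-1)
    cost = 1.0 - p @ l.t()                                 # (N, M) cosine distance
    forward_nav = F.softmax(-cost / temperature, dim=1)    # each patch navigates to labels
    backward_nav = F.softmax(-cost / temperature, dim=0)   # each label navigates to patches
    forward_cost = (forward_nav * cost).sum(dim=1).mean()
    backward_cost = (backward_nav * cost).sum(dim=0).mean()
    return forward_cost + backward_cost

loss = conditional_transport_cost(torch.randn(49, 512), torch.randn(5, 512))
print(loss.item())
```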
+
+ comment: accepted by ICCV23 +
+
+
+
+
+ + ☆ Learning Adaptive Neighborhoods for Graph Neural Networks ICCV 2023 + + +
+ Graph convolutional networks (GCNs) enable end-to-end learning on graph +structured data. However, many works assume a given graph structure. When the +input graph is noisy or unavailable, one approach is to construct or learn a +latent graph structure. These methods typically fix the choice of node degree +for the entire graph, which is suboptimal. Instead, we propose a novel +end-to-end differentiable graph generator which builds graph topologies where +each node selects both its neighborhood and its size. Our module can be readily +integrated into existing pipelines involving graph convolution operations, +replacing the predetermined or existing adjacency matrix with one that is +learned, and optimized, as part of the general objective. As such it is +applicable to any GCN. We integrate our module into trajectory prediction, +point cloud classification and node classification pipelines resulting in +improved accuracy over other structure-learning methods across a wide range of +datasets and GCN backbones. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Unleashing the Imagination of Text: A Novel Framework for Text-to-image + Person Retrieval via Exploring the Power of Words + + +
+ The goal of text-to-image person retrieval is to retrieve person images from
+a large gallery that match the given textual descriptions. The main challenge
+of this task lies in the significant differences in information representation
+between the visual and textual modalities. The textual modality conveys
+abstract and precise information through vocabulary and grammatical
+structures, while the visual modality conveys concrete and intuitive
+information through images. To fully leverage the expressive power of textual
+representations, it is essential to accurately map abstract textual
+descriptions to specific images.
+ To address this issue, we propose a novel framework to Unleash the
+Imagination of Text (UIT) in text-to-image person retrieval, aiming to fully
+explore the power of words in sentences. Specifically, the framework employs
+the pre-trained full CLIP model as a dual encoder for the images and texts,
+taking advantage of prior cross-modal alignment knowledge. A Text-guided Image
+Restoration auxiliary task is proposed with the aim of implicitly mapping
+abstract textual entities to specific image regions, facilitating alignment
+between textual and visual embeddings. Additionally, we introduce a
+cross-modal triplet loss tailored for handling hard samples, enhancing the
+model's ability to distinguish minor differences.
+ To focus the model on the key components within sentences, we propose a novel
+text data augmentation technique. Our proposed methods achieve
+state-of-the-art results on three popular benchmark datasets, and the source
+code will be made publicly available shortly.
+
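+ The cross-modal triplet loss with hard samples can be illustrated with a
+ generic in-batch hardest-negative formulation (in the spirit of VSE++); the
+ paper's tailored loss may differ in its mining and weighting details.

```python
import torch
import torch.nn.functional as F

def hard_cross_modal_triplet(img, txt, margin=0.2):
    """Triplet loss with in-batch hardest negatives for matched image/text pairs.

    img, txt: (B, d) embeddings, where img[i] matches txt[i].
    """
    img = F.normalize(img, dim=-1)
    txt = F.normalize(txt, dim=-1)
    sim = img @ txt.t()                              # (B, B) similarity matrix
    pos = sim.diag().unsqueeze(1)                    # similarity of matched pairs
    mask = torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
    # Hardest negative caption per image, and hardest negative image per caption.
    neg_i2t = sim.masked_fill(mask, -1.0).max(dim=1, keepdim=True).values
    neg_t2i = sim.masked_fill(mask, -1.0).max(dim=0, keepdim=True).values
    return F.relu(margin + neg_i2t - pos).mean() + F.relu(margin + neg_t2i - pos.t()).mean()

print(hard_cross_modal_triplet(torch.randn(8, 512), torch.randn(8, 512)).item())
```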
+
+
+
+
+ + ☆ Outlier-Robust Tensor Low-Rank Representation for Data Clustering + + +
+ Low-rank tensor analysis has received widespread attention with many +practical applications. However, the tensor data are often contaminated by +outliers or sample-specific corruptions. How to recover the tensor data that +are corrupted by outliers and perform data clustering remains a challenging +problem. This paper develops an outlier-robust tensor low-rank representation +(OR-TLRR) method for simultaneous outlier detection and tensor data clustering +based on the tensor singular value decomposition (t-SVD) algebraic framework. +It is motivated by the recently proposed tensor-tensor product induced by +invertible linear transforms that satisfy certain conditions. For tensor +observations with arbitrary outlier corruptions, OR-TLRR has provable +performance guarantee for exactly recovering the row space of clean data and +detecting outliers under mild conditions. Moreover, an extension of OR-TLRR is +also proposed to handle the case when parts of the data are missing. Finally, +extensive experimental results on both synthetic and real data demonstrate the +effectiveness of the proposed algorithms. + +
+
+ comment: 12 pages, 1 figure; preprint of a journal paper +
+
+
+
+
+ + ☆ Connections between Operator-splitting Methods and Deep Neural Networks + with Applications in Image Segmentation + + +
+ Deep neural networks are powerful tools for many tasks. Understanding why
+they are so successful and providing a mathematical explanation is an
+important problem and has been a popular research direction in recent years.
+In the literature on the mathematical analysis of deep neural networks, many
+works are dedicated to establishing representation theories. How to make
+connections between deep neural networks and mathematical algorithms is still
+under development. In this paper, we give an algorithmic explanation for deep
+neural networks, especially regarding their connection with operator-splitting
+and multigrid methods. We show that with certain splitting strategies,
+operator-splitting methods have the same structure as networks. Utilizing this
+connection and the Potts model for image segmentation, two networks inspired
+by operator-splitting methods are proposed. The two networks are essentially
+two operator-splitting algorithms solving the Potts model. Numerical
+experiments are presented to demonstrate the effectiveness of the proposed
+networks.
+
+
+
+
+
+ + ☆ R-Cut: Enhancing Explainability in Vision Transformers with Relationship + Weighted Out and Cut + + +
+ Transformer-based models have gained popularity in the field of natural +language processing (NLP) and are extensively utilized in computer vision tasks +and multi-modal models such as GPT4. This paper presents a novel method to +enhance the explainability of Transformer-based image classification models. +Our method aims to improve trust in classification results and empower users to +gain a deeper understanding of the model for downstream tasks by providing +visualizations of class-specific maps. We introduce two modules: the +``Relationship Weighted Out" and the ``Cut" modules. The ``Relationship +Weighted Out" module focuses on extracting class-specific information from +intermediate layers, enabling us to highlight relevant features. Additionally, +the ``Cut" module performs fine-grained feature decomposition, taking into +account factors such as position, texture, and color. By integrating these +modules, we generate dense class-specific visual explainability maps. We +validate our method with extensive qualitative and quantitative experiments on +the ImageNet dataset. Furthermore, we conduct a large number of experiments on +the LRN dataset, specifically designed for automatic driving danger alerts, to +evaluate the explainability of our method in complex backgrounds. The results +demonstrate a significant improvement over previous methods. Moreover, we +conduct ablation experiments to validate the effectiveness of each module. +Through these experiments, we are able to confirm the respective contributions +of each module, thus solidifying the overall effectiveness of our proposed +approach. + +
+
+
+
+
+ + ☆ PottsMGNet: A Mathematical Explanation of Encoder-Decoder Based Neural + Networks + + +
+ For problems in image processing and many other fields, a large class of
+effective neural networks has encoder-decoder-based architectures. Although
+these networks have achieved impressive performance, mathematical explanations
+of their architectures are still underdeveloped. In this paper, we study the
+encoder-decoder-based network architecture from the algorithmic perspective
+and provide a mathematical explanation. We use the two-phase Potts model for
+image segmentation as an example for our explanations. We associate the
+segmentation problem with a control problem in the continuous setting. Then, a
+multigrid method and an operator-splitting scheme, the PottsMGNet, are used to
+discretize the continuous control model. We show that the resulting discrete
+PottsMGNet is equivalent to an encoder-decoder-based network. With minor
+modifications, it is shown that a number of popular encoder-decoder-based
+neural networks are just instances of the proposed PottsMGNet. By
+incorporating Soft-Threshold-Dynamics into the PottsMGNet as a regularizer,
+the PottsMGNet is shown to be robust to network parameters such as width and
+depth and achieves remarkable performance on datasets with very large noise.
+In nearly all our experiments, the new network performs as well as or better
+than existing networks for image segmentation in terms of accuracy and Dice
+score.
+
+
+
+
+
+ + ☆ Online Self-Supervised Thermal Water Segmentation for Aerial Vehicles + + +
+ We present a new method to adapt an RGB-trained water segmentation network to +target-domain aerial thermal imagery using online self-supervision by +leveraging texture and motion cues as supervisory signals. This new thermal +capability enables current autonomous aerial robots operating in near-shore +environments to perform tasks such as visual navigation, bathymetry, and flow +tracking at night. Our method overcomes the problem of scarce and +difficult-to-obtain near-shore thermal data that prevents the application of +conventional supervised and unsupervised methods. In this work, we curate the +first aerial thermal near-shore dataset, show that our approach outperforms +fully-supervised segmentation models trained on limited target-domain thermal +data, and demonstrate real-time capabilities onboard an Nvidia Jetson embedded +computing platform. Code and datasets used in this work will be available at: +https://github.com/connorlee77/uav-thermal-water-segmentation. + +
+
+ comment: 8 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ ActionPrompt: Action-Guided 3D Human Pose Estimation With Text and Pose + Prompting ICME + + +
+ Recent 2D-to-3D human pose estimation (HPE) utilizes temporal consistency
+across sequences to alleviate the depth ambiguity problem but ignores the
+action-related prior knowledge hidden in the pose sequence. In this paper, we
+propose a plug-and-play module named Action Prompt Module (APM) that
+effectively mines different kinds of action clues for 3D HPE. The highlight is
+that the mining scheme of APM can be widely adapted to different frameworks
+and brings consistent benefits. Specifically, we first present a novel
+Action-related Text Prompt module (ATP) that directly embeds action labels and
+transfers the rich language information in the label to the pose sequence.
+Besides, we further introduce an Action-specific Pose Prompt module (APP) to
+mine the position-aware pose pattern of each action, and exploit the
+correlation between the mined patterns and the input pose sequence for further
+pose refinement. Experiments show that APM can improve the performance of most
+video-based 2D-to-3D HPE frameworks by a large margin.
+
+
+ comment: 6 pages, 4 figures, 2023ICME +
+
+
+
+
+ + ☆ LA-Net: Landmark-Aware Learning for Reliable Facial Expression + Recognition under Label Noise ICCV 2023 + + +
+ Facial expression recognition (FER) remains a challenging task due to the +ambiguity of expressions. The derived noisy labels significantly harm the +performance in real-world scenarios. To address this issue, we present a new +FER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks +to mitigate the impact of label noise from two perspectives. Firstly, LA-Net +uses landmark information to suppress the uncertainty in expression space and +constructs the label distribution of each sample by neighborhood aggregation, +which in turn improves the quality of training supervision. Secondly, the model +incorporates landmark information into expression representations using the +devised expression-landmark contrastive loss. The enhanced expression feature +extractor can be less susceptible to label noise. Our method can be integrated +with any deep neural network for better training supervision without +introducing extra inference costs. We conduct extensive experiments on both +in-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net +achieves state-of-the-art performance. + +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Face-PAST: Facial Pose Awareness and Style Transfer Networks + + +
+ Facial style transfer has been quite popular among researchers due to the
+rise of emerging technologies such as eXtended Reality (XR), Metaverse, and
+Non-Fungible Tokens (NFTs). Furthermore, StyleGAN methods along with
+transfer-learning strategies have reduced the problem of limited data to some
+extent. However, most of the StyleGAN methods overfit the styles while adding
+artifacts to facial images. In this paper, we propose a facial pose awareness
+and style transfer (Face-PAST) network that preserves facial details and
+structures while generating high-quality stylized images. Our work is inspired
+by Dual StyleGAN but, in contrast, uses a pre-trained style generation network
+in an external style pass with a residual modulation block instead of a
+transform coding block. Furthermore, we use a gated mapping unit and facial
+structure, identity, and segmentation losses to preserve the facial structure
+and details. This enables us to train the network with a very limited amount
+of data while generating high-quality stylized images. Our training process
+adopts a curriculum learning strategy to perform efficient and flexible style
+mixing in the generative space. We perform extensive experiments to show the
+superiority of Face-PAST in comparison to existing state-of-the-art methods.
+
+
+ comment: 20 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ U-shaped Transformer: Retain High Frequency Context in Time Series + Analysis + + +
+ Time series prediction plays a crucial role in various industrial fields. In
+recent years, neural networks with a transformer backbone have achieved
+remarkable success in many domains, including computer vision and NLP. In the
+time series analysis domain, some studies have suggested that even the
+simplest MLP networks outperform advanced transformer-based networks on time
+series forecasting tasks. However, we believe these findings indicate the
+presence of low-rank properties in time series sequences. In this paper, we
+consider the low-pass characteristics of transformers and try to incorporate
+the advantages of MLPs. We introduce skip-layer connections inspired by U-Net
+into the traditional transformer backbone, thus preserving high-frequency
+context from input to output; we call the resulting model the U-shaped
+Transformer. We introduce patch merge and split operations to extract features
+at different scales and use larger datasets to fully exploit the transformer
+backbone. Our experiments demonstrate that the model performs at an advanced
+level across multiple datasets with relatively low cost.
+
+
+
+
+
+ + ☆ Soft-IntroVAE for Continuous Latent space Image Super-Resolution + + +
+ Continuous image super-resolution (SR) has recently received a lot of
+attention from researchers for its practical and flexible image scaling across
+various displays. Local implicit image representation is one of the methods
+that can map coordinates and 2D features for latent space interpolation.
+Inspired by the Variational AutoEncoder, we propose a Soft-introVAE for
+continuous latent space image super-resolution (SVAE-SR). A novel latent space
+adversarial training scheme is used to achieve photo-realistic image
+restoration. To further improve the quality, a positional encoding scheme is
+used to extend the original pixel coordinates by aggregating frequency
+information over the pixel areas. We show the effectiveness of the proposed
+SVAE-SR through quantitative and qualitative comparisons, and further
+illustrate its generalization to denoising and real-image super-resolution.
+
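+ As a reference point for the positional-encoding idea, the sketch below shows
+ a standard sinusoidal (Fourier) encoding of normalized pixel coordinates; the
+ paper's area-aggregated frequency variant is not reproduced, and the number
+ of frequencies is an illustrative assumption.

```python
import torch

def fourier_positional_encoding(coords: torch.Tensor, num_freqs: int = 6) -> torch.Tensor:
    """Encode normalized 2D coordinates with sin/cos features at multiple frequencies.

    coords: (..., 2) values in [-1, 1]; returns (..., 2 + 4 * num_freqs).
    """
    freqs = 2.0 ** torch.arange(num_freqs, dtype=coords.dtype)   # 1, 2, 4, ...
    angles = coords[..., None] * freqs * torch.pi                # (..., 2, F)
    enc = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
    return torch.cat([coords, enc.flatten(-2)], dim=-1)

xy = torch.rand(4, 2) * 2 - 1
print(fourier_positional_encoding(xy).shape)  # torch.Size([4, 26])
```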
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Frequency-mixed Single-source Domain Generalization for Medical Image + Segmentation + + +
+ The annotation scarcity of medical image segmentation poses challenges in +collecting sufficient training data for deep learning models. Specifically, +models trained on limited data may not generalize well to other unseen data +domains, resulting in a domain shift issue. Consequently, domain generalization +(DG) is developed to boost the performance of segmentation models on unseen +domains. However, the DG setup requires multiple source domains, which impedes +the efficient deployment of segmentation algorithms in clinical scenarios. To +address this challenge and improve the segmentation model's generalizability, +we propose a novel approach called the Frequency-mixed Single-source Domain +Generalization method (FreeSDG). By analyzing the frequency's effect on domain +discrepancy, FreeSDG leverages a mixed frequency spectrum to augment the +single-source domain. Additionally, self-supervision is constructed in the +domain augmentation to learn robust context-aware representations for the +segmentation task. Experimental results on five datasets of three modalities +demonstrate the effectiveness of the proposed algorithm. FreeSDG outperforms +state-of-the-art methods and significantly improves the segmentation model's +generalizability. Therefore, FreeSDG provides a promising solution for +enhancing the generalization of medical image segmentation models, especially +when annotated data is scarce. The code is available at +https://github.com/liamheng/Non-IID_Medical_Image_Segmentation. + +
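+ A minimal illustration of frequency-domain augmentation is given below: the
+ amplitude spectrum of a source image is blended with that of a reference
+ image while the source phase is kept. The blending rule is an assumption for
+ illustration; FreeSDG's actual mixed-frequency strategy and self-supervised
+ objective are richer.

```python
import numpy as np

def mix_frequency(source: np.ndarray, reference: np.ndarray, alpha: float = 0.5) -> np.ndarray:
    """Blend the amplitude spectrum of `source` with that of `reference`, keeping source phase.

    Both inputs are single-channel float images of the same shape.
    """
    fft_src = np.fft.fft2(source)
    fft_ref = np.fft.fft2(reference)
    phase_src = np.angle(fft_src)
    amp_mix = (1 - alpha) * np.abs(fft_src) + alpha * np.abs(fft_ref)
    mixed = np.fft.ifft2(amp_mix * np.exp(1j * phase_src))
    return np.real(mixed)

img_a = np.random.rand(128, 128)
img_b = np.random.rand(128, 128)
aug = mix_frequency(img_a, img_b, alpha=0.3)
print(aug.shape)  # (128, 128)
```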
+
+
+
+
+ + ☆ Ord2Seq: Regard Ordinal Regression as Label Sequence Prediction ICCV2023 + + +
+ Ordinal regression refers to classifying object instances into ordinal
+categories. It has been widely studied in many scenarios, such as medical
+disease grading and movie rating. Existing methods have focused only on
+learning inter-class ordinal relationships, and thus far still have difficulty
+distinguishing adjacent categories. In this paper, we propose a simple
+sequence prediction framework for ordinal regression called Ord2Seq, which,
+for the first time, transforms each ordinal category label into a special
+label sequence and thus regards an ordinal regression task as a sequence
+prediction process. In this way, we decompose an ordinal regression task into
+a series of recursive binary classification steps, so as to subtly distinguish
+adjacent categories. Comprehensive experiments show the effectiveness of
+distinguishing adjacent categories for performance improvement, and our new
+approach exceeds state-of-the-art performance in four different scenarios.
+Codes will be available upon acceptance.
+
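+ The decomposition into recursive binary decisions can be made concrete with a
+ toy encoder/decoder that maps each ordinal label to a sequence of left/right
+ choices in a dichotomy tree; the paper's exact label sequences may be
+ constructed differently (in practice sequences would also be padded to a
+ fixed length).

```python
def ordinal_to_sequence(label: int, num_classes: int) -> list:
    """Encode an ordinal label as a sequence of binary 'go right?' decisions."""
    lo, hi, seq = 0, num_classes - 1, []
    while lo < hi:
        mid = (lo + hi) // 2
        go_right = int(label > mid)
        seq.append(go_right)
        lo, hi = (mid + 1, hi) if go_right else (lo, mid)
    return seq

def sequence_to_ordinal(seq: list, num_classes: int) -> int:
    """Decode a binary decision sequence back into the ordinal label."""
    lo, hi = 0, num_classes - 1
    for go_right in seq:
        mid = (lo + hi) // 2
        lo, hi = (mid + 1, hi) if go_right else (lo, mid)
    return lo

for y in range(5):
    s = ordinal_to_sequence(y, 5)
    assert sequence_to_ordinal(s, 5) == y
    print(y, s)
```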
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ TractCloud: Registration-free tractography parcellation with a novel + local-global streamline point cloud representation MICCAI 2023 + + +
+ Diffusion MRI tractography parcellation classifies streamlines into +anatomical fiber tracts to enable quantification and visualization for clinical +and scientific applications. Current tractography parcellation methods rely +heavily on registration, but registration inaccuracies can affect parcellation +and the computational cost of registration is high for large-scale datasets. +Recently, deep-learning-based methods have been proposed for tractography +parcellation using various types of representations for streamlines. However, +these methods only focus on the information from a single streamline, ignoring +geometric relationships between the streamlines in the brain. We propose +TractCloud, a registration-free framework that performs whole-brain +tractography parcellation directly in individual subject space. We propose a +novel, learnable, local-global streamline representation that leverages +information from neighboring and whole-brain streamlines to describe the local +anatomy and global pose of the brain. We train our framework on a large-scale +labeled tractography dataset, which we augment by applying synthetic transforms +including rotation, scaling, and translations. We test our framework on five +independently acquired datasets across populations and health conditions. +TractCloud significantly outperforms several state-of-the-art methods on all +testing datasets. TractCloud achieves efficient and consistent whole-brain +white matter parcellation across the lifespan (from neonates to elderly +subjects, including brain tumor patients) without the need for registration. +The robustness and high inference speed of TractCloud make it suitable for +large-scale tractography data analysis. Our project page is available at +https://tractcloud.github.io/. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ☆ Towards Authentic Face Restoration with Iterative Diffusion Models and + Beyond ICCV 2023 + + +
+ An authentic face restoration system is increasingly in demand in many
+computer vision applications, e.g., image enhancement, video communication,
+and portrait photography. Most of the advanced face restoration models can
+recover high-quality faces from low-quality ones but usually fail to
+faithfully generate realistic and high-frequency details that are favored by
+users. To achieve authentic restoration, we propose $\textbf{IDM}$, an
+$\textbf{I}$teratively learned face restoration system based on denoising
+$\textbf{D}$iffusion $\textbf{M}$odels (DDMs). We define the criterion of an
+authentic face restoration system, and argue that denoising diffusion models
+are naturally endowed with this property from two aspects: intrinsic iterative
+refinement and extrinsic iterative enhancement. Intrinsic learning can
+preserve the content well and gradually refine the high-quality details, while
+extrinsic enhancement helps clean the data and improve the restoration task
+one step further. We demonstrate superior performance on blind face
+restoration tasks. Beyond restoration, we find that the data authentically
+cleaned by the proposed restoration system is also helpful for image
+generation tasks in terms of training stabilization and sample quality.
+Without modifying the models, we achieve better quality than the
+state-of-the-art on FFHQ and ImageNet generation using either GANs or
+diffusion models.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Revisiting Latent Space of GAN Inversion for Real Image Editing + + +
+ The exploration of the latent space in StyleGANs and GAN inversion exemplify +impressive real-world image editing, yet the trade-off between reconstruction +quality and editing quality remains an open problem. In this study, we revisit +StyleGANs' hyperspherical prior $\mathcal{Z}$ and combine it with highly +capable latent spaces to build combined spaces that faithfully invert real +images while maintaining the quality of edited images. More specifically, we +propose $\mathcal{F}/\mathcal{Z}^{+}$ space consisting of two subspaces: +$\mathcal{F}$ space of an intermediate feature map of StyleGANs enabling +faithful reconstruction and $\mathcal{Z}^{+}$ space of an extended StyleGAN +prior supporting high editing quality. We project the real images into the +proposed space to obtain the inverted codes, by which we then move along +$\mathcal{Z}^{+}$, enabling semantic editing without sacrificing image quality. +Comprehensive experiments show that $\mathcal{Z}^{+}$ can replace the most +commonly-used $\mathcal{W}$, $\mathcal{W}^{+}$, and $\mathcal{S}$ spaces while +preserving reconstruction quality, resulting in reduced distortion of edited +images. + +
+
+ comment: 10 pages, 12 figures. arXiv admin note: substantial text overlap with + arXiv:2306.00241 +
+
+
+
+
+ + ☆ Human Action Recognition in Still Images Using ConViT + + +
+ Understanding the relationship between different parts of the image plays a +crucial role in many visual recognition tasks. Despite the fact that +Convolutional Neural Networks (CNNs) have demonstrated impressive results in +detecting single objects, they lack the capability to extract the relationship +between various regions of an image, which is a crucial factor in human action +recognition. To address this problem, this paper proposes a new module that +functions like a convolutional layer using Vision Transformer (ViT). The +proposed action recognition model comprises two components: the first part is a +deep convolutional network that extracts high-level spatial features from the +image, and the second component of the model utilizes a Vision Transformer that +extracts the relationship between various regions of the image using the +feature map generated by the CNN output. The proposed model has been evaluated +on the Stanford40 and PASCAL VOC 2012 action datasets and has achieved 95.5% +mAP and 91.5% mAP results, respectively, which are promising compared to other +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Arbitrary point cloud upsampling via Dual Back-Projection Network + + +
+ Point clouds acquired from 3D sensors are usually sparse and noisy. Point
+cloud upsampling is an approach to increase the density of the point cloud so
+that detailed geometric information can be restored. In this paper, we propose
+a Dual Back-Projection network for point cloud upsampling (DBPnet). The dual
+back-projection is formulated in an up-down-up manner for point cloud
+upsampling. It back-projects not only feature residues but also coordinate
+residues, so that the network better captures the point correlations in the
+feature and space domains, achieving lower reconstruction errors on both
+uniform and non-uniform sparse point clouds. Our proposed method is also
+generalizable to arbitrary upsampling tasks (e.g., 4x, 5.5x). Experimental
+results show that the proposed method achieves the lowest point set matching
+losses with respect to the benchmark. In addition, the success of our approach
+demonstrates that generative networks are not necessarily needed for
+non-uniform point clouds.
+
+
+ comment: 5 pages, 5 figures +
+
+
+
+
+ + ☆ EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized + Maps + + +
+ Accurate and reliable ego-localization is critical for autonomous driving. In +this paper, we present EgoVM, an end-to-end localization network that achieves +comparable localization accuracy to prior state-of-the-art methods, but uses +lightweight vectorized maps instead of heavy point-based maps. To begin with, +we extract BEV features from online multi-view images and LiDAR point cloud. +Then, we employ a set of learnable semantic embeddings to encode the semantic +types of map elements and supervise them with semantic segmentation, to make +their feature representation consistent with BEV features. After that, we feed +map queries, composed of learnable semantic embeddings and coordinates of map +elements, into a transformer decoder to perform cross-modality matching with +BEV features. Finally, we adopt a robust histogram-based pose solver to +estimate the optimal pose by searching exhaustively over candidate poses. We +comprehensively validate the effectiveness of our method using both the +nuScenes dataset and a newly collected dataset. The experimental results show +that our method achieves centimeter-level localization accuracy, and +outperforms existing methods using vectorized maps by a large margin. +Furthermore, our model has been extensively tested in a large fleet of +autonomous vehicles under various challenging urban scenes. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ EVIL: Evidential Inference Learning for Trustworthy Semi-supervised + Medical Image Segmentation + + +
+ Recently, uncertainty-aware methods have attracted increasing attention in +semi-supervised medical image segmentation. However, current methods usually +suffer from the drawback that it is difficult to balance the computational +cost, estimation accuracy, and theoretical support in a unified framework. To +alleviate this problem, we introduce the Dempster-Shafer Theory of Evidence +(DST) into semi-supervised medical image segmentation, dubbed Evidential +Inference Learning (EVIL). EVIL provides a theoretically guaranteed solution to +infer accurate uncertainty quantification in a single forward pass. Trustworthy +pseudo labels on unlabeled data are generated after uncertainty estimation. The +recently proposed consistency regularization-based training paradigm is adopted +in our framework, which enforces the consistency on the perturbed predictions +to enhance the generalization with few labeled data. Experimental results show +that EVIL achieves competitive performance in comparison with several +state-of-the-art methods on the public dataset. + +
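+ The single-forward-pass uncertainty estimate alluded to above follows the
+ standard evidential (Dirichlet) formulation, sketched below; EVIL's full
+ training objective and pseudo-labeling strategy are not reproduced.

```python
import torch
import torch.nn.functional as F

def evidential_prediction(logits: torch.Tensor):
    """Map per-pixel logits to Dirichlet parameters, probabilities, and uncertainty.

    logits: (B, K, H, W) raw segmentation outputs for K classes.
    """
    evidence = F.softplus(logits)               # non-negative evidence per class
    alpha = evidence + 1.0                      # Dirichlet concentration parameters
    strength = alpha.sum(dim=1, keepdim=True)   # total evidence per pixel
    probs = alpha / strength                    # expected class probabilities
    uncertainty = logits.shape[1] / strength    # high where evidence is scarce
    return probs, uncertainty

probs, unc = evidential_prediction(torch.randn(2, 4, 64, 64))
print(probs.shape, unc.shape)  # torch.Size([2, 4, 64, 64]) torch.Size([2, 1, 64, 64])
```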
+
+
+
+
+ + ☆ In Defense of Clip-based Video Relation Detection + + +
+ Video Visual Relation Detection (VidVRD) aims to detect visual relationship +triplets in videos using spatial bounding boxes and temporal boundaries. +Existing VidVRD methods can be broadly categorized into bottom-up and top-down +paradigms, depending on their approach to classifying relations. Bottom-up +methods follow a clip-based approach where they classify relations of short +clip tubelet pairs and then merge them into long video relations. On the other +hand, top-down methods directly classify long video tubelet pairs. While recent +video-based methods utilizing video tubelets have shown promising results, we +argue that the effective modeling of spatial and temporal context plays a more +significant role than the choice between clip tubelets and video tubelets. This +motivates us to revisit the clip-based paradigm and explore the key success +factors in VidVRD. In this paper, we propose a Hierarchical Context Model (HCM) +that enriches the object-based spatial context and relation-based temporal +context based on clips. We demonstrate that using clip tubelets can achieve +superior performance compared to most video-based methods. Additionally, using +clip tubelets offers more flexibility in model designs and helps alleviate the +limitations associated with video tubelets, such as the challenging long-term +object tracking problem and the loss of temporal information in long-term +tubelet feature compression. Extensive experiments conducted on two challenging +VidVRD benchmarks validate that our HCM achieves a new state-of-the-art +performance, highlighting the effectiveness of incorporating advanced spatial +and temporal context modeling within the clip-based paradigm. + +
+
+
+
+
+ + ☆ Learned Scalable Video Coding For Humans and Machines + + +
+ Video coding has traditionally been developed to support services such as +video streaming, videoconferencing, digital TV, and so on. The main intent was +to enable human viewing of the encoded content. However, with the advances in +deep neural networks (DNNs), encoded video is increasingly being used for +automatic video analytics performed by machines. In applications such as +automatic traffic monitoring, analytics such as vehicle detection, tracking and +counting, would run continuously, while human viewing could be required +occasionally to review potential incidents. To support such applications, a new +paradigm for video coding is needed that will facilitate efficient +representation and compression of video for both machine and human use in a +scalable manner. In this manuscript, we introduce the first end-to-end +learnable video codec that supports a machine vision task in its base layer, +while its enhancement layer supports input reconstruction for human viewing. +The proposed system is constructed based on the concept of conditional coding +to achieve better compression gains. Comprehensive experimental evaluations +conducted on four standard video datasets demonstrate that our framework +outperforms both state-of-the-art learned and conventional video codecs in its +base layer, while maintaining comparable performance on the human vision task +in its enhancement layer. We will provide the implementation of the proposed +system at www.github.com upon completion of the review process. + +
+
+ comment: 14 pages, 16 figures +
+
+
+
+
+ + ☆ Deep Physics-Guided Unrolling Generalization for Compressed Sensing + + +
+ By absorbing the merits of both model- and data-driven methods, the deep
+physics-engaged learning scheme achieves high-accuracy and interpretable image
+reconstruction. It has attracted growing attention and become mainstream for
+inverse imaging tasks. Focusing on the image compressed sensing (CS) problem,
+we identify an intrinsic defect of this emerging paradigm, which is widely
+implemented by deep algorithm-unrolled networks: additional plain iterations
+involving real physics bring enormous computational cost and long inference
+times, hindering practical application. A novel deep
+$\textbf{P}$hysics-guided un$\textbf{R}$olled recovery $\textbf{L}$earning
+($\textbf{PRL}$) framework is proposed by generalizing the traditional
+iterative recovery model from the image domain (ID) to the high-dimensional
+feature domain (FD). A compact multiscale unrolling architecture is then
+developed to enhance the network capacity and keep real-time inference speeds.
+Taking two different perspectives of optimization and range-nullspace
+decomposition, instead of building an algorithm-specific unrolled network, we
+provide two implementations: $\textbf{PRL-PGD}$ and $\textbf{PRL-RND}$.
+Experiments show the significant performance and efficiency advantages of PRL
+networks over other state-of-the-art methods, with large potential for further
+improvement and real application to other inverse imaging problems or
+optimization models.
+
+
+ comment: Accepted by International Journal of Computer Vision (IJCV) 2023 +
+
+
+
+
+ + ☆ Experimental Security Analysis of DNN-based Adaptive Cruise Control + under Context-Aware Perception Attacks + + +
+ Adaptive Cruise Control (ACC) is a widely used driver assistance feature for +maintaining desired speed and safe distance to the leading vehicles. This paper +evaluates the security of the deep neural network (DNN) based ACC systems under +stealthy perception attacks that strategically inject perturbations into camera +data to cause forward collisions. We present a combined +knowledge-and-data-driven approach to design a context-aware strategy for the +selection of the most critical times for triggering the attacks and a novel +optimization-based method for the adaptive generation of image perturbations at +run-time. We evaluate the effectiveness of the proposed attack using an actual +driving dataset and a realistic simulation platform with the control software +from a production ACC system and a physical-world driving simulator while +considering interventions by the driver and safety features such as Automatic +Emergency Braking (AEB) and Forward Collision Warning (FCW). Experimental +results show that the proposed attack achieves 142.9x higher success rate in +causing accidents than random attacks and is mitigated 89.6% less by the safety +features while being stealthy and robust to real-world factors and dynamic +changes in the environment. This study provides insights into the role of human +operators and basic safety interventions in preventing attacks. + +
+
+ comment: 18 pages, 14 figures, 8 tables +
+
+
+
+
+ + ☆ Unsupervised Deep Graph Matching Based on Cycle Consistency ICCV 2023 + + +
+ We contribute to the sparsely populated area of unsupervised deep graph +matching with application to keypoint matching in images. Contrary to the +standard \emph{supervised} approach, our method does not require ground truth +correspondences between keypoint pairs. Instead, it is self-supervised by +enforcing consistency of matchings between images of the same object category. +As the matching and the consistency loss are discrete, their derivatives cannot +be straightforwardly used for learning. We address this issue in a principled +way by building our method upon the recent results on black-box differentiation +of combinatorial solvers. This makes our method exceptionally flexible, as it +is compatible with arbitrary network architectures and combinatorial solvers. +Our experimental evaluation suggests that our technique sets a new +state-of-the-art for unsupervised graph matching. + +
+
+ comment: 12 pages, 5 figures, 3 papers. ICCV 2023 reject +
+
+
+
+
+ + ☆ Learning to Sample Tasks for Meta Learning + + +
+ Through experiments on various meta-learning methods, task samplers, and
+few-shot learning tasks, this paper arrives at three conclusions. Firstly,
+there are no universal task sampling strategies that guarantee the performance
+of meta-learning models. Secondly, task diversity can cause the models to
+either underfit or overfit during training. Lastly, the generalization
+performance of the models is influenced by task divergence, task entropy, and
+task difficulty. In response to these findings, we propose a novel task
+sampler called Adaptive Sampler (ASr). ASr is a plug-and-play task sampler
+that takes task divergence, task entropy, and task difficulty into account to
+sample tasks. To optimize ASr, we rethink and propose a simple and general
+meta-learning algorithm. Finally, a large number of empirical experiments
+demonstrate the effectiveness of the proposed ASr.
+
+
+ comment: 10 pages, 7 tables, 3 figures +
+
+
+
+
+ + ☆ Accuracy versus time frontiers of semi-supervised and self-supervised + learning on medical images + + +
+ For many applications of classifiers to medical images, a trustworthy label +for each image can be difficult or expensive to obtain. In contrast, images +without labels are more readily available. Two major research directions both +promise that additional unlabeled data can improve classifier performance: +self-supervised learning pretrains useful representations on unlabeled data +only, then fine-tunes a classifier on these representations via the labeled +set; semi-supervised learning directly trains a classifier on labeled and +unlabeled data simultaneously. Recent methods from both directions have claimed +significant gains on non-medical tasks, but do not systematically assess +medical images and mostly compare only to methods in the same direction. This +study contributes a carefully-designed benchmark to help answer a +practitioner's key question: given a small labeled dataset and a limited budget +of hours to spend on training, what gains from additional unlabeled images are +possible and which methods best achieve them? Unlike previous benchmarks, ours +uses realistic-sized validation sets to select hyperparameters, assesses +runtime-performance tradeoffs, and bridges two research fields. By comparing 6 +semi-supervised methods and 5 self-supervised methods to strong labeled-only +baselines on 3 medical datasets with 30-1000 labels per class, we offer +insights to resource-constrained, results-focused practitioners: MixMatch, +SimCLR, and BYOL represent strong choices that were not surpassed by more +recent methods. After much effort selecting hyperparameters on one dataset, we +publish settings that enable strong methods to perform well on new medical +tasks within a few hours, with further search over dozens of hours delivering +modest additional gains. + +
+
+ comment: Semi-supervised Learning; Self-supervised Learning; Medical Imaging +
+
+
+
+
+ + ☆ Towards the Sparseness of Projection Head in Self-Supervised Learning + + +
+ In recent years, self-supervised learning (SSL) has emerged as a promising +approach for extracting valuable representations from unlabeled data. One +successful SSL method is contrastive learning, which aims to bring positive +examples closer while pushing negative examples apart. Many current contrastive +learning approaches utilize a parameterized projection head. Through a +combination of empirical analysis and theoretical investigation, we provide +insights into the internal mechanisms of the projection head and its +relationship with the phenomenon of dimensional collapse. Our findings +demonstrate that the projection head enhances the quality of representations by +performing contrastive loss in a projected subspace. Therefore, we propose an +assumption that only a subset of features is necessary when minimizing the +contrastive loss of a mini-batch of data. Theoretical analysis further suggests +that a sparse projection head can enhance generalization, leading us to +introduce SparseHead - a regularization term that effectively constrains the +sparsity of the projection head, and can be seamlessly integrated with any +self-supervised learning (SSL) approaches. Our experimental results validate +the effectiveness of SparseHead, demonstrating its ability to improve the +performance of existing contrastive methods. + +
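+ A simple way to picture the constraint is an L1 penalty on the
+ projection-head weights added to the contrastive objective, as in the hedged
+ sketch below; the paper's SparseHead regularizer may take a different exact
+ form, and the layer sizes here are illustrative.

```python
import torch
import torch.nn as nn

# A typical SimCLR-style projection head (sizes are illustrative assumptions).
projection_head = nn.Sequential(
    nn.Linear(2048, 2048), nn.ReLU(inplace=True), nn.Linear(2048, 128)
)

def sparsity_penalty(head: nn.Module, weight: float = 1e-4) -> torch.Tensor:
    """L1 penalty on the projection-head weight matrices."""
    return weight * sum(p.abs().sum() for p in head.parameters() if p.dim() > 1)

# In an SSL training loop the total objective would look like:
#   loss = contrastive_loss(z_i, z_j) + sparsity_penalty(projection_head)
print(sparsity_penalty(projection_head).item())
```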
+
+ comment: 9 pages,3 figures +
+
+
+
+
+ + ☆ What Can Simple Arithmetic Operations Do for Temporal Modeling? ICCV 2023 + + +
+ Temporal modeling plays a crucial role in understanding video content. To
+tackle this problem, previous studies built complicated temporal relations
+through time sequences, enabled by the development of computationally powerful
+devices. In this work, we explore the potential of four simple arithmetic
+operations for temporal modeling. Specifically, we first capture auxiliary
+temporal cues by computing addition, subtraction, multiplication, and division
+between pairs of extracted frame features. Then, we extract corresponding
+features from these cues to benefit the original temporal-irrespective domain.
+We term this simple pipeline the Arithmetic Temporal Module (ATM), which
+operates on the stem of a visual backbone in a plug-and-play style. We conduct
+comprehensive ablation studies on the instantiation of ATMs and demonstrate
+that this module provides powerful temporal modeling capability at a low
+computational cost. Moreover, the ATM is compatible with both CNN- and
+ViT-based architectures. Our results show that ATM achieves superior
+performance over several popular video benchmarks. Specifically, on
+Something-Something V1, V2 and Kinetics-400, we reach top-1 accuracy of 65.6%,
+74.6%, and 89.4%, respectively. The code is available at
+https://github.com/whwu95/ATM.
+
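+ The four arithmetic cues can be computed in a few lines, as in the sketch
+ below; how ATM injects these cues back into the backbone stem is not shown,
+ and the epsilon-guarded division (assuming post-activation, non-negative
+ features) is an assumption.

```python
import torch

def arithmetic_temporal_cues(feats: torch.Tensor, eps: float = 1e-6) -> dict:
    """Compute the four arithmetic cues between neighbouring frame features.

    feats: (B, T, C) per-frame features.
    """
    cur, nxt = feats[:, :-1], feats[:, 1:]
    return {
        "add": cur + nxt,
        "sub": nxt - cur,
        "mul": cur * nxt,
        "div": nxt / (cur.clamp_min(0.0) + eps),  # guard against division by zero
    }

cues = arithmetic_temporal_cues(torch.relu(torch.randn(2, 8, 256)))
print({name: cue.shape for name, cue in cues.items()})
```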
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Domain Adaptation for Enhanced Object Detection in Foggy and Rainy + Weather for Autonomous Driving + + +
+ Most object detection models for autonomous driving may experience a
+significant drop in performance when deployed in real-world applications due
+to the well-known domain shift issue. Supervised object detection methods for
+autonomous driving usually assume a consistent feature distribution between
+training and testing data; however, such assumptions may not hold when weather
+conditions differ significantly. For example, an object detection model
+trained under clear weather may not perform well in foggy or rainy weather due
+to the domain gap. Overcoming detection bottlenecks in foggy or rainy weather
+scenarios is a significant challenge for autonomous vehicles deployed in the
+wild. To address the domain gap under different weather conditions, this paper
+proposes a novel domain adaptive object detection framework for autonomous
+driving in foggy and rainy weather. Our method leverages both image-level and
+object-level adaptation to reduce the domain discrepancy in image style and
+object appearance. Additionally, to enhance the model's performance on
+challenging samples, we introduce a new adversarial gradient reversal layer
+that performs adversarial mining on hard examples alongside domain adaptation.
+Moreover, we propose to generate an auxiliary domain by data augmentation to
+enforce a new domain-level metric regularization. Experimental results on
+public benchmarks demonstrate that object detection performance is
+significantly improved when using our proposed method in domain shift
+scenarios for autonomous driving applications.
+
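+ The adversarial component builds on a gradient reversal layer, whose classic
+ form is sketched below; the paper's hard-example mining on top of it is not
+ shown.

```python
import torch

class GradientReversal(torch.autograd.Function):
    """Identity in the forward pass; multiplies gradients by -lambda in the backward pass."""
    @staticmethod
    def forward(ctx, x, lambd: float):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Negate (and scale) the gradient flowing back to the feature extractor.
        return -ctx.lambd * grad_output, None

features = torch.randn(4, 16, requires_grad=True)
domain_logits = GradientReversal.apply(features, 1.0).sum()
domain_logits.backward()
print(features.grad[0, :4])  # gradients are negated before reaching the backbone
```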
+
+
+
+
+ + ☆ Object-aware Gaze Target Detection ICCV 2023 + + +
+ Gaze target detection aims to predict the image location where the person is +looking and the probability that a gaze is out of the scene. Several works have +tackled this task by regressing a gaze heatmap centered on the gaze location, +however, they overlooked decoding the relationship between the people and the +gazed objects. This paper proposes a Transformer-based architecture that +automatically detects objects (including heads) in the scene to build +associations between every head and the gazed-head/object, resulting in a +comprehensive, explainable gaze analysis composed of: gaze target area, gaze +pixel point, the class and the image location of the gazed-object. Upon +evaluation of the in-the-wild benchmarks, our method achieves state-of-the-art +results on all metrics (up to 2.91% gain in AUC, 50% reduction in gaze +distance, and 9% gain in out-of-frame average precision) for gaze target +detection and 11-13% improvement in average precision for the classification +and the localization of the gazed-objects. The code of the proposed method is +available https://github.com/francescotonini/object-aware-gaze-target-detection + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Skin Lesion Correspondence Localization in Total Body Photography MICCAI-2023 + + +
+ Longitudinal tracking of skin lesions - finding correspondence, changes in +morphology, and texture - is beneficial to the early detection of melanoma. +However, it has not been well investigated in the context of full-body imaging. +We propose a novel framework combining geometric and texture information to +localize skin lesion correspondence from a source scan to a target scan in +total body photography (TBP). Body landmarks or sparse correspondence are first +created on the source and target 3D textured meshes. Every vertex on each of +the meshes is then mapped to a feature vector characterizing the geodesic +distances to the landmarks on that mesh. Then, for each lesion of interest +(LOI) on the source, its corresponding location on the target is first coarsely +estimated using the geometric information encoded in the feature vectors and +then refined using the texture information. We evaluated the framework +quantitatively on both a public and a private dataset, for which our success +rates (at 10 mm criterion) are comparable to the only reported longitudinal +study. As full-body 3D capture becomes more prevalent and has higher quality, +we expect the proposed method to constitute a valuable step in the longitudinal +tracking of skin lesions. + +
+
+ comment: MICCAI-2023 +
+
+
+
+
+ + ☆ Traffic-Domain Video Question Answering with Automatic Captioning SC2023 + + +
+ Video Question Answering (VidQA) exhibits remarkable potential in +facilitating advanced machine reasoning capabilities within the domains of +Intelligent Traffic Monitoring and Intelligent Transportation Systems. +Nevertheless, the integration of urban traffic scene knowledge into VidQA +systems has received limited attention in previous research endeavors. In this +work, we present a novel approach termed Traffic-domain Video Question +Answering with Automatic Captioning (TRIVIA), which serves as a +weak-supervision technique for infusing traffic-domain knowledge into large +video-language models. Empirical findings obtained from the SUTD-TrafficQA task +highlight the substantial enhancements achieved by TRIVIA, elevating the +accuracy of representative video-language models by a remarkable 6.5 points +(19.88%) compared to baseline settings. This pioneering methodology holds great +promise for driving advancements in the field, inspiring researchers and +practitioners alike to unlock the full potential of emerging video-language +models in traffic-related applications. + +
+
+ comment: Accepted in ITSC2023 +
+
+
+
+
+ + ☆ Conditional 360-degree Image Synthesis for Immersive Indoor Scene + Decoration ICCV2023 + + +
+ In this paper, we address the problem of conditional scene decoration for +360-degree images. Our method takes a 360-degree background photograph of an +indoor scene and generates decorated images of the same scene in the panorama +view. To do this, we develop a 360-aware object layout generator that learns +latent object vectors in the 360-degree view to enable a variety of furniture +arrangements for an input 360-degree background image. We use this object +layout to condition a generative adversarial network to synthesize images of an +input scene. To further reinforce the generation capability of our model, we +develop a simple yet effective scene emptier that removes the generated +furniture and produces an emptied scene for our model to learn a cyclic +constraint. We train the model on the Structure3D dataset and show that our +model can generate diverse decorations with controllable object layout. Our +method achieves state-of-the-art performance on the Structure3D dataset and +generalizes well to the Zillow indoor scene dataset. Our user study confirms +the immersive experiences provided by the realistic image quality and furniture +layout in our generation results. Our implementation will be made available. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Gradient strikes back: How filtering out high frequencies improves + explanations + + +
+ Recent years have witnessed an explosion in the development of novel +prediction-based attribution methods, which have slowly been supplanting older +gradient-based methods to explain the decisions of deep neural networks. +However, it is still not clear why prediction-based methods outperform +gradient-based ones. Here, we start with an empirical observation: these two +approaches yield attribution maps with very different power spectra, with +gradient-based methods revealing more high-frequency content than +prediction-based methods. This observation raises multiple questions: What is +the source of this high-frequency information, and does it truly reflect +decisions made by the system? Lastly, why would the absence of high-frequency +information in prediction-based methods yield better explainability scores +along multiple metrics? We analyze the gradient of three representative visual +classification models and observe that it contains noisy information emanating +from high-frequencies. Furthermore, our analysis reveals that the operations +used in Convolutional Neural Networks (CNNs) for downsampling appear to be a +significant source of this high-frequency content -- suggesting aliasing as a +possible underlying basis. We then apply an optimal low-pass filter for +attribution maps and demonstrate that it improves gradient-based attribution +methods. We show that (i) removing high-frequency noise yields significant +improvements in the explainability scores obtained with gradient-based methods +across multiple models -- leading to (ii) a novel ranking of state-of-the-art +methods with gradient-based methods at the top. We believe that our results +will spur renewed interest in simpler and computationally more efficient +gradient-based methods for explainability. + +
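+ The post-processing step can be pictured as a low-pass filter applied to a
+ gradient attribution map; the sketch below uses a fixed Gaussian filter
+ purely for illustration, whereas the paper derives an optimal filter.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def low_pass_attribution(saliency: np.ndarray, sigma: float = 2.0) -> np.ndarray:
    """Apply a Gaussian low-pass filter to a gradient-based attribution map."""
    return gaussian_filter(saliency, sigma=sigma)

saliency = np.random.randn(224, 224)     # e.g. |d output / d input| per pixel
smoothed = low_pass_attribution(saliency, sigma=3.0)
print(smoothed.shape)  # (224, 224)
```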
+
+
+
+
+ + ☆ Automating Wood Species Detection and Classification in Microscopic + Images of Fibrous Materials with Deep Learning + + +
+ We have developed a methodology for the systematic generation of a large +image dataset of macerated wood references, which we used to generate image +data for nine hardwood genera. This is the basis for a substantial approach to +automate, for the first time, the identification of hardwood species in +microscopic images of fibrous materials by deep learning. Our methodology +includes a flexible pipeline for easy annotation of vessel elements. We compare +the performance of different neural network architectures and hyperparameters. +Our proposed method performs similarly well to human experts. In the future, +this will improve controls on global wood fiber product flows to protect +forests. + +
+
+
+
+
+ + ☆ Surgical Action Triplet Detection by Mixed Supervised Learning of + Instrument-Tissue Interactions MICCAI + + +
+ Surgical action triplets describe instrument-tissue interactions as +(instrument, verb, target) combinations, thereby supporting a detailed analysis +of surgical scene activities and workflow. This work focuses on surgical action +triplet detection, which is challenging but more precise than the traditional +triplet recognition task as it consists of joint (1) localization of surgical +instruments and (2) recognition of the surgical action triplet associated with +every localized instrument. Triplet detection is highly complex due to the lack +of spatial triplet annotation. We analyze how the amount of instrument spatial +annotations affects triplet detection and observe that accurate instrument +localization does not guarantee better triplet detection due to the risk of +erroneous associations with the verbs and targets. To solve the two tasks, we +propose MCIT-IG, a two-stage network, that stands for Multi-Class +Instrument-aware Transformer-Interaction Graph. The MCIT stage of our network +models per class embedding of the targets as additional features to reduce the +risk of misassociating triplets. Furthermore, the IG stage constructs a +bipartite dynamic graph to model the interaction between the instruments and +targets, cast as the verbs. We utilize a mixed-supervised learning strategy +that combines weak target presence labels for MCIT and pseudo triplet labels +for IG to train our network. We observed that complementing minimal instrument +spatial annotations with target embeddings results in better triplet detection. +We evaluate our model on the CholecT50 dataset and show improved performance on +both instrument localization and triplet detection, topping the leaderboard of +the CholecTriplet challenge in MICCAI 2022. + +
+
+ comment: Accepted at MICCAI, 2023. Project Page: + https://github.com/CAMMA-public/mcit-ig +
+
+
+
+
+ + ☆ Can Neural Network Memorization Be Localized? ICML 2023 + + +
+ Recent efforts at explaining the interplay of memorization and
+generalization in deep overparametrized networks have posited that neural
+networks $\textit{memorize}$ "hard" examples in the final few layers of the
+model. Memorization refers to the ability to correctly predict on
+$\textit{atypical}$ examples of the training set. In this work, we show that
+rather than being confined to individual layers, memorization is a
+phenomenon confined to a small set of neurons in various layers of the
+model. First, via three experimental sources of converging evidence, we find
+that most layers are redundant for the memorization of examples and the
+layers that contribute to example memorization are, in general, not the
+final layers. The three sources are $\textit{gradient accounting}$
+(measuring the contribution to the gradient norms from memorized and clean
+examples), $\textit{layer rewinding}$ (replacing specific model weights of a
+converged model with previous training checkpoints), and
+$\textit{retraining}$ (training rewound layers only on clean examples).
+Second, we ask a more generic question: can memorization be localized
+$\textit{anywhere}$ in a model? We discover that memorization is often
+confined to a small number of neurons or channels (around 5) of the model.
+Based on these insights, we propose a new form of dropout --
+$\textit{example-tied dropout}$ that enables us to direct the memorization
+of examples to an a priori determined set of neurons. By dropping out these
+neurons, we are able to reduce the accuracy on memorized examples from
+$100\%\to3\%$, while also reducing the generalization gap.
+
+
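The "example-tied dropout" idea can be pictured with a small sketch: a block of always-on units plus a per-example subset of "memorization" units. The class below is an illustrative assumption about the mechanism, not the authors' implementation; the seeding scheme and unit counts are hypothetical.

```python
import torch
import torch.nn as nn

class ExampleTiedDropout(nn.Module):
    """Keep the first `num_general` units for every example; additionally keep a
    small, example-specific subset of the remaining 'memorization' units."""

    def __init__(self, num_units: int, num_general: int, units_per_example: int = 5):
        super().__init__()
        self.num_units = num_units
        self.num_general = num_general
        self.units_per_example = units_per_example

    def forward(self, x: torch.Tensor, example_ids: torch.Tensor) -> torch.Tensor:
        # x: (batch, num_units) activations; example_ids: (batch,) integer indices
        mask = torch.zeros_like(x)
        mask[:, : self.num_general] = 1.0  # generalization units are always active
        num_mem = self.num_units - self.num_general
        for row, idx in enumerate(example_ids.tolist()):
            gen = torch.Generator().manual_seed(int(idx))  # ties the subset to the example
            keep = torch.randperm(num_mem, generator=gen)[: self.units_per_example]
            mask[row, self.num_general + keep] = 1.0
        return x * mask

# At evaluation time, zeroing the memorization block (units beyond num_general)
# probes how much accuracy on training examples came from memorization.
```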
+
+ comment: Accepted at ICML 2023 +
+
+
+
+
+ + ☆ Adversarial Bayesian Augmentation for Single-Source Domain + Generalization + + +
+ Generalizing to unseen image domains is a challenging problem primarily due
+to the lack of diverse training data, inaccessible target data, and the
+large domain shift that may exist in many real-world settings. As such, data
+augmentation is a critical component of domain generalization methods that
+seek to address this problem. We present Adversarial Bayesian Augmentation
+(ABA), a novel algorithm that learns to generate image augmentations in the
+challenging single-source domain generalization setting. ABA draws on the
+strengths of adversarial learning and Bayesian neural networks to guide the
+generation of diverse data augmentations -- these synthesized image domains
+aid the classifier in generalizing to unseen domains. We demonstrate the
+strength of ABA on several types of domain shift including style shift,
+subpopulation shift, and shift in the medical imaging setting. ABA
+outperforms all previous state-of-the-art methods, including those based on
+pre-specified augmentations as well as pixel-based and convolutional-based
+augmentations.
+
+
+
+
+
+
+ + ♻ ☆ Robustness Analysis of Video-Language Models Against Visual and Language + Perturbations NeurIPS 2022 + + +
+ Joint visual and language modeling on large-scale datasets has recently shown +good progress in multi-modal tasks when compared to single modal learning. +However, robustness of these approaches against real-world perturbations has +not been studied. In this work, we perform the first extensive robustness study +of video-language models against various real-world perturbations. We focus on +text-to-video retrieval and propose two large-scale benchmark datasets, +MSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different +text perturbations. The study reveals some interesting initial findings from +the studied models: 1) models are generally more susceptible when only video is +perturbed as opposed to when only text is perturbed, 2) models that are +pre-trained are more robust than those trained from scratch, 3) models attend +more to scene and objects rather than motion and action. We hope this study +will serve as a benchmark and guide future research in robust video-language +learning. The benchmark introduced in this study along with the code and +datasets is available at https://bit.ly/3CNOly4. + +
+
+ comment: NeurIPS 2022 Datasets and Benchmarks Track. This projects webpage is + located at https://bit.ly/3CNOly4 +
+
+
+
+
+ + ♻ ☆ Synchronous Image-Label Diffusion Probability Model with Application to + Stroke Lesion Segmentation on Non-contrast CT + + +
+ Stroke lesion volume is a key radiologic measurement for assessing the
+prognosis of Acute Ischemic Stroke (AIS) patients, but it is challenging to
+measure automatically on Non-Contrast CT (NCCT) scans. Recent diffusion
+probabilistic models have shown potential for image segmentation. In this
+paper, a novel Synchronous image-label Diffusion Probability Model (SDPM) is
+proposed for stroke lesion segmentation on NCCT using a Markov diffusion
+process. The proposed SDPM is fully based on a Latent Variable Model (LVM),
+offering a complete probabilistic elaboration. An additional net-stream,
+parallel with a noise prediction stream, is introduced to obtain initial
+noisy label estimates for efficiently inferring the final labels. By
+optimizing the specified variational bounds, the trained model can infer
+multiple label estimates for reference given noisy input images. The
+proposed model was assessed on three stroke lesion datasets including one
+public and two private datasets. Compared to several U-net and
+transformer-based segmentation methods, our proposed SDPM model is able to
+achieve state-of-the-art performance. The code is publicly available.
+
+
+
+
+
+
+ + ♻ ☆ Ultra-Fast and Ultra-Low-Power In-Sensor Edge Vision for Gaze Estimation + + +
+ Intelligent edge vision tasks encounter the critical challenge of ensuring
+power and latency efficiency due to the typically heavy computational load
+they impose on edge platforms. This work leverages one of the first "AI in
+sensor" vision platforms, IMX500 by Sony, to achieve ultra-fast and
+ultra-low-power end-to-end edge vision applications. We evaluate the IMX500
+and compare it to other edge platforms, such as the Google Coral Dev Micro
+and Sony Spresense, by exploring gaze estimation as a case study. We propose
+TinyTracker, a highly efficient, fully quantized model for 2D gaze
+estimation designed to maximize the performance of the edge vision systems
+considered in this study. TinyTracker achieves a 41x size reduction (600 Kb)
+compared to iTracker [1] without significant loss in gaze estimation
+accuracy (maximum of 0.16 cm when fully quantized). TinyTracker's deployment
+on the Sony IMX500 vision sensor results in an end-to-end latency of around
+19 ms. The camera takes around 17.9 ms to read, process and transmit the
+pixels to the accelerator. The inference time of the network is 0.86 ms,
+with an additional 0.24 ms for retrieving the results from the sensor. The
+overall energy consumption of the end-to-end system is 4.9 mJ, including
+0.06 mJ for inference. The end-to-end study shows that the IMX500 is 1.7x
+faster than the Coral Dev Micro (19 ms vs. 34.4 ms) and 7x more power
+efficient (4.9 mJ vs. 34.2 mJ).
+
+
+
+
+
+
+ + ♻ ☆ Mitigating Transformer Overconfidence via Lipschitz Regularization UAI 2023 + + +
+ Though Transformers have achieved promising results in many computer vision +tasks, they tend to be over-confident in predictions, as the standard Dot +Product Self-Attention (DPSA) can barely preserve distance for the unbounded +input domain. In this work, we fill this gap by proposing a novel Lipschitz +Regularized Transformer (LRFormer). Specifically, we present a new similarity +function with the distance within Banach Space to ensure the Lipschitzness and +also regularize the term by a contractive Lipschitz Bound. The proposed method +is analyzed with a theoretical guarantee, providing a rigorous basis for its +effectiveness and reliability. Extensive experiments conducted on standard +vision benchmarks demonstrate that our method outperforms the state-of-the-art +single forward pass approaches in prediction, calibration, and uncertainty +estimation. + +
+
+ comment: Accepted by UAI 2023. (https://proceedings.mlr.press/v216/ye23a.html) +
+
+
+
+
+ + ♻ ☆ HopFIR: Hop-wise GraphFormer with Intragroup Joint Refinement for 3D + Human Pose Estimation + + +
+ 2D-to-3D human pose lifting is fundamental for 3D human pose estimation
+(HPE). Graph Convolutional Networks (GCNs) have proven inherently suitable
+for modeling the human skeletal topology. However, current GCN-based 3D HPE
+methods update the node features by aggregating their neighbors' information
+without considering the interaction of joints in different motion patterns.
+Although some studies import limb information to learn the movement
+patterns, the latent synergies among joints, such as maintaining balance
+during motion, are seldom investigated. We propose a hop-wise GraphFormer
+with intragroup joint refinement (HopFIR) to tackle the 3D HPE problem.
+HopFIR mainly consists of a novel Hop-wise GraphFormer (HGF) module and an
+Intragroup Joint Refinement (IJR) module, which leverages prior limb
+information to refine peripheral joints. The HGF module groups the joints by
+$k$-hop neighbors and utilizes a hop-wise transformer-like attention
+mechanism among these groups to discover latent joint synergy. Extensive
+experimental results show that HopFIR outperforms the SOTA methods by a
+large margin (on the Human3.6M dataset, the mean per joint position error
+(MPJPE) is 32.67 mm). Furthermore, it is also demonstrated that previous
+SOTA GCN-based methods can benefit efficiently from the proposed hop-wise
+attention mechanism, with significant performance gains: SemGCN and MGCN are
+improved by 8.9% and 4.5%, respectively.
+
+
+
+ comment: we will re-upload the newest version soon +
+
+
+
+
+ + ♻ ☆ Untargeted Near-collision Attacks in Biometric Recognition + + +
+ A biometric recognition system can operate in two distinct modes:
+identification or verification. In the first mode, the system recognizes an
+individual by searching the enrolled templates of all the users for a match.
+In the second mode, the system validates a user's identity claim by
+comparing the freshly provided template with the enrolled template. The
+biometric transformation schemes usually produce binary templates that are
+better handled by cryptographic schemes, and the comparison is based on a
+distance that leaks information about the similarities between two biometric
+templates. Both the experimentally determined false match rate and false
+non-match rate through recognition threshold adjustment define the
+recognition accuracy, and hence the security of the system. To the best of
+our knowledge, few works provide a formal treatment of the security under
+minimum leakage of information, i.e., the binary outcome of a comparison
+with a threshold. In this paper, we rely on probabilistic modelling to
+quantify the security strength of binary templates. We investigate the
+influence of template size, database size and threshold on the probability
+of having a near-collision. We highlight several untargeted attacks on
+biometric systems considering naive and adaptive adversaries. Interestingly,
+these attacks can be launched both online and offline, and both in the
+identification and verification modes. We discuss the choice of parameters
+through the presented generic attacks.
+
+
+
+ comment: Addition of results and correction of typos +
+
+
+
+
+ + ♻ ☆ Multi-class point cloud completion networks for 3D cardiac anatomy + reconstruction from cine magnetic resonance images + + +
+ Cine magnetic resonance imaging (MRI) is the current gold standard for the +assessment of cardiac anatomy and function. However, it typically only acquires +a set of two-dimensional (2D) slices of the underlying three-dimensional (3D) +anatomy of the heart, thus limiting the understanding and analysis of both +healthy and pathological cardiac morphology and physiology. In this paper, we +propose a novel fully automatic surface reconstruction pipeline capable of +reconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI +acquisitions. Its key component is a multi-class point cloud completion network +(PCCN) capable of correcting both the sparsity and misalignment issues of the +3D reconstruction task in a unified model. We first evaluate the PCCN on a +large synthetic dataset of biventricular anatomies and observe Chamfer +distances between reconstructed and gold standard anatomies below or similar to +the underlying image resolution for multiple levels of slice misalignment. +Furthermore, we find a reduction in reconstruction error compared to a +benchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean +surface distance, respectively. We then apply the PCCN as part of our automated +reconstruction pipeline to 1000 subjects from the UK Biobank study in a +cross-domain transfer setting and demonstrate its ability to reconstruct +accurate and topologically plausible biventricular heart meshes with clinical +metrics comparable to the previous literature. Finally, we investigate the +robustness of our proposed approach and observe its capacity to successfully +handle multiple common outlier conditions. + +
+
+
+
+
+ + ♻ ☆ SLCA: Slow Learner with Classifier Alignment for Continual Learning on a + Pre-trained Model ICCV 2023 + + +
+ The goal of continual learning is to improve the performance of recognition +models in learning sequentially arrived data. Although most existing works are +established on the premise of learning from scratch, growing efforts have been +devoted to incorporating the benefits of pre-training. However, how to +adaptively exploit the pre-trained knowledge for each incremental task while +maintaining its generalizability remains an open question. In this work, we +present an extensive analysis for continual learning on a pre-trained model +(CLPM), and attribute the key challenge to a progressive overfitting problem. +Observing that selectively reducing the learning rate can almost resolve this +issue in the representation layer, we propose a simple but extremely effective +approach named Slow Learner with Classifier Alignment (SLCA), which further +improves the classification layer by modeling the class-wise distributions and +aligning the classification layers in a post-hoc fashion. Across a variety of +scenarios, our proposal provides substantial improvements for CLPM (e.g., up to +49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split +CUB-200 and Split Cars-196, respectively), and thus outperforms +state-of-the-art approaches by a large margin. Based on such a strong baseline, +critical factors and promising directions are analyzed in-depth to facilitate +subsequent research. + +
+
+ comment: 11 pages, 8 figures, accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Meta-Polyp: a baseline for efficient Polyp segmentation + + +
+ In recent years, polyp segmentation has gained significant importance, and
+many methods have been developed using CNN, Vision Transformer, and
+Transformer techniques to achieve competitive results. However, these
+methods often face difficulties when dealing with out-of-distribution
+datasets, missing boundaries, and small polyps. In 2022, Meta-Former was
+introduced as a new baseline for vision, which not only improved the
+performance of multi-task computer vision but also addressed the limitations
+of the Vision Transformer and CNN family backbones. To further enhance
+segmentation, we propose a fusion of Meta-Former with UNet, along with the
+introduction of a Multi-scale Upsampling block with a level-up combination
+in the decoder stage to enhance texture; we also propose a Convformer block,
+based on the idea of Meta-Former, to enhance crucial local feature
+information. These blocks enable the combination of global information, such
+as the overall shape of the polyp, with local and boundary information,
+which is crucial for medical segmentation decisions. Our proposed approach
+achieved competitive performance and obtained state-of-the-art results on
+the CVC-300, Kvasir, and CVC-ColonDB datasets. Apart from Kvasir-SEG, the
+others are out-of-distribution datasets. The implementation can be found at:
+https://github.com/huyquoctrinh/MetaPolyp-CBMS2023.
+
+
+
+
+
+
+ + ♻ ☆ Deep Multiview Clustering by Contrasting Cluster Assignments + + +
+ Multiview clustering (MVC) aims to reveal the underlying structure of +multiview data by categorizing data samples into clusters. Deep learning-based +methods exhibit strong feature learning capabilities on large-scale datasets. +For most existing deep MVC methods, exploring the invariant representations of +multiple views is still an intractable problem. In this paper, we propose a +cross-view contrastive learning (CVCL) method that learns view-invariant +representations and produces clustering results by contrasting the cluster +assignments among multiple views. Specifically, we first employ deep +autoencoders to extract view-dependent features in the pretraining stage. Then, +a cluster-level CVCL strategy is presented to explore consistent semantic label +information among the multiple views in the fine-tuning stage. Thus, the +proposed CVCL method is able to produce more discriminative cluster assignments +by virtue of this learning strategy. Moreover, we provide a theoretical +analysis of soft cluster assignment alignment. Extensive experimental results +obtained on several datasets demonstrate that the proposed CVCL method +outperforms several state-of-the-art approaches. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Universal Domain Adaptation via Compressive Attention Matching + + +
+ Universal domain adaptation (UniDA) aims to transfer knowledge from the +source domain to the target domain without any prior knowledge about the label +set. The challenge lies in how to determine whether the target samples belong +to common categories. The mainstream methods make judgments based on the sample +features, which overemphasizes global information while ignoring the most +crucial local objects in the image, resulting in limited accuracy. To address +this issue, we propose a Universal Attention Matching (UniAM) framework by +exploiting the self-attention mechanism in vision transformer to capture the +crucial object information. The proposed framework introduces a novel +Compressive Attention Matching (CAM) approach to explore the core information +by compressively representing attentions. Furthermore, CAM incorporates a +residual-based measurement to determine the sample commonness. By utilizing the +measurement, UniAM achieves domain-wise and category-wise Common Feature +Alignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first +method utilizing the attention in vision transformer directly to perform +classification tasks. Extensive experiments show that UniAM outperforms the +current state-of-the-art methods on various benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ Bidirectionally Deformable Motion Modulation For Video-based Human Pose + Transfer ICCV 2023 + + +
+ Video-based human pose transfer is a video-to-video generation task that +animates a plain source human image based on a series of target human poses. +Considering the difficulties in transferring highly structural patterns on the +garments and discontinuous poses, existing methods often generate +unsatisfactory results such as distorted textures and flickering artifacts. To +address these issues, we propose a novel Deformable Motion Modulation (DMM) +that utilizes geometric kernel offset with adaptive weight modulation to +simultaneously perform feature alignment and style transfer. Different from +normal style modulation used in style transfer, the proposed modulation +mechanism adaptively reconstructs smoothed frames from style codes according to +the object shape through an irregular receptive field of view. To enhance the +spatio-temporal consistency, we leverage bidirectional propagation to extract +the hidden motion information from a warped image sequence generated by noisy +poses. The proposed feature propagation significantly enhances the motion +prediction ability by forward and backward propagation. Both quantitative and +qualitative experimental results demonstrate superiority over the +state-of-the-arts in terms of image fidelity and visual continuity. The source +code is publicly available at github.com/rocketappslab/bdmm. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Geometric Ultrasound Localization Microscopy MICCAI 2023 + + +
+ Contrast-Enhanced Ultra-Sound (CEUS) has become a viable method for +non-invasive, dynamic visualization in medical diagnostics, yet Ultrasound +Localization Microscopy (ULM) has enabled a revolutionary breakthrough by +offering ten times higher resolution. To date, Delay-And-Sum (DAS) beamformers +are used to render ULM frames, ultimately determining the image resolution +capability. To take full advantage of ULM, this study questions whether +beamforming is the most effective processing step for ULM, suggesting an +alternative approach that relies solely on Time-Difference-of-Arrival (TDoA) +information. To this end, a novel geometric framework for micro bubble +localization via ellipse intersections is proposed to overcome existing +beamforming limitations. We present a benchmark comparison based on a public +dataset for which our geometric ULM outperforms existing baseline methods in +terms of accuracy and robustness while only utilizing a portion of the +available transducer data. + +
+
+ comment: Pre-print accepted for MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Multimodal Distillation for Egocentric Action Recognition ICCV 2023 + + +
+ The focal point of egocentric video understanding is modelling hand-object +interactions. Standard models, e.g. CNNs or Vision Transformers, which receive +RGB frames as input perform well. However, their performance improves further +by employing additional input modalities that provide complementary cues, such +as object detections, optical flow, audio, etc. The added complexity of the +modality-specific modules, on the other hand, makes these models impractical +for deployment. The goal of this work is to retain the performance of such a +multimodal approach, while using only the RGB frames as input at inference +time. We demonstrate that for egocentric action recognition on the +Epic-Kitchens and the Something-Something datasets, students which are taught +by multimodal teachers tend to be more accurate and better calibrated than +architecturally equivalent models trained on ground truth labels in a unimodal +or multimodal fashion. We further adopt a principled multimodal knowledge +distillation framework, allowing us to deal with issues which occur when +applying multimodal knowledge distillation in a naive manner. Lastly, we +demonstrate the achieved reduction in computational complexity, and show that +our approach maintains higher performance with the reduction of the number of +input views. We release our code at +https://github.com/gorjanradevski/multimodal-distillation. + +
+
+ comment: Accepted at ICCV 2023; Codebase released at + https://github.com/gorjanradevski/multimodal-distillation +
+
+
+
+
+ + ♻ ☆ CB-HVTNet: A channel-boosted hybrid vision transformer network for + lymphocyte assessment in histopathological images + + +
+ Transformers, due to their ability to learn long range dependencies, have +overcome the shortcomings of convolutional neural networks (CNNs) for global +perspective learning. Therefore, they have gained the focus of researchers for +several vision related tasks including medical diagnosis. However, their +multi-head attention module only captures global level feature representations, +which is insufficient for medical images. To address this issue, we propose a +Channel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning +to generate boosted channels and employs both transformers and CNNs to analyse +lymphocytes in histopathological images. The proposed CB HVT comprises five +modules, including a channel generation module, channel exploitation module, +channel merging module, region-aware module, and a detection and segmentation +head, which work together to effectively identify lymphocytes. The channel +generation module uses the idea of channel boosting through transfer learning +to extract diverse channels from different auxiliary learners. In the CB HVT, +these boosted channels are first concatenated and ranked using an attention +mechanism in the channel exploitation module. A fusion block is then utilized +in the channel merging module for a gradual and systematic merging of the +diverse boosted channels to improve the network's learning representations. The +CB HVT also employs a proposal network in its region aware module and a head to +effectively identify objects, even in overlapping regions and with artifacts. +We evaluated the proposed CB HVT on two publicly available datasets for +lymphocyte assessment in histopathological images. The results show that CB HVT +outperformed other state of the art detection models, and has good +generalization ability, demonstrating its value as a tool for pathologists. + +
+
+
+
+
+ + ♻ ☆ Generalizable Classification of UHF Partial Discharge Signals in + Gas-Insulated HVDC Systems Using Neural Networks + + +
+ Undetected partial discharges (PDs) are a safety critical issue in high +voltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC +voltage is well-established, the analysis of PDs under DC voltage remains an +active research field. A key focus of these investigations is the +classification of different PD sources to enable subsequent sophisticated +analysis. + In this paper, we propose and analyze a neural network-based approach for +classifying PD signals caused by metallic protrusions and conductive particles +on the insulator of HVDC GIS, without relying on pulse sequence analysis +features. In contrast to previous approaches, our proposed model can +discriminate the studied PD signals obtained at negative and positive +potentials, while also generalizing to unseen operating voltage multiples. +Additionally, we compare the performance of time- and frequency-domain input +signals and explore the impact of different normalization schemes to mitigate +the influence of free-space path loss between the sensor and defect location. + +
+
+ comment: 8 pages, submitted to IEEE Transactions on Power Delivery +
+
+
+
+
+ + ♻ ☆ Detecting Images Generated by Deep Diffusion Models using their Local + Intrinsic Dimensionality + + +
+ Diffusion models have recently been successfully applied to the visual
+synthesis of strikingly realistic-looking images. This raises strong
+concerns about their potential for malicious purposes. In this paper, we
+propose using the lightweight multi Local Intrinsic Dimensionality
+(multiLID), which was originally developed in the context of adversarial
+example detection, for the automatic detection of synthetic images and the
+identification of the corresponding generator networks. In contrast to many
+existing detection approaches, which often only work for GAN-generated
+images, the proposed method provides close to perfect detection results in
+many realistic use cases. Extensive experiments on known and newly created
+datasets demonstrate that the proposed multiLID approach exhibits
+superiority in diffusion detection and model identification. Since the
+empirical evaluations of recent publications on the detection of generated
+images are often mainly focused on the "LSUN-Bedroom" dataset, we further
+establish a comprehensive benchmark for the detection of diffusion-generated
+images, including samples from several diffusion models with different image
+sizes.
+
+
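For orientation, the per-sample quantity underlying such detectors is the maximum-likelihood Local Intrinsic Dimensionality estimate computed from nearest-neighbour distances; the sketch below shows only this basic estimator, not the exact multiLID feature construction used in the paper.

```python
import numpy as np

def lid_mle(knn_distances: np.ndarray) -> float:
    """Levina-Bickel style LID estimate from the sorted, positive distances of a
    sample to its k nearest neighbours in some feature space."""
    r_max = knn_distances[-1]
    return float(-1.0 / np.mean(np.log(knn_distances / r_max)))

# Example (assumed inputs): distances of one image's feature vector to its
# k = 20 nearest neighbours among reference features.
# lid = lid_mle(np.sort(distances)[:20])  # the 20 smallest neighbour distances
```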
+
+
+
+
+ + ♻ ☆ Mitigating Adversarial Vulnerability through Causal Parameter Estimation + by Adversarial Double Machine Learning ICCV 2023 + + +
+ Adversarial examples derived from deliberately crafted perturbations on
+visual inputs can easily harm the decision process of deep neural networks.
+To prevent potential threats, various adversarial training-based defense
+methods have grown rapidly and become a de facto standard approach for
+robustness. Despite recent competitive achievements, we observe that
+adversarial vulnerability varies across targets and certain vulnerabilities
+remain prevalent. Intriguingly, this peculiar phenomenon cannot be
+alleviated even with deeper architectures and advanced defense methods. To
+address this issue, in this paper, we introduce a causal approach called
+Adversarial Double Machine Learning (ADML), which allows us to quantify the
+degree of adversarial vulnerability for network predictions and capture the
+effect of treatments on outcomes of interest. ADML can directly estimate the
+causal parameter of adversarial perturbations and mitigate negative effects
+that can potentially damage robustness, bringing a causal perspective into
+the analysis of adversarial vulnerability. Through extensive experiments on
+various CNN and Transformer architectures, we corroborate that ADML improves
+adversarial robustness by large margins and alleviates the observed
+vulnerability.
+
+
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LLA-FLOW: A Lightweight Local Aggregation on Cost Volume for Optical + Flow Estimation + + +
+ Lack of texture often causes ambiguity in matching, and handling this issue +is an important challenge in optical flow estimation. Some methods insert +stacked transformer modules that allow the network to use global information of +cost volume for estimation. But the global information aggregation often incurs +serious memory and time costs during training and inference, which hinders +model deployment. We draw inspiration from the traditional local region +constraint and design the local similarity aggregation (LSA) and the shifted +local similarity aggregation (SLSA). The aggregation for cost volume is +implemented with lightweight modules that act on the feature maps. Experiments +on the final pass of Sintel show the lower cost required for our approach while +maintaining competitive performance. + +
+
+
+
+
+ + ♻ ☆ CLIP-Guided StyleGAN Inversion for Text-Driven Real Image Editing + + +
+ Researchers have recently begun exploring the use of StyleGAN-based models +for real image editing. One particularly interesting application is using +natural language descriptions to guide the editing process. Existing approaches +for editing images using language either resort to instance-level latent code +optimization or map predefined text prompts to some editing directions in the +latent space. However, these approaches have inherent limitations. The former +is not very efficient, while the latter often struggles to effectively handle +multi-attribute changes. To address these weaknesses, we present CLIPInverter, +a new text-driven image editing approach that is able to efficiently and +reliably perform multi-attribute changes. The core of our method is the use of +novel, lightweight text-conditioned adapter layers integrated into pretrained +GAN-inversion networks. We demonstrate that by conditioning the initial +inversion step on the CLIP embedding of the target description, we are able to +obtain more successful edit directions. Additionally, we use a CLIP-guided +refinement step to make corrections in the resulting residual latent codes, +which further improves the alignment with the text prompt. Our method +outperforms competing approaches in terms of manipulation accuracy and +photo-realism on various domains including human faces, cats, and birds, as +shown by our qualitative and quantitative results. + +
+
+ comment: Accepted for publication in ACM Transactions on Graphics +
+
+
+
+
+ + ♻ ☆ Boosting Zero-shot Learning via Contrastive Optimization of Attribute + Representations + + +
+ Zero-shot learning (ZSL) aims to recognize classes that do not have samples +in the training set. One representative solution is to directly learn an +embedding function associating visual features with corresponding class +semantics for recognizing new classes. Many methods extend upon this solution, +and recent ones are especially keen on extracting rich features from images, +e.g. attribute features. These attribute features are normally extracted within +each individual image; however, the common traits for features across images +yet belonging to the same attribute are not emphasized. In this paper, we +propose a new framework to boost ZSL by explicitly learning attribute +prototypes beyond images and contrastively optimizing them with attribute-level +features within images. Besides the novel architecture, two elements are +highlighted for attribute representations: a new prototype generation module is +designed to generate attribute prototypes from attribute semantics; a hard +example-based contrastive optimization scheme is introduced to reinforce +attribute-level features in the embedding space. We explore two alternative +backbones, CNN-based and transformer-based, to build our framework and conduct +experiments on three standard benchmarks, CUB, SUN, AwA2. Results on these +benchmarks demonstrate that our method improves the state of the art by a +considerable margin. Our codes will be available at +https://github.com/dyabel/CoAR-ZSL.git + +
+
+ comment: Accepted to TNNLS +
+
+
+
+
+ + ♻ ☆ Clustering Method for Time-Series Images Using Quantum-Inspired + Computing Technology + + +
+ Time-series clustering serves as a powerful data mining technique for
+time-series data in the absence of prior knowledge about clusters. Large
+volumes of time-series data have been acquired and used in various research
+fields. Hence, clustering methods with low computational cost are required.
+Given that a quantum-inspired computing technology, such as a simulated
+annealing machine, surpasses conventional computers in solving combinatorial
+optimization problems quickly and accurately, it holds promise for
+accomplishing clustering tasks that are challenging to achieve using
+existing methods. This study proposes a novel time-series clustering method
+that leverages an annealing machine. The proposed method facilitates an even
+classification of time-series data into clusters close to each other while
+maintaining robustness against outliers. Moreover, its applicability extends
+to time-series images. We compared the proposed method with a standard
+existing method for clustering an online distributed dataset. In the
+existing method, the distances between data points are calculated based on
+the Euclidean distance metric, and the clustering is performed using the
+k-means++ method. We found that both methods yielded comparable results.
+Furthermore, the proposed method was applied to a flow measurement image
+dataset containing noticeable noise with a signal-to-noise ratio of
+approximately 1. Despite a small signal variation of approximately 2%, the
+proposed method effectively classified the data without any overlap among
+the clusters. In contrast, the clustering results of the standard existing
+method and the conditional image sampling (CIS) method, a specialized
+technique for flow measurement data, displayed overlapping clusters.
+Consequently, the proposed method provides better results than the other two
+methods, demonstrating its potential as a superior clustering method.
+
+
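As a rough illustration of how such a clustering task can be handed to an annealing machine, the sketch below builds a QUBO with binary variables x[i, c] ("sample i belongs to cluster c"): pairwise distances penalize putting distant samples in the same cluster and a one-hot penalty enforces a single cluster per sample. The objective and the penalty weight are assumptions for illustration, not the formulation used in the paper.

```python
import numpy as np

def clustering_qubo(dist: np.ndarray, n_clusters: int, lam: float = 10.0) -> np.ndarray:
    """Build an upper-triangular QUBO matrix for distance-based clustering."""
    n = dist.shape[0]
    size = n * n_clusters
    Q = np.zeros((size, size))
    idx = lambda i, c: i * n_clusters + c

    # Penalize assigning distant samples to the same cluster.
    for i in range(n):
        for j in range(i + 1, n):
            for c in range(n_clusters):
                Q[idx(i, c), idx(j, c)] += dist[i, j]

    # One-hot constraint per sample: lam * (sum_c x[i, c] - 1)^2, expanded into
    # linear (diagonal) and quadratic (off-diagonal) QUBO coefficients.
    for i in range(n):
        for c in range(n_clusters):
            Q[idx(i, c), idx(i, c)] += -lam
            for c2 in range(c + 1, n_clusters):
                Q[idx(i, c), idx(i, c2)] += 2 * lam
    return Q  # energy of a binary assignment x is x @ Q @ x (up to a constant)
```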
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Iterative Scale-Up ExpansionIoU and Deep Features Association for + Multi-Object Tracking in Sports + + +
+ Multi-object tracking algorithms have made significant advancements due to
+the recent developments in object detection. However, most existing methods
+primarily focus on tracking pedestrians or vehicles, which exhibit
+relatively simple and regular motion patterns. Consequently, there is a
+scarcity of algorithms that address the tracking of targets with irregular
+or non-linear motion, such as multi-athlete tracking. Furthermore, popular
+tracking algorithms often rely on the Kalman filter for object motion
+modeling, which fails to track objects when their motion contradicts the
+linear motion assumption of the Kalman filter. For this reason, we propose a
+novel online and robust multi-object tracking approach, named Iterative
+Scale-Up ExpansionIoU and Deep Features for multi-object tracking. Unlike
+conventional methods, we abandon the use of the Kalman filter and propose
+utilizing the iterative scale-up expansion IoU. This approach achieves
+superior tracking performance without requiring additional training data or
+adopting a more robust detector, all while maintaining a lower computational
+cost compared to other appearance-based methods. Our proposed method
+demonstrates remarkable effectiveness in tracking objects with irregular
+motion, achieving a HOTA score of 76.9%. It outperforms all state-of-the-art
+tracking algorithms on the SportsMOT dataset, which covers various kinds of
+sports scenarios.
+
+
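The core association cue suggested by the abstract can be sketched in a few lines: boxes are expanded around their centres before a standard IoU is computed, so large inter-frame displacements still overlap. The expansion factor and the idea of iterating it over several scales are illustrative assumptions, not the exact values used by the authors.

```python
def expand(box, factor):
    """Scale a (x1, y1, x2, y2) box about its centre by `factor`."""
    x1, y1, x2, y2 = box
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
    w, h = (x2 - x1) * factor, (y2 - y1) * factor
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

def iou(a, b):
    """Standard intersection-over-union of two (x1, y1, x2, y2) boxes."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    union = area(a) + area(b) - inter
    return inter / union if union > 0 else 0.0

def expansion_iou(track_box, det_box, factor=1.5):
    """ExpansionIoU-style similarity: IoU of the two boxes after scaling both up."""
    return iou(expand(track_box, factor), expand(det_box, factor))

# An iterative scale-up association would retry unmatched pairs with a larger
# factor, e.g. for factor in (1.2, 1.5, 1.8): ... (hypothetical schedule).
```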
+
+
+
+
+ + ♻ ☆ Improving Transformer-based Image Matching by Cascaded Capturing + Spatially Informative Keypoints ICCV2023 + + +
+ Learning robust local image feature matching is a fundamental low-level +vision task, which has been widely explored in the past few years. Recently, +detector-free local feature matchers based on transformers have shown promising +results, which largely outperform pure Convolutional Neural Network (CNN) based +ones. But correlations produced by transformer-based methods are spatially +limited to the center of source views' coarse patches, because of the costly +attention learning. In this work, we rethink this issue and find that such +matching formulation degrades pose estimation, especially for low-resolution +images. So we propose a transformer-based cascade matching model -- Cascade +feature Matching TRansformer (CasMTR), to efficiently learn dense feature +correlations, which allows us to choose more reliable matching pairs for the +relative pose estimation. Instead of re-training a new detector, we use a +simple yet effective Non-Maximum Suppression (NMS) post-process to filter +keypoints through the confidence map, and largely improve the matching +precision. CasMTR achieves state-of-the-art performance in indoor and outdoor +pose estimation as well as visual localization. Moreover, thorough ablations +show the efficacy of the proposed components and techniques. + +
+
+ comment: Accepted by ICCV2023, Codes will be released in + https://github.com/ewrfcas/CasMTR +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Affordance Detection in 3D Point Clouds IROS 2023 + + +
+ Affordance detection is a challenging problem with a wide variety of
+robotic applications. Traditional affordance detection methods are limited
+to a predefined set of affordance labels, hence potentially restricting the
+adaptability of intelligent robots in complex and dynamic environments. In
+this paper, we present the Open-Vocabulary Affordance Detection (OpenAD)
+method, which is capable of detecting an unbounded number of affordances in
+3D point clouds. By simultaneously learning the affordance text and the
+point feature, OpenAD successfully exploits the semantic relationships
+between affordances. Therefore, our proposed method enables zero-shot
+detection and is able to detect previously unseen affordances without a
+single annotation example. Intensive experimental results show that OpenAD
+works effectively on a wide range of affordance detection setups and
+outperforms other baselines by a large margin. Additionally, we demonstrate
+the practicality of the proposed OpenAD in real-world robotic applications
+with a fast inference speed (~100 ms). Our project is available at
+https://openad2023.github.io.
+
+
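The open-vocabulary step implied by the abstract (comparing learned point features with text embeddings of arbitrary affordance labels) can be sketched as a cosine-similarity scoring function; encoders and tensor shapes here are placeholders, not the OpenAD architecture.

```python
import torch
import torch.nn.functional as F

def affordance_scores(point_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
    """point_feats: (num_points, d) features from a point cloud encoder.
    text_feats: (num_labels, d) embeddings of affordance label names.
    Returns (num_points, num_labels) cosine-similarity logits, so new labels
    can be queried at test time simply by embedding their names."""
    p = F.normalize(point_feats, dim=-1)
    t = F.normalize(text_feats, dim=-1)
    return p @ t.T
```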
+
+ comment: Accepted to The 2023 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2023) +
+
+
+
+
+ + ♻ ☆ Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology + Report Generation + + +
+ Automatic radiology report generation has attracted enormous research +interest due to its practical value in reducing the workload of radiologists. +However, simultaneously establishing global correspondences between the image +(e.g., Chest X-ray) and its related report and local alignments between image +patches and keywords remains challenging. To this end, we propose an Unify, +Align and then Refine (UAR) approach to learn multi-level cross-modal +alignments and introduce three novel modules: Latent Space Unifier (LSU), +Cross-modal Representation Aligner (CRA) and Text-to-Image Refiner (TIR). +Specifically, LSU unifies multimodal data into discrete tokens, making it +flexible to learn common knowledge among modalities with a shared network. The +modality-agnostic CRA learns discriminative features via a set of orthonormal +basis and a dual-gate mechanism first and then globally aligns visual and +textual representations under a triplet contrastive loss. TIR boosts +token-level local alignment via calibrating text-to-image attention with a +learnable mask. Additionally, we design a two-stage training procedure to make +UAR gradually grasp cross-modal alignments at different levels, which imitates +radiologists' workflow: writing sentence by sentence first and then checking +word by word. Extensive experiments and analyses on IU-Xray and MIMIC-CXR +benchmark datasets demonstrate the superiority of our UAR against varied +state-of-the-art methods. + +
+
+ comment: 1) Reassessment of author contributions. 2) Attempt to resolve the
+ issue that Google Scholar does not display all authors
+
+
+
+
+
+ + ♻ ☆ On the Robustness of Split Learning against Adversarial Attacks ECAI 2023 + + +
+ Split learning enables collaborative deep learning model training while
+preserving data privacy and model security by avoiding direct sharing of raw
+data and model details (i.e., server and clients only hold partial
+sub-networks and exchange intermediate computations). However, existing
+research has mainly focused on examining its reliability for privacy
+protection, with little investigation into model security. Specifically, by
+exploring full models, attackers can launch adversarial attacks, and split
+learning can mitigate this severe threat by only disclosing part of the
+model to untrusted servers. This paper aims to evaluate the robustness of
+split learning against adversarial attacks, particularly in the most
+challenging setting where untrusted servers only have access to the
+intermediate layers of the model. Existing adversarial attacks mostly focus
+on the centralized setting instead of the collaborative setting; thus, to
+better evaluate the robustness of split learning, we develop a tailored
+attack called SPADV, which comprises two stages: 1) shadow model training
+that addresses the issue of lacking part of the model, and 2) a local
+adversarial attack that produces adversarial examples for evaluation. The
+first stage only requires a small amount of unlabeled non-IID data, and, in
+the second stage, SPADV perturbs the intermediate output of natural samples
+to craft the adversarial ones. The overall cost of the proposed attack
+process is relatively low, yet the empirical attack effectiveness is
+significantly high, demonstrating the surprising vulnerability of split
+learning to adversarial attacks.
+
+
+
+ comment: accepted by ECAI 2023, camera-ready version +
+
+
+
+
+ + ♻ ☆ UOD: Universal One-shot Detection of Anatomical Landmarks MICCAI 2023 + + +
+ One-shot medical landmark detection has gained much attention and achieved
+great success for its label-efficient training process. However, existing
+one-shot learning methods are highly specialized in a single domain and
+suffer heavily from domain preference when faced with multi-domain unlabeled
+data. Moreover, one-shot learning is not robust in that it suffers a
+performance drop when a sub-optimal image is annotated. To tackle these
+issues, we resort to developing a domain-adaptive one-shot landmark
+detection framework for handling multi-domain medical images, named
+Universal One-shot Detection (UOD). UOD consists of two stages and two
+corresponding universal models which are designed as combinations of
+domain-specific modules and domain-shared modules. In the first stage, a
+domain-adaptive convolutional model is trained in a self-supervised manner
+to generate pseudo landmark labels. In the second stage, we design a
+domain-adaptive transformer to eliminate domain preference and build the
+global context for multi-domain data. Even though only one annotated sample
+from each domain is available for training, the domain-shared modules help
+UOD aggregate all one-shot samples to detect more robust and accurate
+landmarks. We evaluated the proposed UOD both qualitatively and
+quantitatively on three widely-used public X-ray datasets in different
+anatomical domains (i.e., head, hand, chest) and obtained state-of-the-art
+performance in each domain. The code is available at
+https://github.com/heqin-zhu/UOD_universal_oneshot_detection.
+
+
+
+ comment: Early accepted by MICCAI 2023. 11 pages, 4 figures, 2 tables. arXiv
+ admin note: text overlap with arXiv:2203.06433
+
+
+
+
+
+ + ♻ ☆ Hiding Visual Information via Obfuscating Adversarial Perturbations + + +
+ Growing leakage and misuse of visual information raise security and privacy +concerns, which promotes the development of information protection. Existing +adversarial perturbations-based methods mainly focus on the de-identification +against deep learning models. However, the inherent visual information of the +data has not been well protected. In this work, inspired by the Type-I +adversarial attack, we propose an adversarial visual information hiding method +to protect the visual privacy of data. Specifically, the method generates +obfuscating adversarial perturbations to obscure the visual information of the +data. Meanwhile, it maintains the hidden objectives to be correctly predicted +by models. In addition, our method does not modify the parameters of the +applied model, which makes it flexible for different scenarios. Experimental +results on the recognition and classification tasks demonstrate that the +proposed method can effectively hide visual information and hardly affect the +performances of models. The code is available in the supplementary material. + +
+
+
+
+
+ + ♻ ☆ S2R-ViT for Multi-Agent Cooperative Perception: Bridging the Gap from + Simulation to Reality + + +
+ Due to the lack of real multi-agent data and the time-consuming nature of
+labeling, existing multi-agent cooperative perception algorithms usually
+rely on simulated sensor data for training and validation. However, the
+perception performance is degraded when these simulation-trained models are
+deployed to the real world, due to the significant domain gap between the
+simulated and real data. In this paper, we propose the first
+Simulation-to-Reality transfer learning framework for multi-agent
+cooperative perception using a novel Vision Transformer, named S2R-ViT,
+which considers both the Implementation Gap and Feature Gap between
+simulated and real data. We investigate the effects of these two types of
+domain gaps and propose a novel uncertainty-aware vision transformer to
+effectively relieve the Implementation Gap and an agent-based feature
+adaptation module with inter-agent and ego-agent discriminators to reduce
+the Feature Gap. Our intensive experiments on the public multi-agent
+cooperative perception datasets OPV2V and V2V4Real demonstrate that the
+proposed S2R-ViT can effectively bridge the gap from simulation to reality
+and outperforms other methods significantly for point cloud-based 3D object
+detection.
+
+
+
+ comment: correct the compile error in Fig. 5
+
+
+
+
+
+ + ♻ ☆ FocalUNETR: A Focal Transformer for Boundary-aware Segmentation of CT + Images + + +
+ Computed Tomography (CT) based precise prostate segmentation for treatment +planning is challenging due to (1) the unclear boundary of the prostate derived +from CT's poor soft tissue contrast and (2) the limitation of convolutional +neural network-based models in capturing long-range global context. Here we +propose a novel focal transformer-based image segmentation architecture to +effectively and efficiently extract local visual features and global context +from CT images. Additionally, we design an auxiliary boundary-induced label +regression task coupled with the main prostate segmentation task to address the +unclear boundary issue in CT images. We demonstrate that this design +significantly improves the quality of the CT-based prostate segmentation task +over other competing methods, resulting in substantially improved performance, +i.e., higher Dice Similarity Coefficient, lower Hausdorff Distance, and Average +Symmetric Surface Distance, on both private and public CT image datasets. Our +code is available at this +\href{https://github.com/ChengyinLee/FocalUNETR.git}{link}. + +
+
+ comment: 13 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ ExFaceGAN: Exploring Identity Directions in GAN's Learned Latent Space + for Synthetic Identity Generation + + +
+ Deep generative models have recently presented impressive results in +generating realistic face images of random synthetic identities. + To generate multiple samples of a certain synthetic identity, previous works +proposed to disentangle the latent space of GANs by incorporating additional +supervision or regularization, enabling the manipulation of certain attributes. +Others proposed to disentangle specific factors in unconditional pretrained +GANs latent spaces to control their output, which also requires supervision by +attribute classifiers. Moreover, these attributes are entangled in GAN's latent +space, making it difficult to manipulate them without affecting the identity +information. We propose in this work a framework, ExFaceGAN, to disentangle +identity information in pretrained GANs latent spaces, enabling the generation +of multiple samples of any synthetic identity. Given a reference latent code of +any synthetic image and latent space of pretrained GAN, our ExFaceGAN learns an +identity directional boundary that disentangles the latent space into two +sub-spaces, with latent codes of samples that are either identity similar or +dissimilar to a reference image. By sampling from each side of the boundary, +our ExFaceGAN can generate multiple samples of synthetic identity without the +need for designing a dedicated architecture or supervision from attribute +classifiers. We demonstrate the generalizability and effectiveness of ExFaceGAN +by integrating it into learned latent spaces of three SOTA GAN approaches. As +an example of the practical benefit of our ExFaceGAN, we empirically prove that +data generated by ExFaceGAN can be successfully used to train face recognition +models (\url{https://github.com/fdbtrs/ExFaceGAN}). + +
+
+ comment: Accepted at IJCB 2023 +
+
+
+
+
+ + ♻ ☆ MARVEL: Raster Manga Vectorization via Primitive-wise Deep Reinforcement + Learning + + +
+ Manga is a fashionable Japanese-style comic form that is composed of
+black-and-white strokes and is generally displayed as raster images on
+digital devices. Typical mangas have simple textures, wide lines, and few
+color gradients, which makes them naturally suited to vectorization and the
+merits of vector graphics, e.g., adaptive resolutions and small file sizes.
+In this paper, we propose MARVEL (MAnga's Raster to VEctor Learning), a
+primitive-wise approach for vectorizing raster mangas by Deep Reinforcement
+Learning (DRL). Unlike previous learning-based methods which predict vector
+parameters for an entire image, MARVEL introduces a new perspective that
+regards an entire manga as a collection of basic primitives -- stroke lines
+-- and designs a DRL model to decompose the target image into a primitive
+sequence for achieving accurate vectorization. To improve vectorization
+accuracy and decrease file sizes, we further propose a stroke accuracy
+reward to predict accurate stroke lines, and a pruning mechanism to avoid
+generating erroneous and repeated strokes. Extensive subjective and
+objective experiments show that our MARVEL can generate impressive results
+and reaches the state-of-the-art level. Our code is open-source at:
+https://github.com/SwordHolderSH/Mang2Vec.
+
+
+
+ comment: The name of the previous version paper was: Mang2Vec: Vectorization + of raster manga by deep reinforcement learning +
+
+
+
+
+ + ♻ ☆ CIPER: Combining Invariant and Equivariant Representations Using + Contrastive and Predictive Learning + + +
+ Self-supervised representation learning (SSRL) methods have shown great +success in computer vision. In recent studies, augmentation-based contrastive +learning methods have been proposed for learning representations that are +invariant or equivariant to pre-defined data augmentation operations. However, +invariant or equivariant features favor only specific downstream tasks +depending on the augmentations chosen. They may result in poor performance when +the learned representation does not match task requirements. Here, we consider +an active observer that can manipulate views of an object and has knowledge of +the action(s) that generated each view. We introduce Contrastive Invariant and +Predictive Equivariant Representation learning (CIPER). CIPER comprises both +invariant and equivariant learning objectives using one shared encoder and two +different output heads on top of the encoder. One output head is a projection +head with a state-of-the-art contrastive objective to encourage invariance to +augmentations. The other is a prediction head estimating the augmentation +parameters, capturing equivariant features. Both heads are discarded after +training and only the encoder is used for downstream tasks. We evaluate our +method on static image tasks and time-augmented image datasets. Our results +show that CIPER outperforms a baseline contrastive method on various tasks. +Interestingly, CIPER encourages the formation of hierarchically structured +representations where different views of an object become systematically +organized in the latent representation space. + +
+
+ comment: 12 pages, 4 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Emergent Asymmetry of Precision and Recall for Measuring Fidelity and + Diversity of Generative Models in High Dimensions ICML 2023 + + +
+ Precision and Recall are two prominent metrics of generative performance,
+which were proposed to separately measure the fidelity and diversity of
+generative models. Given their central role in comparing and improving
+generative models, understanding their limitations is crucially important.
+To that end, in this work, we identify a critical flaw in the common
+approximation of these metrics using k-nearest-neighbors, namely, that the
+very interpretations of fidelity and diversity that are assigned to
+Precision and Recall can fail in high dimensions, resulting in very
+misleading conclusions. Specifically, we empirically and theoretically show
+that as the number of dimensions grows, two model distributions with
+supports at equal point-wise distance from the support of the real
+distribution can have vastly different Precision and Recall regardless of
+their respective distributions, hence an emergent asymmetry in high
+dimensions. Based on our theoretical insights, we then provide simple yet
+effective modifications to these metrics to construct symmetric metrics
+regardless of the number of dimensions. Finally, we provide experiments on
+real-world datasets to illustrate that the identified flaw is not merely a
+pathological case, and that our proposed metrics are effective in
+alleviating its impact.
+
+
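For context, the commonly used k-NN approximation that the abstract critiques scores a generated sample as "precise" if it lands inside the k-NN ball of at least one real sample; the sketch below shows this baseline estimator (not the symmetric variant proposed in the paper).

```python
import numpy as np

def knn_radii(real: np.ndarray, k: int = 3) -> np.ndarray:
    """Distance from each real sample to its k-th nearest real neighbour."""
    d = np.linalg.norm(real[:, None, :] - real[None, :, :], axis=-1)
    d.sort(axis=1)
    return d[:, k]  # column 0 is the zero distance to the point itself

def knn_precision(real: np.ndarray, fake: np.ndarray, k: int = 3) -> float:
    """Fraction of generated samples covered by some real sample's k-NN ball."""
    radii = knn_radii(real, k)
    d = np.linalg.norm(fake[:, None, :] - real[None, :, :], axis=-1)
    return float((d <= radii[None, :]).any(axis=1).mean())
```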
+
+ comment: To appear in ICML 2023. Updated proof in Appendix B +
+
+
+
+
+ + ♻ ☆ What You Say Is What You Show: Visual Narration Detection in + Instructional Videos + + +
+ Narrated ''how-to'' videos have emerged as a promising data source for a wide +range of learning problems, from learning visual representations to training +robot policies. However, this data is extremely noisy, as the narrations do not +always describe the actions demonstrated in the video. To address this problem +we introduce the novel task of visual narration detection, which entails +determining whether a narration is visually depicted by the actions in the +video. We propose What You Say is What You Show (WYS^2), a method that +leverages multi-modal cues and pseudo-labeling to learn to detect visual +narrations with only weakly labeled data. Our model successfully detects visual +narrations in in-the-wild videos, outperforming strong baselines, and we +demonstrate its impact for state-of-the-art summarization and temporal +alignment of instructional videos. + +
+
+ comment: Technical Report +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ Deep Neural Aggregation for Recommending Items to Group of Users + + +
+ Modern society devotes a significant amount of time to digital interaction. +Many of our daily actions are carried out through digital means. This has led +to the emergence of numerous Artificial Intelligence tools that assist us in +various aspects of our lives. One key tool for the digital society is +Recommender Systems, intelligent systems that learn from our past actions to +propose new ones that align with our interests. Some of these systems have +specialized in learning from the behavior of user groups to make +recommendations to a group of individuals who want to perform a joint task. In +this article, we analyze the current state of Group Recommender Systems and +propose two new models that use emerging Deep Learning architectures. +Experimental results demonstrate the improvement achieved by employing the +proposed models compared to the state-of-the-art models using four different +datasets. The source code of the models, as well as that of all the experiments +conducted, is available in a public repository. + +
+
+
+
+
+ + ☆ Zero-shot Query Reformulation for Conversational Search ICTIR 2023 + + +
+ As the popularity of voice assistants continues to surge, conversational +search has gained increased attention in Information Retrieval. However, data +sparsity issues in conversational search significantly hinder the progress of +supervised conversational search methods. Consequently, researchers are +focusing more on zero-shot conversational search approaches. Nevertheless, +existing zero-shot methods face three primary limitations: they are not +universally applicable to all retrievers, their effectiveness lacks sufficient +explainability, and they struggle to resolve common conversational ambiguities +caused by omission. To address these limitations, we introduce a novel +Zero-shot Query Reformulation (ZeQR) framework that reformulates queries based +on previous dialogue contexts without requiring supervision from conversational +search data. Specifically, our framework utilizes language models designed for +machine reading comprehension tasks to explicitly resolve two common +ambiguities: coreference and omission, in raw queries. In comparison to +existing zero-shot methods, our approach is universally applicable to any +retriever without additional adaptation or indexing. It also provides greater +explainability and effectively enhances query intent understanding because +ambiguities are explicitly and proactively resolved. Through extensive +experiments on four TREC conversational datasets, we demonstrate the +effectiveness of our method, which consistently outperforms state-of-the-art +baselines. + +
+
+ comment: Accepted by ICTIR 2023 +
+
+
+
+
+ + ☆ ESMC: Entire Space Multi-Task Model for Post-Click Conversion Rate via + Parameter Constraint + + +
+ Large-scale online recommender systems are deployed across the Internet and are in +charge of two basic tasks: Click-Through Rate (CTR) and Post-Click Conversion +Rate (CVR) estimation. However, traditional CVR estimators suffer from +well-known Sample Selection Bias and Data Sparsity issues. Entire space models +were proposed to address the two issues via tracing the decision-making path of +"exposure_click_purchase". Further, some researchers observed that there are +purchase-related behaviors between click and purchase, which can better capture +the user's decision-making intention and improve recommendation +performance. Thus, the decision-making path has been extended to +"exposure_click_in-shop action_purchase" and can be modeled with a conditional +probability approach. Nevertheless, we observe that the chain rule of +conditional probability does not always hold. We report the Probability Space +Confusion (PSC) issue and mathematically derive the difference between the ground truth +and the estimate. We propose a novel Entire Space Multi-Task Model +for Post-Click Conversion Rate via Parameter Constraint (ESMC) and two +alternatives: Entire Space Multi-Task Model with Siamese Network (ESMS) and +Entire Space Multi-Task Model in Global Domain (ESMG) to address the PSC issue. +Specifically, we handle "exposure_click_in-shop action" and "in-shop +action_purchase" separately in light of the characteristics of in-shop actions. +The first path is still treated with conditional probability while the second +one is treated with a parameter constraint strategy. Experiments in both offline +and online environments in a large-scale recommendation system illustrate the +superiority of our proposed methods over state-of-the-art models. The +real-world datasets will be released. + +

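To make the conditional-probability path modelling concrete, here is a generic entire-space multi-task sketch in PyTorch over the path exposure -> click -> in-shop action -> purchase; it illustrates the chain-rule formulation only and does not reproduce ESMC's parameter-constraint strategy. Tower sizes and names are illustrative assumptions.

```python
# Generic entire-space multi-task sketch over a decision path (ESMM-style).
import torch
import torch.nn as nn

class EntireSpaceSketch(nn.Module):
    def __init__(self, in_dim=32, hidden=64):
        super().__init__()
        def tower():
            return nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1), nn.Sigmoid())
        self.ctr = tower()      # P(click | exposure)
        self.action = tower()   # P(in-shop action | click)
        self.cvr = tower()      # P(purchase | in-shop action)

    def forward(self, x):
        p_click = self.ctr(x)
        # chain rule over the path: every probability is defined on the entire exposure space
        p_action = p_click * self.action(x)
        p_purchase = p_action * self.cvr(x)
        return p_click, p_action, p_purchase

model = EntireSpaceSketch()
x = torch.randn(16, 32)
p_click, p_action, p_purchase = model(x)
# each head is supervised with its observed label over all exposed impressions
```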
+
+
+
+
+ + ☆ Jean-Luc Picard at Touché 2023: Comparing Image Generation, Stance + Detection and Feature Matching for Image Retrieval for Arguments + + +
+ Participating in the shared task "Image Retrieval for arguments", we used +different pipelines for image retrieval containing Image Generation, Stance +Detection, Preselection and Feature Matching. We submitted four different runs +with different pipeline layouts and compared them to the given baseline. Our +pipelines perform similarly to the baseline. + +

+
+ comment: 7 pages, 1 figure, 1 table, conference: CLEF +
+
+
+
+
+ + ☆ Modeling Orders of User Behaviors via Differentiable Sorting: A + Multi-task Framework to Predicting User Post-click Conversion SIGIR 2023 + + +
+ User post-click conversion prediction is of high interest to researchers and +developers. Recent studies employ multi-task learning to tackle the selection +bias and data sparsity problem, two severe challenges in post-click behavior +prediction, by incorporating click data. However, prior works mainly focused on +pointwise learning and the orders of labels (i.e., click and post-click) are +not well explored, which naturally poses a listwise learning problem. Inspired +by recent advances on differentiable sorting, in this paper, we propose a novel +multi-task framework that leverages orders of user behaviors to predict user +post-click conversion in an end-to-end approach. Specifically, we define an +aggregation operator to combine predicted outputs of different tasks to a +unified score, then we use the computed scores to model the label relations via +differentiable sorting. Extensive experiments on public and industrial datasets +show the superiority of our proposed model against competitive baselines. + +
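As an illustration of how a differentiable sorting/ranking operator can expose label orders to a listwise loss, here is a minimal pairwise-sigmoid soft-rank sketch in PyTorch; it is a generic construction, not the paper's exact sorting layer, and the temperature tau is an illustrative choice.

```python
# Differentiable "soft rank": rank_i ~ 1 + sum_j sigmoid((s_j - s_i) / tau).
import torch

def soft_rank(scores, tau=0.1):
    diff = scores.unsqueeze(-1) - scores.unsqueeze(-2)           # diff[i, j] = s_i - s_j
    # summing over all j includes the self term sigmoid(0) = 0.5, so subtract it
    return 1.0 + torch.sigmoid(-diff / tau).sum(dim=-1) - 0.5

scores = torch.tensor([0.2, 1.5, 0.7], requires_grad=True)
ranks = soft_rank(scores)      # approximately [3, 1, 2] as tau -> 0 (descending ranks)
ranks.sum().backward()         # gradients flow through the ranking
print(ranks)
```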
+
+ comment: The paper is accepted as a short research paper by SIGIR 2023 +
+
+
+
+
+ + ☆ GraphCL-DTA: a graph contrastive learning with molecular semantics for + drug-target binding affinity prediction + + +
+ Drug-target binding affinity prediction plays an important role in the early +stages of drug discovery, which can infer the strength of interactions between +new drugs and new targets. However, the performance of previous computational +models is limited by the following drawbacks. The learning of drug +representation relies only on supervised data, without taking into account the +information contained in the molecular graph itself. Moreover, most previous +studies tended to design complicated representation learning modules, while +uniformity, which is used to measure representation quality, is ignored. In +this study, we propose GraphCL-DTA, a graph contrastive learning method with molecular +semantics for drug-target binding affinity prediction. In GraphCL-DTA, we +design a graph contrastive learning framework for molecular graphs to learn +drug representations, so that the semantics of molecular graphs are preserved. +Through this graph contrastive framework, a more essential and effective drug +representation can be learned without additional supervised data. Next, we +design a new loss function that can be directly used to smoothly adjust the +uniformity of drug and target representations. By directly optimizing the +uniformity of representations, the representation quality of drugs and targets +can be improved. The effectiveness of the above innovative elements is verified +on two real datasets, KIBA and Davis. The excellent performance of GraphCL-DTA +on the above datasets suggests its superiority over state-of-the-art models. + +

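For reference, the uniformity objective mentioned above is commonly implemented as the log of the average Gaussian potential between pairs of normalized embeddings (Wang & Isola, 2020); the sketch below is a generic PyTorch version, not GraphCL-DTA's exact loss, and the temperature t is illustrative.

```python
# Generic uniformity loss on the unit hypersphere: log E[exp(-t * ||z_i - z_j||^2)].
import torch
import torch.nn.functional as F

def uniformity_loss(z, t=2.0):
    z = F.normalize(z, dim=1)                 # embeddings on the unit hypersphere
    sq_dists = torch.pdist(z, p=2).pow(2)     # pairwise squared distances
    return sq_dists.mul(-t).exp().mean().log()

z = torch.randn(64, 128, requires_grad=True)  # e.g. drug or target embeddings
loss = uniformity_loss(z)
loss.backward()
```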
+
+ comment: 13 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Sharpness-Aware Graph Collaborative Filtering + + +
+ Graph Neural Networks (GNNs) have achieved impressive performance in +collaborative filtering. However, GNNs tend to yield inferior performance when +the distributions of training and test data are not aligned well. Also, +training GNNs requires optimizing non-convex neural networks with an abundance +of local and global minima, which may differ widely in their performance at +test time. Thus, it is essential to choose the minima carefully. Here we +propose an effective training schema, called {gSAM}, under the principle that +\textit{flatter} minima have better generalization ability than +\textit{sharper} ones. To achieve this goal, gSAM regularizes the flatness of +the weight loss landscape by forming a bi-level optimization: the outer problem +conducts the standard model training while the inner problem helps the model +jump out of the sharp minima. Experimental results show the superiority of our +gSAM. + +

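The flat-minima principle is usually operationalized with a sharpness-aware minimization (SAM) style update; the sketch below shows one generic SAM step in PyTorch (ascend to an approximate worst-case point within an L2 ball, then descend using the gradient taken there). It is illustrative only and does not reproduce gSAM's bi-level schedule; the radius rho and the helper name are assumptions.

```python
# One generic sharpness-aware minimization (SAM) step.
import torch

def sam_step(model, loss_fn, data, target, optimizer, rho=0.05):
    optimizer.zero_grad()
    # 1) gradient at the current weights
    loss_fn(model(data), target).backward()
    grads = [p.grad.detach().clone() for p in model.parameters()]
    grad_norm = torch.sqrt(sum(g.pow(2).sum() for g in grads)) + 1e-12
    # 2) climb to the (approximate) worst-case point w + rho * g / ||g||
    with torch.no_grad():
        eps = [rho * g / grad_norm for g in grads]
        for p, e in zip(model.parameters(), eps):
            p.add_(e)
    # 3) gradient at the perturbed point, undo the perturbation, then step
    optimizer.zero_grad()
    loss_fn(model(data), target).backward()
    with torch.no_grad():
        for p, e in zip(model.parameters(), eps):
            p.sub_(e)
    optimizer.step()
    optimizer.zero_grad()
```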
+
+
+
+
+ + ☆ PubMed and Beyond: Recent Advances and Best Practices in Biomedical + Literature Search + + +
+ Biomedical research yields a wealth of information, much of which is only +accessible through the literature. Consequently, literature search is an +essential tool for building on prior knowledge in clinical and biomedical +research. Although recent improvements in artificial intelligence have expanded +functionality beyond keyword-based search, these advances may be unfamiliar to +clinicians and researchers. In response, we present a survey of literature +search tools tailored to both general and specific information needs in +biomedicine, with the objective of helping readers efficiently fulfill their +information needs. We first examine the widely used PubMed search engine, +discussing recent improvements and continued challenges. We then describe +literature search tools catering to five specific information needs: 1. +Identifying high-quality clinical research for evidence-based medicine. 2. +Retrieving gene-related information for precision medicine and genomics. 3. +Searching by meaning, including natural language questions. 4. Locating related +articles with literature recommendation. 5. Mining literature to discover +associations between concepts such as diseases and genetic variants. +Additionally, we cover practical considerations and best practices for choosing +and using these tools. Finally, we provide a perspective on the future of +literature search engines, considering recent breakthroughs in large language +models such as ChatGPT. In summary, our survey provides a comprehensive view of +biomedical literature search functionalities with 36 publicly available tools. + +
+
+ comment: 27 pages, 6 figures, 36 tools +
+
+
+
+
+ + ☆ AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment + enabled by Large Language Models + + +
+ The task of entity alignment between knowledge graphs (KGs) aims to identify +every pair of entities from two different KGs that represent the same entity. +Many machine learning-based methods have been proposed for this task. However, +to the best of our knowledge, existing methods all require manually crafted seed +alignments, which are expensive to obtain. In this paper, we propose the first +fully automatic alignment method named AutoAlign, which does not require any +manually crafted seed alignments. Specifically, for predicate embeddings, +AutoAlign constructs a predicate-proximity-graph with the help of large +language models to automatically capture the similarity between predicates +across two KGs. For entity embeddings, AutoAlign first computes the entity +embeddings of each KG independently using TransE, and then shifts the two KGs' +entity embeddings into the same vector space by computing the similarity +between entities based on their attributes. Thus, both predicate alignment and +entity alignment can be done without manually crafted seed alignments. +AutoAlign is not only fully automatic, but also highly effective. Experiments +using real-world KGs show that AutoAlign improves the performance of entity +alignment significantly compared to state-of-the-art methods. + +

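Since the entity embeddings above are computed independently per KG with TransE, a minimal reminder of that building block may help; the sketch below implements the standard TransE score and margin ranking loss in PyTorch and is not the paper's full alignment pipeline. Entity/relation counts and the margin are illustrative.

```python
# Standard TransE building block: score(h, r, t) = ||h + r - t||, margin ranking loss.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransESketch(nn.Module):
    def __init__(self, n_entities, n_relations, dim=64):
        super().__init__()
        self.ent = nn.Embedding(n_entities, dim)
        self.rel = nn.Embedding(n_relations, dim)

    def score(self, h, r, t):
        # smaller ||h + r - t|| means a more plausible triple
        return (self.ent(h) + self.rel(r) - self.ent(t)).norm(p=2, dim=-1)

model = TransESketch(n_entities=1000, n_relations=50)
h = torch.randint(0, 1000, (32,))
r = torch.randint(0, 50, (32,))
t = torch.randint(0, 1000, (32,))
t_neg = torch.randint(0, 1000, (32,))        # corrupted tails as negatives
loss = F.relu(1.0 + model.score(h, r, t) - model.score(h, r, t_neg)).mean()
loss.backward()
```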
+
+ comment: 14 pages, 5 figures, 4 tables. arXiv admin note: substantial text + overlap with arXiv:2210.08540 +
+
+
+
+
+ + ♻ ☆ Leveraging Recommender Systems to Reduce Content Gaps on Peer Production + Platforms AAAI + + +
+ Peer production platforms like Wikipedia commonly suffer from content gaps. +Prior research suggests recommender systems can help solve this problem, by +guiding editors towards underrepresented topics. However, it remains unclear +whether this approach would result in less relevant recommendations, leading to +reduced overall engagement with recommended items. To answer this question, we +first conducted offline analyses (Study 1) on SuggestBot, a task-routing +recommender system for Wikipedia, then did a three-month controlled experiment +(Study 2). Our results show that presenting users with articles from +underrepresented topics increased the proportion of work done on those articles +without significantly reducing overall recommendation uptake. We discuss the +implications of our results, including how ignoring the article discovery +process can artificially narrow recommendations. We draw parallels between this +phenomenon and the common issue of "filter bubbles" to show how any platform +that employs recommender systems is susceptible to it. + +
+
+ comment: To appear at the 18th International AAAI Conference on Web and Social + Media (ICWSM 2024) +
+
+
+
+
+ + ♻ ☆ Explainable Conversational Question Answering over Heterogeneous Sources + via Iterative Graph Neural Networks SIGIR 2023 + + +
+ In conversational question answering (ConvQA), users express their information needs +through a series of utterances with incomplete context. Typical ConvQA methods +rely on a single source (a knowledge base (KB), or a text corpus, or a set of +tables), thus being unable to benefit from increased answer coverage and +redundancy of multiple sources. Our method EXPLAIGNN overcomes these +limitations by integrating information from a mixture of sources with +user-comprehensible explanations for answers. It constructs a heterogeneous +graph from entities and evidence snippets retrieved from a KB, a text corpus, +web tables, and infoboxes. This large graph is then iteratively reduced via +graph neural networks that incorporate question-level attention, until the best +answers and their explanations are distilled. Experiments show that EXPLAIGNN +improves performance over state-of-the-art baselines. A user study demonstrates +that derived answers are understandable by end users. + +

+
+ comment: Accepted at SIGIR 2023 (extended version) +
+
+
+
+
+ + ♻ ☆ Controllable Multi-Objective Re-ranking with Policy Hypernetworks + + +
+ Multi-stage ranking pipelines have become widely used strategies in modern +recommender systems, where the final stage aims to return a ranked list of +items that balances a number of requirements such as user preference, +diversity, novelty, etc. Linear scalarization is arguably the most widely used +technique to merge multiple requirements into one optimization objective, by +summing up the requirements with certain preference weights. Existing +final-stage ranking methods often adopt a static model where the preference +weights are determined during offline training and kept unchanged during online +serving. Whenever a modification of the preference weights is needed, the model +has to be re-trained, which is inefficient in both time and resources. Meanwhile, the +most appropriate weights may vary greatly for different groups of targeted +users or at different time periods (e.g., during holiday promotions). In this +paper, we propose a framework called controllable multi-objective re-ranking +(CMR) which incorporates a hypernetwork to generate parameters for a re-ranking +model according to different preference weights. In this way, CMR is enabled to +adapt the preference weights according to the environment changes in an online +manner, without retraining the models. Moreover, we classify practical +business-oriented tasks into four main categories and seamlessly incorporate +them into a newly proposed re-ranking model based on an Actor-Evaluator framework, +which serves as a reliable real-world testbed for CMR. Offline experiments +based on the dataset collected from Taobao App showed that CMR improved several +popular re-ranking models by using them as underlying models. Online A/B tests +also demonstrated the effectiveness and trustworthiness of CMR. + +

+
+
+
+
+ + ♻ ☆ Multi-Modal Self-Supervised Learning for Recommendation WWW 2023 + + +
+ The online emergence of multi-modal sharing platforms (e.g., TikTok, YouTube) +is powering personalized recommender systems to incorporate various modalities +(e.g., visual, textual and acoustic) into the latent user representations. While +existing works on multi-modal recommendation exploit multimedia content +features in enhancing item embeddings, their model representation capability is +limited by heavy label reliance and weak robustness on sparse user behavior +data. Inspired by the recent progress of self-supervised learning in +alleviating the label scarcity issue, we explore deriving self-supervision signals +via effective learning of modality-aware user preferences and cross-modal +dependencies. To this end, we propose a new Multi-Modal Self-Supervised +Learning (MMSSL) method which tackles two key challenges. Specifically, to +characterize the inter-dependency between the user-item collaborative view and +item multi-modal semantic view, we design a modality-aware interactive +structure learning paradigm via adversarial perturbations for data +augmentation. In addition, to capture how users' modality-aware +interaction patterns interweave with each other, a cross-modal contrastive +learning approach is introduced to jointly preserve the inter-modal semantic +commonality and user preference diversity. Experiments on real-world datasets +verify the superiority of our method in offering great potential for multimedia +recommendation over various state-of-the-art baselines. The implementation is +released at: https://github.com/HKUDS/MMSSL. + +

+
+ comment: This paper has been published as a full paper at WWW 2023 +
+
+
+
+
+ + ♻ ☆ Unified Off-Policy Learning to Rank: a Reinforcement Learning + Perspective + + +
+ Off-policy Learning to Rank (LTR) aims to optimize a ranker from data +collected by a deployed logging policy. However, existing off-policy learning +to rank methods often make strong assumptions about how users generate the +click data, i.e., the click model, and hence must be tailored +to different click models. In this paper, we unify the +ranking process under general stochastic click models as a Markov Decision +Process (MDP), so that the optimal ranking can be learned directly with offline +reinforcement learning (RL). Building upon this, we leverage offline +RL techniques for off-policy LTR and propose the Click Model-Agnostic Unified +Off-policy Learning to Rank (CUOLR) method, which can be easily applied to a +wide range of click models. Through a dedicated formulation of the MDP, we show +that offline RL algorithms can adapt to various click models without complex +debiasing techniques and prior knowledge of the model. Results on various +large-scale datasets demonstrate that CUOLR consistently outperforms the +state-of-the-art off-policy learning to rank algorithms while maintaining +consistency and robustness under different click models. + +

+
+
+
+
+
+
+
+ + Machine Learning 159 + +
+
+
+ + ☆ Forecasting the steam mass flow in a powerplant using the parallel + hybrid network + + +
+ Efficient and sustainable power generation is a crucial concern in the energy +sector. In particular, thermal power plants grapple with accurately predicting +steam mass flow, which is crucial for operational efficiency and cost +reduction. In this study, we use a parallel hybrid neural network architecture +that combines a parametrized quantum circuit and a conventional feed-forward +neural network specifically designed for time-series prediction in industrial +settings to enhance predictions of steam mass flow 15 minutes into the future. +Our results show that the parallel hybrid model outperforms standalone +classical and quantum models, achieving more than 5.7 and 4.9 times lower mean +squared error (MSE) loss on the test set after training compared to pure +classical and pure quantum networks, respectively. Furthermore, the hybrid +model demonstrates smaller relative errors between the ground truth and the +model predictions on the test set, up to 2 times better than the pure classical +model. These findings contribute to the broader scientific understanding of how +integrating quantum and classical machine learning techniques can be applied to +real-world challenges faced by the energy sector, ultimately leading to +optimized power plant operations. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Overthinking the Truth: Understanding how Language Models Process False + Demonstrations + + +
+ Modern language models can imitate complex patterns through few-shot +learning, enabling them to complete challenging tasks without fine-tuning. +However, imitation can also lead models to reproduce inaccuracies or harmful +content if present in the context. We study harmful imitation through the lens +of a model's internal representations, and identify two related phenomena: +overthinking and false induction heads. The first phenomenon, overthinking, +appears when we decode predictions from intermediate layers, given correct vs. +incorrect few-shot demonstrations. At early layers, both demonstrations induce +similar model behavior, but the behavior diverges sharply at some "critical +layer", after which the accuracy given incorrect demonstrations progressively +decreases. The second phenomenon, false induction heads, are a possible +mechanistic cause of overthinking: these are heads in late layers that attend +to and copy false information from previous demonstrations, and whose ablation +reduces overthinking. Beyond scientific understanding, our results suggest that +studying intermediate model computations could be a promising avenue for +understanding and guarding against harmful model behaviors. + +
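The "decode predictions from intermediate layers" probe can be reproduced with a logit-lens style pass: apply the model's final layer norm and unembedding to every hidden state. The sketch below assumes a GPT-2 checkpoint from Hugging Face transformers (the paper studies other models, and attribute names differ across architectures), so treat it as an illustrative recipe rather than the authors' setup.

```python
# Logit-lens style early decoding: unembed every intermediate hidden state.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

prompt = "The capital of France is"
ids = tok(prompt, return_tensors="pt").input_ids
with torch.no_grad():
    out = model(ids, output_hidden_states=True)

for layer, h in enumerate(out.hidden_states):                  # embeddings + each block
    logits = model.lm_head(model.transformer.ln_f(h[:, -1]))   # decode the last position
    print(layer, tok.decode(logits.argmax(-1)))
```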
+
+
+
+
+ + ☆ A Cryogenic Memristive Neural Decoder for Fault-tolerant Quantum Error + Correction + + +
+ Neural decoders for quantum error correction (QEC) rely on neural networks to +classify syndromes extracted from error correction codes and find appropriate +recovery operators to protect logical information against errors. Despite the +good performance of neural decoders, important practical requirements remain to +be achieved, such as minimizing the decoding time to meet typical rates of +syndrome generation in repeated error correction schemes, and ensuring the +scalability of the decoding approach as the code distance increases. Designing +a dedicated integrated circuit to perform the decoding task in co-integration +with a quantum processor appears necessary to reach these decoding time and +scalability requirements, as routing signals in and out of a cryogenic +environment to be processed externally leads to unnecessary delays and an +eventual wiring bottleneck. In this work, we report the design and performance +analysis of a neural decoder inference accelerator based on an in-memory +computing (IMC) architecture, where crossbar arrays of resistive memory devices +are employed to both store the synaptic weights of the decoder neural network +and perform analog matrix-vector multiplications during inference. In +proof-of-concept numerical experiments supported by experimental measurements, +we investigate the impact of TiO$_\textrm{x}$-based memristive devices' +non-idealities on decoding accuracy. Hardware-aware training methods are +developed to mitigate the loss in accuracy, allowing the memristive neural +decoders to achieve a pseudo-threshold of $9.23\times 10^{-4}$ for the +distance-three surface code, whereas the equivalent digital neural decoder +achieves a pseudo-threshold of $1.01\times 10^{-3}$. This work provides a +pathway to scalable, fast, and low-power cryogenic IMC hardware for integrated +QEC. + +
+
+
+
+
+ + ☆ Does Circuit Analysis Interpretability Scale? Evidence from Multiple + Choice Capabilities in Chinchilla + + +
+ \emph{Circuit analysis} is a promising technique for understanding the +internal mechanisms of language models. However, existing analyses are done in +small models far from the state of the art. To address this, we present a case +study of circuit analysis in the 70B Chinchilla model, aiming to test the +scalability of circuit analysis. In particular, we study multiple-choice +question answering, and investigate Chinchilla's capability to identify the +correct answer \emph{label} given knowledge of the correct answer \emph{text}. +We find that the existing techniques of logit attribution, attention pattern +visualization, and activation patching naturally scale to Chinchilla, allowing +us to identify and categorize a small set of `output nodes' (attention heads +and MLPs). + We further study the `correct letter' category of attention heads aiming to +understand the semantics of their features, with mixed results. For normal +multiple-choice question answers, we significantly compress the query, key and +value subspaces of the head without loss of performance when operating on the +answer labels for multiple-choice questions, and we show that the query and key +subspaces represent an `Nth item in an enumeration' feature to at least some +extent. However, when we attempt to use this explanation to understand the +heads' behaviour on a more general distribution including randomized answer +labels, we find that it is only a partial explanation, suggesting there is more +to learn about the operation of `correct letter' heads on multiple choice +question answering. + +
+
+
+
+
+ + ☆ Smooth Attention for Deep Multiple Instance Learning: Application to CT + Intracranial Hemorrhage Detection + + +
+ Multiple Instance Learning (MIL) has been widely applied to medical imaging +diagnosis, where bag labels are known and instance labels inside bags are +unknown. Traditional MIL assumes that instances in each bag are independent +samples from a given distribution. However, instances are often spatially or +sequentially ordered, and one would expect similar diagnostic importance for +neighboring instances. To address this, in this study, we propose a smooth +attention deep MIL (SA-DMIL) model. Smoothness is achieved by the introduction +of first and second order constraints on the latent function encoding the +attention paid to each instance in a bag. The method is applied to the +detection of intracranial hemorrhage (ICH) on head CT scans. The results show +that this novel SA-DMIL: (a) achieves better performance than the non-smooth +attention MIL at both scan (bag) and slice (instance) levels; (b) learns +spatial dependencies between slices; and (c) outperforms current +state-of-the-art MIL methods on the same ICH test set. + +
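A generic way to read the "smooth attention" idea is attention-based MIL pooling plus a first-order penalty on the attention assigned to neighbouring instances (adjacent CT slices); the PyTorch sketch below shows that combination and is not the authors' exact SA-DMIL formulation. Feature sizes and the penalty weight are illustrative.

```python
# Attention-based MIL pooling with a first-order smoothness penalty over ordered slices.
import torch
import torch.nn as nn

class SmoothAttentionMIL(nn.Module):
    def __init__(self, in_dim=512, attn_dim=128):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(in_dim, attn_dim), nn.Tanh(), nn.Linear(attn_dim, 1))
        self.clf = nn.Linear(in_dim, 1)

    def forward(self, bag):                                      # bag: (n_instances, in_dim), ordered slices
        a = torch.softmax(self.attn(bag).squeeze(-1), dim=0)     # attention weight per instance
        bag_repr = (a.unsqueeze(-1) * bag).sum(dim=0)            # attention-weighted pooling
        logit = self.clf(bag_repr)
        smooth_pen = (a[1:] - a[:-1]).pow(2).sum()               # neighbouring slices should differ little
        return logit, smooth_pen

model = SmoothAttentionMIL()
bag, label = torch.randn(30, 512), torch.tensor([1.0])          # one scan with 30 slice features
logit, pen = model(bag)
loss = nn.functional.binary_cross_entropy_with_logits(logit, label) + 0.1 * pen
loss.backward()
```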
+
+
+
+
+ + ☆ Convergent regularization in inverse problems and linear plug-and-play + denoisers + + +
+ Plug-and-play (PnP) denoising is a popular iterative framework for solving +imaging inverse problems using off-the-shelf image denoisers. Their empirical +success has motivated a line of research that seeks to understand the +convergence of PnP iterates under various assumptions on the denoiser. While a +significant amount of research has gone into establishing the convergence of +the PnP iteration for different regularity conditions on the denoisers, not +much is known about the asymptotic properties of the converged solution as the +noise level in the measurement tends to zero, i.e., whether PnP methods are +provably convergent regularization schemes under reasonable assumptions on the +denoiser. This paper serves two purposes: first, we provide an overview of the +classical regularization theory in inverse problems and survey a few notable +recent data-driven methods that are provably convergent regularization schemes. +We then continue to discuss PnP algorithms and their established convergence +guarantees. Subsequently, we consider PnP algorithms with linear denoisers and +propose a novel spectral filtering technique to control the strength of +regularization arising from the denoiser. Further, by relating the implicit +regularization of the denoiser to an explicit regularization functional, we +rigorously show that PnP with linear denoisers leads to a convergent +regularization scheme. More specifically, we prove that in the limit as the +noise vanishes, the PnP reconstruction converges to the minimizer of a +regularization potential subject to the solution satisfying the noiseless +operator equation. The theoretical analysis is corroborated by numerical +experiments for the classical inverse problem of tomographic image +reconstruction. + +
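For concreteness, one common instance of the PnP schemes discussed above is the plug-and-play forward-backward iteration, written below with an off-the-shelf denoiser $D_\sigma$, forward operator $A$, data $y$, and step size $\eta$; the paper's spectral-filtering construction for linear denoisers is a refinement of this kind of template.

```latex
% Generic PnP forward-backward iteration (illustrative template, not the paper's exact scheme)
x^{k+1} \;=\; D_\sigma\!\left( x^{k} - \eta\, A^{\top}\!\left( A x^{k} - y \right) \right)
```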
+
+
+
+
+ + ☆ Unsupervised Conditional Slot Attention for Object Centric Learning + + +
+ Extracting object-level representations for downstream reasoning tasks is an +emerging area in AI. Learning object-centric representations in an unsupervised +setting presents multiple challenges, a key one being binding an arbitrary +number of object instances to a specialized object slot. Recent object-centric +representation methods like Slot Attention utilize iterative attention to learn +composable representations with dynamic inference level binding but fail to +achieve specialized slot level binding. To address this, in this paper we +propose Unsupervised Conditional Slot Attention using a novel Probabilistic +Slot Dictionary (PSD). We define PSD with (i) abstract object-level property +vectors as keys and (ii) parametric Gaussian distributions as the corresponding +values. We demonstrate the benefits of the learnt specific object-level +conditioning distributions in multiple downstream tasks, namely object +discovery, compositional scene generation, and compositional visual reasoning. +We show that our method provides scene composition capabilities and a +significant boost in few-shot adaptability tasks for compositional visual +reasoning, while performing similarly to or better than slot attention in object +discovery tasks. + +

+
+
+
+
+ + ☆ Scaling Laws for Imitation Learning in NetHack + + +
+ Imitation Learning (IL) is one of the most widely used methods in machine +learning. Yet, while powerful, many works find it is often not able to fully +recover the underlying expert behavior. However, none of these works deeply +investigate the role of scaling up the model and data size. Inspired by recent +work in Natural Language Processing (NLP) where "scaling up" has resulted in +increasingly more capable LLMs, we investigate whether carefully scaling up +model and data size can bring similar improvements in the imitation learning +setting. To demonstrate our findings, we focus on the game of NetHack, a +challenging environment featuring procedural generation, stochasticity, +long-term dependencies, and partial observability. We find IL loss and mean +return scale smoothly with the compute budget and are strongly correlated, +resulting in power laws for training compute-optimal IL agents with respect to +model size and number of samples. We forecast and train several NetHack agents +with IL and find they outperform prior state-of-the-art by at least 2x in all +settings. Our work both demonstrates the scaling behavior of imitation learning +in a challenging domain, as well as the viability of scaling up current +approaches for increasingly capable agents in NetHack, a game that remains +elusively hard for current AI systems. + +
+
+
+
+
+ + ☆ Online Learning with Costly Features in Non-stationary Environments + + +
+ Maximizing long-term rewards is the primary goal in sequential +decision-making problems. The majority of existing methods assume that side +information is freely available, enabling the learning agent to observe all +features' states before making a decision. In real-world problems, however, +collecting beneficial information is often costly. That implies that, besides +individual arms' reward, learning the observations of the features' states is +essential to improve the decision-making strategy. The problem is aggravated in +a non-stationary environment where reward and cost distributions undergo abrupt +changes over time. To address the aforementioned dual learning problem, we +extend the contextual bandit setting and allow the agent to observe subsets of +features' states. The objective is to maximize the long-term average gain, +which is the difference between the accumulated rewards and the paid costs on +average. Therefore, the agent faces a trade-off between minimizing the cost of +information acquisition and possibly improving the decision-making process +using the obtained information. To this end, we develop an algorithm that +guarantees a sublinear regret in time. Numerical results demonstrate the +superiority of our proposed policy in a real-world scenario. + +
+
+ comment: 31 pages, 6 figures +
+
+
+
+
+ + ☆ Batched Predictors Generalize within Distribution + + +
+ We study the generalization properties of batched predictors, i.e., models +tasked with predicting the mean label of a small set (or batch) of examples. +The batched prediction paradigm is particularly relevant for models deployed to +determine the quality of a group of compounds in preparation for offline +testing. By utilizing a suitable generalization of the Rademacher complexity, +we prove that batched predictors come with exponentially stronger +generalization guarantees as compared to the standard per-sample approach. +Surprisingly, the proposed bound holds independently of overparametrization. +Our theoretical insights are validated experimentally for various tasks, +architectures, and applications. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Data Cross-Segmentation for Improved Generalization in Reinforcement + Learning Based Algorithmic Trading + + +
+ The use of machine learning in algorithmic trading systems is increasingly +common. In a typical set-up, supervised learning is used to predict the future +prices of assets, and those predictions drive a simple trading and execution +strategy. This is quite effective when the predictions have sufficient signal, +markets are liquid, and transaction costs are low. However, those conditions +often do not hold in thinly traded financial markets and markets for +differentiated assets such as real estate or vehicles. In these markets, the +trading strategy must consider the long-term effects of taking positions that +are relatively more difficult to change. In this work, we propose a +Reinforcement Learning (RL) algorithm that trades based on signals from a +learned predictive model and addresses these challenges. We test our algorithm +on 20+ years of equity data from Bursa Malaysia. + +
+
+
+
+
+ + ☆ Enhancing Pattern Classification in Support Vector Machines through + Matrix Formulation + + +
+ Support Vector Machines (SVM) have gathered significant acclaim as +classifiers due to their successful implementation of Statistical Learning +Theory. However, in the context of multiclass and multilabel settings, the +reliance on vector-based formulations in existing SVM-based models poses +limitations regarding flexibility and ease of incorporating additional terms to +handle specific challenges. To overcome these limitations, our research paper +focuses on introducing a matrix formulation for SVM that effectively addresses +these constraints. By employing the Accelerated Gradient Descent method in the +dual, we notably enhance the efficiency of solving the Matrix-SVM problem. +Experimental evaluations on multilabel and multiclass datasets demonstrate that +Matrix SVM achieves superior time efficacy while delivering similar results to +Binary Relevance SVM. + Moreover, our matrix formulation unveils crucial insights and advantages that +may not be readily apparent in traditional vector-based notations. We emphasize +that numerous multilabel models can be viewed as extensions of SVM, with +customised modifications to meet specific requirements. The matrix formulation +presented in this paper establishes a solid foundation for developing more +sophisticated models capable of effectively addressing the distinctive +challenges encountered in multilabel learning. + +
+
+
+
+
+ + ☆ Sparse Gaussian Graphical Models with Discrete Optimization: + Computational and Statistical Perspectives + + +
+ We consider the problem of learning a sparse graph underlying an undirected +Gaussian graphical model, a key problem in statistical machine learning. Given +$n$ samples from a multivariate Gaussian distribution with $p$ variables, the +goal is to estimate the $p \times p$ inverse covariance matrix (aka precision +matrix), assuming it is sparse (i.e., has a few nonzero entries). We propose +GraphL0BnB, a new estimator based on an $\ell_0$-penalized version of the +pseudolikelihood function, while most earlier approaches are based on the +$\ell_1$-relaxation. Our estimator can be formulated as a convex mixed integer +program (MIP) which can be difficult to compute at scale using off-the-shelf +commercial solvers. To solve the MIP, we propose a custom nonlinear +branch-and-bound (BnB) framework that solves node relaxations with tailored +first-order methods. As a by-product of our BnB framework, we propose +large-scale solvers for obtaining good primal solutions that are of independent +interest. We derive novel statistical guarantees (estimation and variable +selection) for our estimator and discuss how our approach improves upon +existing estimators. Our numerical experiments on real/synthetic datasets +suggest that our method can solve, to near-optimality, problem instances with +$p = 10^4$ -- corresponding to a symmetric matrix of size $p \times p$ with +$p^2/2$ binary variables. We demonstrate the usefulness of GraphL0BnB versus +various state-of-the-art approaches on a range of datasets. + +
+
+
+
+
+ + ☆ An Evaluation of Zero-Cost Proxies -- from Neural Architecture + Performance to Model Robustness + + +
+ Zero-cost proxies are nowadays frequently studied and used to search for +neural architectures. They show an impressive ability to predict the +performance of architectures by making use of their untrained weights. These +techniques allow for immense search speed-ups. So far, the joint search for +well-performing and robust architectures has received much less attention in +the field of NAS. Therefore, the main focus of zero-cost proxies is the clean +accuracy of architectures, whereas model robustness should play an equally +important part. In this paper, we analyze the ability of common zero-cost +proxies to serve as performance predictors for robustness in the popular +NAS-Bench-201 search space. We are interested in the single prediction task for +robustness and the joint multi-objective of clean and robust accuracy. We +further analyze the feature importance of the proxies and show that predicting +the robustness makes the prediction task from existing zero-cost proxies more +challenging. As a result, the joint consideration of several proxies becomes +necessary to predict a model's robustness while the clean accuracy can be +regressed from a single such feature. + +

+
+ comment: Accepted at DAGM GCPR 2023 +
+
+
+
+
+ + ☆ MOCA: Self-supervised Representation Learning by Predicting Masked + Online Codebook Assignments + + +
+ Self-supervised learning can be used for mitigating the greedy needs of +Vision Transformer networks for very large fully-annotated datasets. Different +classes of self-supervised learning offer representations with either good +contextual reasoning properties, e.g., using masked image modeling strategies, +or invariance to image perturbations, e.g., with contrastive methods. In this +work, we propose a single-stage and standalone method, MOCA, which unifies both +desired properties using novel mask-and-predict objectives defined with +high-level features (instead of pixel-level details). Moreover, we show how to +effectively employ both learning paradigms in a synergistic and +computation-efficient way. Doing so, we achieve new state-of-the-art results on +low-shot settings and strong experimental results in various evaluation +protocols with a training that is at least 3 times faster than prior methods. + +
+
+
+
+
+ + ☆ Using the IBM Analog In-Memory Hardware Acceleration Kit for Neural + Network Training and Inference + + +
+ Analog In-Memory Computing (AIMC) is a promising approach to reduce the +latency and energy consumption of Deep Neural Network (DNN) inference and +training. However, the noisy and non-linear device characteristics, and the +non-ideal peripheral circuitry in AIMC chips, require adapting DNNs to be +deployed on such hardware to achieve equivalent accuracy to digital computing. +In this tutorial, we provide a deep dive into how such adaptations can be +achieved and evaluated using the recently released IBM Analog Hardware +Acceleration Kit (AIHWKit), freely available at https://github.com/IBM/aihwkit. +The AIHWKit is a Python library that simulates inference and training of DNNs +using AIMC. We present an in-depth description of the AIHWKit design, +functionality, and best practices to properly perform inference and training. +We also present an overview of the Analog AI Cloud Composer, that provides the +benefits of using the AIHWKit simulation platform in a fully managed cloud +setting. Finally, we show examples on how users can expand and customize +AIHWKit for their own needs. This tutorial is accompanied by comprehensive +Jupyter Notebook code examples that can be run using AIHWKit, which can be +downloaded from https://github.com/IBM/aihwkit/tree/master/notebooks/tutorial. + +
+
+
+
+
+ + ☆ Learning to Select SAT Encodings for Pseudo-Boolean and Linear Integer + Constraints + + +
+ Many constraint satisfaction and optimisation problems can be solved +effectively by encoding them as instances of the Boolean Satisfiability problem +(SAT). However, even the simplest types of constraints have many encodings in +the literature with widely varying performance, and the problem of selecting +suitable encodings for a given problem instance is not trivial. We explore the +problem of selecting encodings for pseudo-Boolean and linear constraints using +a supervised machine learning approach. We show that it is possible to select +encodings effectively using a standard set of features for constraint problems; +however we obtain better performance with a new set of features specifically +designed for the pseudo-Boolean and linear constraints. In fact, we achieve +good results when selecting encodings for unseen problem classes. Our results +compare favourably to AutoFolio when using the same feature set. We discuss the +relative importance of instance features to the task of selecting the best +encodings, and compare several variations of the machine learning method. + +
+
+ comment: 24 pages, 10 figures, submitted to Constraints Journal (Springer) +
+
+
+
+
+ + ☆ Exploiting Field Dependencies for Learning on Categorical Data + + +
+ Traditional approaches for learning on categorical data underexploit the +dependencies between columns (a.k.a. fields) in a dataset because they rely on +the embedding of data points driven solely by the classification/regression +loss. In contrast, we propose a novel method for learning on categorical data +with the goal of exploiting dependencies between fields. Instead of modelling +statistics of features globally (i.e., by the covariance matrix of features), +we learn a global field dependency matrix that captures dependencies between +fields and then we refine the global field dependency matrix at the +instance-wise level with different weights (so-called local dependency +modelling) w.r.t. each field to improve the modelling of the field +dependencies. Our algorithm exploits the meta-learning paradigm, i.e., the +dependency matrices are refined in the inner loop of the meta-learning +algorithm without the use of labels, whereas the outer loop intertwines the +updates of the embedding matrix (the matrix performing projection) and global +dependency matrix in a supervised fashion (with the use of labels). Our method +is simple, yet it outperforms several state-of-the-art methods on six popular +dataset benchmarks. Detailed ablation studies provide additional insights into +our method. + +

+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (submitted June 2022, accepted July 2023) +
+
+
+
+
+ + ☆ Biomaker CA: a Biome Maker project using Cellular Automata + + +
+ We introduce Biomaker CA: a Biome Maker project using Cellular Automata (CA). +In Biomaker CA, morphogenesis is a first class citizen and small seeds need to +grow into plant-like organisms to survive in a nutrient starved environment and +eventually reproduce with variation so that a biome survives for long +timelines. We simulate complex biomes by means of CA rules in 2D grids and +parallelize all of its computation on GPUs through the Python JAX framework. We +show how this project allows for several different kinds of environments and +laws of 'physics', alongside different model architectures and mutation +strategies. We further analyze some configurations to show how plant agents can +grow, survive, reproduce, and evolve, forming stable and unstable biomes. We +then demonstrate how one can meta-evolve models to survive in a harsh +environment either through end-to-end meta-evolution or by a more surgical and +efficient approach, called Petri dish meta-evolution. Finally, we show how to +perform interactive evolution, where the user decides how to evolve a plant +model interactively and then deploys it in a larger environment. We open source +Biomaker CA at: https://tinyurl.com/2x8yu34s . + +
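To illustrate the kind of GPU-parallel grid computation such a CA simulation builds on, here is a toy vectorized 2D cellular-automaton step in JAX (a Game-of-Life rule, not Biomaker CA's plant/agent rules); grid size and seeding are arbitrary.

```python
# Toy vectorized 2D cellular automaton step in JAX (Game-of-Life rule).
import jax
import jax.numpy as jnp

def step(grid):
    # sum of the 8 neighbours via shifted copies of the grid
    neighbours = sum(jnp.roll(grid, (di, dj), axis=(0, 1))
                     for di in (-1, 0, 1) for dj in (-1, 0, 1)
                     if (di, dj) != (0, 0))
    born = (grid == 0) & (neighbours == 3)
    survive = (grid == 1) & ((neighbours == 2) | (neighbours == 3))
    return (born | survive).astype(grid.dtype)

step = jax.jit(step)                                   # compile for GPU/TPU execution
key = jax.random.PRNGKey(0)
grid = (jax.random.uniform(key, (64, 64)) < 0.3).astype(jnp.int32)
for _ in range(10):
    grid = step(grid)
print(int(grid.sum()))
```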
+
+ comment: 20 pages, 23 figures. For code base, see https://tinyurl.com/2x8yu34s +
+
+
+
+
+ + ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks. In contrast to traditional text-only methods, our approach to +labelling a comment as hate speech centers around the holistic analysis of text +and images. This is done by leveraging graph transformers to capture the +contextual relationships in the entire discussion that surrounds a comment, +with interwoven fusion layers to combine text and image embeddings instead of +processing different modalities separately. We compare the performance of our +model to baselines that only process text; we also conduct extensive ablation +studies. We conclude with future work for multimodal solutions to deliver +social value in online contexts, arguing that capturing a holistic view of a +conversation greatly advances the effort to detect anti-social behavior. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ☆ Automatic Differentiation for Inverse Problems with Applications in + Quantum Transport + + +
+ A neural solver and differentiable simulation of the quantum transmitting +boundary model is presented for the inverse quantum transport problem. The +neural solver is used to engineer continuous transmission properties and the +differentiable simulation is used to engineer current-voltage characteristics. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory + Forecasting ICCV 2023 + + +
+ Capturing high-dimensional social interactions and feasible futures is +essential for predicting trajectories. To address this complex nature, several +attempts have been devoted to reducing the dimensionality of the output +variables via parametric curve fitting such as the B\'ezier curve and B-spline +function. However, these functions, which originate in computer graphics +fields, are not suitable to account for socially acceptable human dynamics. In +this paper, we present EigenTrajectory ($\mathbb{ET}$), a trajectory prediction +approach that uses a novel trajectory descriptor to form a compact space, known +here as $\mathbb{ET}$ space, in place of Euclidean space, for representing +pedestrian movements. We first reduce the complexity of the trajectory +descriptor via a low-rank approximation. We transform the pedestrians' history +paths into our $\mathbb{ET}$ space represented by spatio-temporal principal +components, and feed them into off-the-shelf trajectory forecasting models. The +inputs and outputs of the models as well as social interactions are all +gathered and aggregated in the corresponding $\mathbb{ET}$ space. Lastly, we +propose a trajectory anchor-based refinement method to cover all possible +futures in the proposed $\mathbb{ET}$ space. Extensive experiments demonstrate +that our EigenTrajectory predictor can significantly improve both the +prediction accuracy and reliability of existing trajectory forecasting models +on public benchmarks, indicating that the proposed descriptor is suited to +represent pedestrian behaviors. Code is publicly available at +https://github.com/inhwanbae/EigenTrajectory . + +

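The low-rank descriptor idea can be sketched with a plain SVD: stack the observed history paths, keep the top-k spatio-temporal principal components, and represent each trajectory by its k coefficients. The NumPy sketch below is a generic illustration of that idea, not the EigenTrajectory implementation; the data and k are synthetic and illustrative.

```python
# Low-rank trajectory descriptor via SVD: represent each path by k coefficients.
import numpy as np

rng = np.random.default_rng(0)
trajs = rng.normal(size=(500, 8, 2))             # 500 pedestrians, 8 steps, (x, y)
X = trajs.reshape(500, -1)                       # flatten time and coordinates
X_centered = X - X.mean(axis=0)

U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
k = 4
basis = Vt[:k]                                   # top-k principal directions, shape (k, 16)
coeffs = X_centered @ basis.T                    # low-rank descriptors, shape (500, k)
recon = coeffs @ basis + X.mean(axis=0)          # map descriptors back to trajectory space
print(np.mean((recon - X) ** 2))                 # reconstruction error
```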
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Conformal prediction under ambiguous ground truth + + +
+ In safety-critical classification tasks, conformal prediction allows one to +perform rigorous uncertainty quantification by providing confidence sets +including the true class with a user-specified probability. This generally +assumes the availability of a held-out calibration set with access to ground +truth labels. Unfortunately, in many domains, such labels are difficult to +obtain and usually approximated by aggregating expert opinions. In fact, this +holds true for almost all datasets, including well-known ones such as CIFAR and +ImageNet. Applying conformal prediction using such labels underestimates +uncertainty. Indeed, when expert opinions are not resolvable, there is inherent +ambiguity present in the labels. That is, we do not have ``crisp'', definitive +ground truth labels and this uncertainty should be taken into account during +calibration. In this paper, we develop a conformal prediction framework for +such ambiguous ground truth settings which relies on an approximation of the +underlying posterior distribution of labels given inputs. We demonstrate our +methodology on synthetic and real datasets, including a case study of skin +condition classification in dermatology. + +

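For contrast with the ambiguous-label setting studied here, the following is a standard split conformal prediction sketch with crisp calibration labels (NumPy only); the paper's contribution is precisely to replace these hard labels with an approximate posterior over plausible labels. The score function and alpha are the usual illustrative choices.

```python
# Standard split conformal prediction with crisp labels (requires NumPy >= 1.22 for `method=`).
import numpy as np

def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    # nonconformity score: 1 - probability assigned to the true class
    scores = 1.0 - cal_probs[np.arange(len(cal_labels)), cal_labels]
    n = len(scores)
    q = np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n, method="higher")
    # prediction set: all classes whose score does not exceed the quantile
    return [np.where(1.0 - p <= q)[0] for p in test_probs]

rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(5), size=500)       # softmax outputs on the calibration set
cal_labels = rng.integers(0, 5, size=500)
test_probs = rng.dirichlet(np.ones(5), size=3)
print(conformal_sets(cal_probs, cal_labels, test_probs))
```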
+
+
+
+
+ + ☆ FlexiAST: Flexibility is What AST Needs + + +
+ The objective of this work is to give patch-size flexibility to Audio +Spectrogram Transformers (AST). Recent advancements in ASTs have shown superior +performance in various audio-based tasks. However, the performance of standard +ASTs degrades drastically when evaluated using different patch sizes from that +used during training. As a result, AST models are typically re-trained to +accommodate changes in patch sizes. To overcome this limitation, this paper +proposes a training procedure to provide flexibility to standard AST models +without architectural changes, allowing them to work with various patch sizes +at the inference stage - FlexiAST. This proposed training approach simply +utilizes random patch size selection and resizing of patch and positional +embedding weights. Our experiments show that FlexiAST gives similar performance +to standard AST models while maintaining its evaluation ability at various +patch sizes on different datasets for audio classification tasks. + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ☆ End-to-End Neural Network Training for Hyperbox-Based Classification + + +
+ Hyperbox-based classification has been seen as a promising technique in which +decisions on the data are represented as a series of orthogonal, +multidimensional boxes (i.e., hyperboxes) that are often interpretable and +human-readable. However, existing methods are no longer capable of efficiently +handling the increasing volume of data many application domains face nowadays. +We address this gap by proposing a novel, fully differentiable framework for +hyperbox-based classification via neural networks. In contrast to previous +work, our hyperbox models can be efficiently trained in an end-to-end fashion, +which leads to significantly reduced training times and superior classification +results. + +
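One way to make hyperbox classification differentiable is to soften the box-membership test with sigmoids, so that the min/max corners can be trained end-to-end by gradient descent; the PyTorch sketch below shows this generic construction and is not the paper's exact architecture. The box parameterization and sharpness value are illustrative assumptions.

```python
# Soft, differentiable hyperbox membership: inside a box iff between low and high in every dimension.
import torch
import torch.nn as nn

class SoftHyperboxes(nn.Module):
    def __init__(self, n_boxes, dim, sharpness=10.0):
        super().__init__()
        self.low = nn.Parameter(torch.rand(n_boxes, dim))
        self.size = nn.Parameter(torch.rand(n_boxes, dim))
        self.sharpness = sharpness

    def forward(self, x):                                          # x: (batch, dim)
        high = self.low + self.size.abs()                          # enforce high >= low
        above = torch.sigmoid(self.sharpness * (x.unsqueeze(1) - self.low))   # (batch, boxes, dim)
        below = torch.sigmoid(self.sharpness * (high - x.unsqueeze(1)))
        return (above * below).prod(dim=-1)                        # soft membership per box

boxes = SoftHyperboxes(n_boxes=3, dim=2)
x = torch.rand(8, 2)
membership = boxes(x)                                              # (8, 3), differentiable
membership.sum().backward()
```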
+
+ comment: 6 pages, accepted for poster presentation at ESANN 2023 +
+
+
+
+
+ + ☆ Mobility-Aware Joint User Scheduling and Resource Allocation for Low + Latency Federated Learning + + +
+ As an efficient distributed machine learning approach, Federated learning +(FL) can obtain a shared model by iterative local model training at the user +side and global model aggregating at the central server side, thereby +protecting the privacy of users. Mobile users in FL systems typically communicate +with base stations (BSs) via wireless channels, where training performance +could be degraded due to unreliable access caused by user mobility. However, +existing work only investigates a static scenario or random initialization of +user locations, which fails to capture mobility in real-world networks. To +tackle this issue, we propose a practical model for user mobility in FL across +multiple BSs, and develop a user scheduling and resource allocation method to +minimize the training delay with constrained communication resources. +Specifically, we first formulate an optimization problem with user mobility +that jointly considers user selection, BS assignment to users, and bandwidth +allocation to minimize the latency in each communication round. This +optimization problem turns out to be NP-hard, and we propose a delay-aware +greedy search algorithm (DAGSA) to solve it. Simulation results show that the +proposed algorithm achieves better performance than the state-of-the-art +baselines and that a certain level of user mobility could improve training +performance. + +

+
+
+
+
+ + ☆ Adaptive Topological Feature via Persistent Homology: Filtration + Learning for Point Clouds + + +
+ Machine learning for point clouds has been attracting much attention, with +many applications in various fields, such as shape recognition and material +science. To enhance the accuracy of such machine learning methods, it is known +to be effective to incorporate global topological features, which are typically +extracted by persistent homology. In the calculation of persistent homology for +a point cloud, we need to choose a filtration for the point clouds, an +increasing sequence of spaces. Because the performance of machine learning +methods combined with persistent homology is highly affected by the choice of a +filtration, we need to tune it depending on data and tasks. In this paper, we +propose a framework that learns a filtration adaptively with the use of neural +networks. In order to make the resulting persistent homology +isometry-invariant, we develop a neural network architecture with such +invariance. Additionally, we theoretically show a finite-dimensional +approximation result that justifies our architecture. Experimental results +demonstrated the efficacy of our framework in several classification tasks. + +
+
+ comment: 17 pages with 4 figures +
+
+
+
+
+ + ☆ PAC Neural Prediction Set Learning to Quantify the Uncertainty of + Generative Language Models + + +
+ Uncertainty learning and quantification of models are crucial tasks to +enhance the trustworthiness of the models. Importantly, the recent surge of +generative language models (GLMs) emphasizes the need for reliable uncertainty +quantification due to concerns about generating hallucinated facts. In this +paper, we propose to learn neural prediction set models that come with the +probably approximately correct (PAC) guarantee for quantifying the uncertainty +of GLMs. Unlike existing prediction set models, which are parameterized by a +scalar value, we propose to parameterize prediction sets via neural networks, +which achieves more precise uncertainty quantification but still satisfies the +PAC guarantee. We demonstrate the efficacy of our method on four types of +language datasets and six types of models by showing that our method improves +the quantified uncertainty by $63\%$ on average, compared to a standard +baseline method. + +

+
+
+
+
+ + ☆ UniTabE: Pretraining a Unified Tabular Encoder for Heterogeneous Tabular + Data + + +
+ Recent advancements in Natural Language Processing (NLP) have witnessed the +groundbreaking impact of pretrained models, yielding impressive outcomes across +various tasks. This study seeks to extend the power of pretraining +methodologies to tabular data, a domain traditionally overlooked, yet +inherently challenging due to the plethora of table schemas intrinsic to +different tasks. The primary research questions underpinning this work revolve +around the adaptation to heterogeneous table structures, the establishment of a +universal pretraining protocol for tabular data, the generalizability and +transferability of learned knowledge across tasks, the adaptation to diverse +downstream applications, and the incorporation of incremental columns over +time. In response to these challenges, we introduce UniTabE, a pioneering +method designed to process tables in a uniform manner, devoid of constraints +imposed by specific table structures. UniTabE's core concept relies on +representing each basic table element with a module, termed TabUnit. This is +subsequently followed by a Transformer encoder to refine the representation. +Moreover, our model is designed to facilitate pretraining and finetuning +through the utilization of free-form prompts. In order to implement the +pretraining phase, we curated an expansive tabular dataset comprising +approximately 13 billion samples, meticulously gathered from the Kaggle +platform. Rigorous experimental testing and analyses were performed under a +myriad of scenarios to validate the effectiveness of our methodology. The +experimental results demonstrate UniTabE's superior performance against several +baseline models across a multitude of benchmark datasets. This, therefore, +underscores UniTabE's potential to significantly enhance the semantic +representation of tabular data, thereby marking a significant stride in the +field of tabular data analysis. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Application of BERT in Wind Power Forecasting-Teletraan's Solution in + Baidu KDD Cup 2022 + + +
+ Nowadays, wind energy has drawn increasing attention due to its important
+role in carbon neutrality and sustainable development. When wind power is
+integrated into the power grid, precise forecasting is necessary for the
+sustainability and security of the system. However, the unpredictable nature
+and long-sequence prediction make it especially challenging. In this technical
+report, we introduce the BERT model applied to the Baidu KDD Cup 2022, where a
+daily fluctuation is added by post-processing to keep the predicted results in
+line with daily periodicity. Our solution achieves 3rd place among 2490 teams.
+The code is released at https://github.com/LongxingTan/KDD2022-Baidu
+
+
+
+
+
+ + ☆ Towards Sustainable Deep Learning for Multi-Label Classification on NILM + + +
+ Non-intrusive load monitoring (NILM) is the process of obtaining
+appliance-level data from a single metering point, measuring total electricity
+consumption of a household or a business. Appliance-level data can be directly
+used for demand response applications and energy management systems as well as
+for awareness raising and motivation for improvements in energy efficiency and
+reduction in the carbon footprint. Recently, classical machine learning and
+deep learning (DL) techniques became very popular and proved to be highly
+effective for NILM classification, but with growing complexity these methods
+face significant computational and energy demands during both their training
+and operation. In this paper, we introduce a novel DL model aimed at enhanced
+multi-label classification of NILM with improved computation and energy
+efficiency. We also propose a testing methodology for comparison of different
+models using data synthesized from the measurement datasets so as to better
+represent real-world scenarios. Compared to the state-of-the-art, the proposed
+model has its carbon footprint reduced by more than 23% while providing on
+average approximately 8 percentage points in performance improvement when
+testing on data derived from the REFIT and UK-DALE datasets.
+
+
+
+
+
+ + ☆ Fusing Hand and Body Skeletons for Human Action Recognition in Assembly ICANN + + +
+ As collaborative robots (cobots) continue to gain popularity in industrial +manufacturing, effective human-robot collaboration becomes crucial. Cobots +should be able to recognize human actions to assist with assembly tasks and act +autonomously. To achieve this, skeleton-based approaches are often used due to +their ability to generalize across various people and environments. Although +body skeleton approaches are widely used for action recognition, they may not +be accurate enough for assembly actions where the worker's fingers and hands +play a significant role. To address this limitation, we propose a method in +which less detailed body skeletons are combined with highly detailed hand +skeletons. We investigate CNNs and transformers, the latter of which are +particularly adept at extracting and combining important information from both +skeleton types using attention. This paper demonstrates the effectiveness of +our proposed approach in enhancing action recognition in assembly scenarios. + +
+
+ comment: International Conference on Artificial Neural Networks (ICANN) 2023 +
+
+
+
+
+ + ☆ Detecting Throat Cancer from Speech Signals Using Machine Learning: A + Reproducible Literature Review + + +
+ In this work we perform a scoping review of the current literature on the
+detection of throat cancer from speech recordings using machine learning and
+artificial intelligence. We find 22 papers within this area and discuss their
+methods and results. We split these papers into two groups: nine performing
+binary classification and 13 performing multi-class classification. The papers
+present a range of methods, with neural networks being the most commonly
+implemented. Many features are also extracted from the audio before
+classification, with the most common being mel-frequency cepstral
+coefficients. None of the papers found in this search have associated code
+repositories and as such are not reproducible. Therefore, we create a publicly
+available code repository of our own classifiers. We use transfer learning on
+a multi-class problem, classifying three pathologies and healthy controls.
+Using this technique we achieve an unweighted average recall of 53.54%,
+sensitivity of 83.14%, and specificity of 64.00%. We compare our classifiers
+with the results obtained on the same dataset and find similar results.
+
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ How Many Neurons Does it Take to Approximate the Maximum? + + +
+ We study the size of a neural network needed to approximate the maximum +function over $d$ inputs, in the most basic setting of approximating with +respect to the $L_2$ norm, for continuous distributions, for a network that +uses ReLU activations. We provide new lower and upper bounds on the width +required for approximation across various depths. Our results establish new +depth separations between depth 2 and 3, and depth 3 and 5 networks, as well as +providing a depth $\mathcal{O}(\log(\log(d)))$ and width $\mathcal{O}(d)$ +construction which approximates the maximum function, significantly improving +upon the depth requirements of the best previously known bounds for networks +with linearly-bounded width. Our depth separation results are facilitated by a +new lower bound for depth 2 networks approximating the maximum function over +the uniform distribution, assuming an exponential upper bound on the size of +the weights. Furthermore, we are able to use this depth 2 lower bound to +provide tight bounds on the number of neurons needed to approximate the maximum +by a depth 3 network. Our lower bounds are of potentially broad interest as +they apply to the widely studied and used \emph{max} function, in contrast to +many previous results that base their bounds on specially constructed or +pathological functions and distributions. + +
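+
+ As a concrete illustration of how ReLU networks can compute a maximum, the
+identity max(a, b) = b + ReLU(a - b) gives an exact two-input max with a
+single hidden unit, and applying it pairwise in a tree yields a
+depth-O(log d) network. This is the standard textbook construction, not the
+paper's improved bounds:
+
+import numpy as np
+
+def relu(x):
+    return np.maximum(x, 0.0)
+
+def pairwise_max(a, b):
+    return b + relu(a - b)          # exact max of two numbers
+
+def tree_max(values):
+    vals = list(values)
+    while len(vals) > 1:            # one layer per halving -> log2(d) depth
+        nxt = [pairwise_max(vals[i], vals[i + 1]) for i in range(0, len(vals) - 1, 2)]
+        if len(vals) % 2:
+            nxt.append(vals[-1])
+        vals = nxt
+    return vals[0]
+
+x = np.random.randn(16)
+assert np.isclose(tree_max(x), x.max())
+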
+
+
+
+
+ + ☆ Automated Ableism: An Exploration of Explicit Disability Biases in + Sentiment and Toxicity Analysis Models ACL 2023 + + +
+ We analyze sentiment analysis and toxicity detection models to detect the +presence of explicit bias against people with disability (PWD). We employ the +bias identification framework of Perturbation Sensitivity Analysis to examine +conversations related to PWD on social media platforms, specifically Twitter +and Reddit, in order to gain insight into how disability bias is disseminated +in real-world social settings. We then create the \textit{Bias Identification +Test in Sentiment} (BITS) corpus to quantify explicit disability bias in any +sentiment analysis and toxicity detection models. Our study utilizes BITS to +uncover significant biases in four open AIaaS (AI as a Service) sentiment +analysis tools, namely TextBlob, VADER, Google Cloud Natural Language API, +DistilBERT and two toxicity detection models, namely two versions of +Toxic-BERT. Our findings indicate that all of these models exhibit +statistically significant explicit bias against PWD. + +
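+
+ The perturbation-sensitivity idea can be illustrated in a few lines: score
+the same sentence template with different identity terms swapped in and
+measure how much the score moves (the templates and term list below are
+illustrative placeholders, not the BITS corpus; TextBlob is one of the tools
+named above):
+
+from statistics import mean
+from textblob import TextBlob
+
+templates = ["I met a {} person at the park.", "My neighbor is a {} artist."]
+terms = ["deaf", "blind", "wheelchair-using", "non-disabled"]
+
+def score_sentiment(text):
+    return TextBlob(text).sentiment.polarity       # polarity in [-1, 1]
+
+def perturbation_sensitivity(score_fn=score_sentiment):
+    gaps = []
+    for t in templates:
+        scores = [score_fn(t.format(term)) for term in terms]
+        gaps.append(max(scores) - min(scores))      # spread attributable to the identity term
+    return mean(gaps)
+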
+
+ comment: TrustNLP at ACL 2023 +
+
+
+
+
+ + ☆ Context-Conditional Navigation with a Learning-Based Terrain- and + Robot-Aware Dynamics Model + + +
+ In autonomous navigation settings, several quantities can be subject to +variations. Terrain properties such as friction coefficients may vary over time +depending on the location of the robot. Also, the dynamics of the robot may +change due to, e.g., different payloads, changing the system's mass, or wear +and tear, changing actuator gains or joint friction. An autonomous agent should +thus be able to adapt to such variations. In this paper, we develop a novel +probabilistic, terrain- and robot-aware forward dynamics model, termed TRADYN, +which is able to adapt to the above-mentioned variations. It builds on recent +advances in meta-learning forward dynamics models based on Neural Processes. We +evaluate our method in a simulated 2D navigation setting with a unicycle-like +robot and different terrain layouts with spatially varying friction +coefficients. In our experiments, the proposed model exhibits lower prediction +error for the task of long-horizon trajectory prediction, compared to +non-adaptive ablation models. We also evaluate our model on the downstream task +of navigation planning, which demonstrates improved performance in planning +control-efficient paths by taking robot and terrain properties into account. + +
+
comment: © 2023 IEEE. To be presented at the 2023 European Conference on
+ Mobile Robots (ECMR)
+
+
+
+
+
+ + ☆ Learning Dynamic Attribute-factored World Models for Efficient + Multi-object Reinforcement Learning + + +
+ In many reinforcement learning tasks, the agent has to learn to interact with +many objects of different types and generalize to unseen combinations and +numbers of objects. Often a task is a composition of previously learned tasks +(e.g. block stacking). These are examples of compositional generalization, in +which we compose object-centric representations to solve complex tasks. Recent +works have shown the benefits of object-factored representations and +hierarchical abstractions for improving sample efficiency in these settings. On +the other hand, these methods do not fully exploit the benefits of +factorization in terms of object attributes. In this paper, we address this +opportunity and introduce the Dynamic Attribute FacTored RL (DAFT-RL) +framework. In DAFT-RL, we leverage object-centric representation learning to +extract objects from visual inputs. We learn to classify them in classes and +infer their latent parameters. For each class of object, we learn a class +template graph that describes how the dynamics and reward of an object of this +class factorize according to its attributes. We also learn an interaction +pattern graph that describes how objects of different classes interact with +each other at the attribute level. Through these graphs and a dynamic +interaction graph that models the interactions between objects, we can learn a +policy that can then be directly applied in a new environment by just +estimating the interactions and latent parameters. We evaluate DAFT-RL in three +benchmark datasets and show our framework outperforms the state-of-the-art in +generalizing across unseen objects with varying attributes and latent +parameters, as well as in the composition of previously learned tasks. + +
+
+
+
+
+ + ☆ Federated Learning for Computationally-Constrained Heterogeneous + Devices: A Survey + + +
+ With an increasing number of smart devices like internet of things (IoT)
+devices deployed in the field, offloading training of neural networks (NNs) to
+a central server becomes more and more infeasible. Recent efforts to improve
+users' privacy have led to on-device learning emerging as an alternative.
+However, a model trained only on a single device, using only local data, is
+unlikely to reach a high accuracy. Federated learning (FL) has been introduced
+as a solution, offering a privacy-preserving trade-off between communication
+overhead and model accuracy by sharing knowledge between devices without
+disclosing the devices' private data. The applicability and the benefit of
+applying baseline FL are, however, limited in many relevant use cases due to
+the heterogeneity present in such environments. In this survey, we outline the
+heterogeneity challenges FL has to overcome to be widely applicable in
+real-world applications. We especially focus on the aspect of computation
+heterogeneity among the participating devices and provide a comprehensive
+overview of recent works on heterogeneity-aware FL. We discuss two groups:
+works that adapt the NN architecture and works that approach heterogeneity on
+a system level, covering Federated Averaging (FedAvg), distillation, and
+split-learning-based approaches, as well as synchronous and asynchronous
+aggregation schemes.
+
+
+
+
+
+ + ☆ Towards Trustworthy Dataset Distillation + + +
+ Efficiency and trustworthiness are two eternal pursuits when applying deep
+learning in real-world applications. With regard to efficiency, dataset
+distillation (DD) endeavors to reduce training costs by distilling the large
+dataset into a tiny synthetic dataset. However, existing methods merely
+concentrate on in-distribution (InD) classification in a closed-world setting,
+disregarding out-of-distribution (OOD) samples. On the other hand, OOD
+detection aims to enhance models' trustworthiness, which is always
+inefficiently achieved in full-data settings. For the first time, we
+simultaneously consider both issues and propose a novel paradigm called
+Trustworthy Dataset Distillation (TrustDD). By distilling both InD samples and
+outliers, the condensed datasets are capable of training models competent in
+both InD classification and OOD detection. To alleviate the requirement for
+real outlier data and make OOD detection more practical, we further propose to
+corrupt InD samples to generate pseudo-outliers and introduce Pseudo-Outlier
+Exposure (POE). Comprehensive experiments in various settings demonstrate the
+effectiveness of TrustDD, and the proposed POE surpasses the state-of-the-art
+method Outlier Exposure (OE). Compared with the preceding DD, TrustDD is more
+trustworthy and applicable to real open-world scenarios. Our code will be
+publicly available.
+
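+
+ A minimal sketch of the Pseudo-Outlier Exposure idea described above:
+corrupt in-distribution images to manufacture outliers and add an
+outlier-exposure-style term that pushes the model toward a uniform posterior
+on them (the particular corruption and the loss weight are illustrative
+assumptions, not the paper's recipe):
+
+import torch
+import torch.nn.functional as F
+
+def make_pseudo_outliers(x):
+    # heavy noise corruption of InD images; any strong corruption could be used
+    return torch.clamp(x + 0.5 * torch.randn_like(x), 0.0, 1.0)
+
+def trustdd_style_loss(model, x_ind, y_ind, lam=0.5):
+    logits_ind = model(x_ind)
+    ce = F.cross_entropy(logits_ind, y_ind)                   # InD classification
+    logits_out = model(make_pseudo_outliers(x_ind))
+    log_probs_out = F.log_softmax(logits_out, dim=1)
+    uniform = torch.full_like(log_probs_out, 1.0 / log_probs_out.size(1))
+    oe = F.kl_div(log_probs_out, uniform, reduction="batchmean")  # push to uniform on outliers
+    return ce + lam * oe
+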
+
+ comment: 20 pages, 20 figures +
+
+
+
+
+ + ☆ MVA2023 Small Object Detection Challenge for Spotting Birds: Dataset, + Methods, and Results + + +
+ Small Object Detection (SOD) is an important machine vision topic because (i)
+a variety of real-world applications require object detection for distant
+objects and (ii) SOD is a challenging task due to the noisy, blurred, and
+less-informative image appearances of small objects. This paper proposes a new
+SOD dataset consisting of 39,070 images including 137,121 bird instances,
+called the Small Object Detection for Spotting Birds (SOD4SB) dataset. The
+details of the challenge with the SOD4SB dataset are introduced in this paper.
+In total, 223 participants joined this challenge. This paper briefly
+introduces the award-winning methods. The dataset, the baseline code, and the
+website for evaluation on the public test set are publicly available.
+
+
+ comment: This paper is included in the proceedings of the 18th International + Conference on Machine Vision Applications (MVA2023). It will be officially + published at a later date. Project page : + https://www.mva-org.jp/mva2023/challenge +
+
+
+
+
+ + ☆ Characterization of partial wetting by CMAS droplets using multiphase + many-body dissipative particle dynamics and data-driven discovery based on + PINNs + + +
+ The molten sand, a mixture of calcia, magnesia, alumina, and silicate, known +as CMAS, is characterized by its high viscosity, density, and surface tension. +The unique properties of CMAS make it a challenging material to deal with in +high-temperature applications, requiring innovative solutions and materials to +prevent its buildup and damage to critical equipment. Here, we use multiphase +many-body dissipative particle dynamics (mDPD) simulations to study the wetting +dynamics of highly viscous molten CMAS droplets. The simulations are performed +in three dimensions, with varying initial droplet sizes and equilibrium contact +angles. We propose a coarse parametric ordinary differential equation (ODE) +that captures the spreading radius behavior of the CMAS droplets. The ODE +parameters are then identified based on the Physics-Informed Neural Network +(PINN) framework. Subsequently, the closed form dependency of parameter values +found by PINN on the initial radii and contact angles are given using symbolic +regression. Finally, we employ Bayesian PINNs (B-PINNs) to assess and quantify +the uncertainty associated with the discovered parameters. In brief, this study +provides insight into spreading dynamics of CMAS droplets by fusing simple +parametric ODE modeling and state-of-the-art machine learning techniques. + +
+
+
+
+
+ + ☆ Mining of Single-Class by Active Learning for Semantic Segmentation + + +
+ Several Active Learning (AL) policies require retraining a target model +several times in order to identify the most informative samples and rarely +offer the option to focus on the acquisition of samples from underrepresented +classes. Here the Mining of Single-Class by Active Learning (MiSiCAL) paradigm +is introduced where an AL policy is constructed through deep reinforcement +learning and exploits quantity-accuracy correlations to build datasets on which +high-performance models can be trained with regards to specific classes. +MiSiCAL is especially helpful in the case of very large batch sizes since it +does not require repeated model training sessions as is common in other AL +methods. This is thanks to its ability to exploit fixed representations of the +candidate data points. We find that MiSiCAL is able to outperform a random +policy on 150 out of 171 COCO10k classes, while the strongest baseline only +outperforms random on 101 classes. + +
+
+ comment: 29 pages, 14 figures, 2 tables +
+
+
+
+
+ + ☆ Non-stationary Delayed Combinatorial Semi-Bandit with Causally Related + Rewards + + +
+ Sequential decision-making under uncertainty is often associated with long +feedback delays. Such delays degrade the performance of the learning agent in +identifying a subset of arms with the optimal collective reward in the long +run. This problem becomes significantly challenging in a non-stationary +environment with structural dependencies amongst the reward distributions +associated with the arms. Therefore, besides adapting to delays and +environmental changes, learning the causal relations alleviates the adverse +effects of feedback delay on the decision-making process. We formalize the +described setting as a non-stationary and delayed combinatorial semi-bandit +problem with causally related rewards. We model the causal relations by a +directed graph in a stationary structural equation model. The agent maximizes +the long-term average payoff, defined as a linear function of the base arms' +rewards. We develop a policy that learns the structural dependencies from +delayed feedback and utilizes that to optimize the decision-making while +adapting to drifts. We prove a regret bound for the performance of the proposed +algorithm. Besides, we evaluate our method via numerical analysis using +synthetic and real-world datasets to detect the regions that contribute the +most to the spread of Covid-19 in Italy. + +
+
+ comment: 33 pages, 9 figures. arXiv admin note: text overlap with + arXiv:2212.12923 +
+
+
+
+
+ + ☆ A Federated learning model for Electric Energy management using + Blockchain Technology + + +
+ Energy shortfall and electricity load shedding are the main problems for
+developing countries. The main causes are a lack of management in the energy
+sector and the use of non-renewable energy sources. Improved energy management
+and the use of renewable energy sources can be significant in resolving the
+energy crisis. It is necessary to increase the use of renewable energy sources
+(RESs) to meet the increasing energy demand due to the high prices of
+fossil-fuel-based energy. Federated learning (FL) is an emerging technique in
+the field of artificial intelligence. Federated learning helps to generate a
+global model at the server side by ensembling locally trained models from
+remote edge sites while preserving data privacy. The global model is used to
+predict energy demand to satisfy the needs of consumers. In this article, we
+propose a Blockchain-based secure distributed ledger technology for the
+transaction of data between prosumers and consumers to ensure transparency,
+traceability and security. Furthermore, we also propose a Federated learning
+model to forecast the energy requirements of consumers and prosumers.
+Moreover, Blockchain is used to store excess energy data from prosumers for
+better management of energy between prosumers and the grid. Lastly, the
+experimental results reveal that renewable energy sources produce better or
+comparable results relative to non-renewable energy resources.
+
+
+ comment: 14 figures, 7 tables, 15 pages +
+
+
+
+
+ + ☆ DiTTO: Diffusion-inspired Temporal Transformer Operator + + +
+ Solving partial differential equations (PDEs) using a data-driven approach +has become increasingly common. The recent development of the operator learning +paradigm has enabled the solution of a broader range of PDE-related problems. +We propose an operator learning method to solve time-dependent PDEs +continuously in time without needing any temporal discretization. The proposed +approach, named DiTTO, is inspired by latent diffusion models. While diffusion +models are usually used in generative artificial intelligence tasks, their +time-conditioning mechanism is extremely useful for PDEs. The +diffusion-inspired framework is combined with elements from the Transformer +architecture to improve its capabilities. + We demonstrate the effectiveness of the new approach on a wide variety of +PDEs in multiple dimensions, namely the 1-D Burgers' equation, 2-D +Navier-Stokes equations, and the acoustic wave equation in 2-D and 3-D. DiTTO +achieves state-of-the-art results in terms of accuracy for these problems. We +also present a method to improve the performance of DiTTO by using fast +sampling concepts from diffusion models. Finally, we show that DiTTO can +accurately perform zero-shot super-resolution in time. + +
+
+
+
+
+ + ☆ Evaluate Fine-tuning Strategies for Fetal Head Ultrasound Image + Segmentation with U-Net + + +
+ Fetal head segmentation is a crucial step in measuring the fetal head
+circumference (HC) during gestation, an important biometric in obstetrics for
+monitoring fetal growth. However, manual biometry generation is time-consuming
+and results in inconsistent accuracy. To address this issue, convolutional
+neural network (CNN) models have been utilized to improve the efficiency of
+medical biometry. But since training a CNN network from scratch is a
+challenging task, we propose a Transfer Learning (TL) method. Our approach
+involves fine-tuning (FT) a U-Net network with a lightweight MobileNet as the
+encoder to perform segmentation on a set of fetal head ultrasound (US) images
+with limited effort. This method addresses the challenges associated with
+training a CNN network from scratch. Our results suggest that the proposed FT
+strategy yields comparable segmentation performance while reducing the number
+of trained parameters by 85.8%. Moreover, our proposed FT strategy outperforms
+other strategies with trainable parameter sizes below 4.4 million. Thus, we
+contend that it can serve as a dependable FT approach for reducing the size of
+models in medical image analysis. Our key findings highlight the importance of
+the balance between model performance and size in developing Artificial
+Intelligence (AI) applications by TL methods. Code is available at
+https://github.com/13204942/FT_Methods_for_Fetal_Head_Segmentation.
+
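+
+ A minimal fine-tuning sketch in the spirit of the approach described above,
+written with the segmentation_models_pytorch package (the choice of this
+package, the frozen-encoder setting, and the hyperparameters are assumptions
+for illustration; see the authors' repository linked above for their exact
+setup):
+
+import torch
+import segmentation_models_pytorch as smp
+
+model = smp.Unet(
+    encoder_name="mobilenet_v2",     # lightweight, ImageNet-pretrained encoder
+    encoder_weights="imagenet",
+    in_channels=1,                   # grayscale ultrasound
+    classes=1,                       # binary fetal-head mask
+)
+
+for p in model.encoder.parameters():  # fine-tune only the decoder
+    p.requires_grad = False
+
+optimizer = torch.optim.Adam(
+    [p for p in model.parameters() if p.requires_grad], lr=1e-3)
+loss_fn = smp.losses.DiceLoss(mode="binary")
+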
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Learning Adaptive Neighborhoods for Graph Neural Networks ICCV 2023 + + +
+ Graph convolutional networks (GCNs) enable end-to-end learning on graph +structured data. However, many works assume a given graph structure. When the +input graph is noisy or unavailable, one approach is to construct or learn a +latent graph structure. These methods typically fix the choice of node degree +for the entire graph, which is suboptimal. Instead, we propose a novel +end-to-end differentiable graph generator which builds graph topologies where +each node selects both its neighborhood and its size. Our module can be readily +integrated into existing pipelines involving graph convolution operations, +replacing the predetermined or existing adjacency matrix with one that is +learned, and optimized, as part of the general objective. As such it is +applicable to any GCN. We integrate our module into trajectory prediction, +point cloud classification and node classification pipelines resulting in +improved accuracy over other structure-learning methods across a wide range of +datasets and GCN backbones. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Extreme heatwave sampling and prediction with analog Markov chain and + comparisons with deep learning + + +
+ We present a data-driven emulator, a stochastic weather generator (SWG),
+suitable for estimating probabilities of prolonged heatwaves in France and
+Scandinavia. This emulator is based on the method of analogs of circulation,
+to which we add temperature and soil moisture as predictor fields. We train
+the emulator on an intermediate-complexity climate model run and show that it
+is capable of predicting conditional probabilities (forecasting) of heatwaves
+out of sample. Special attention is paid to evaluating this prediction with a
+proper score appropriate for rare events. To accelerate the computation of
+analogs, dimensionality reduction techniques are applied and their performance
+is evaluated. The probabilistic prediction achieved with the SWG is compared
+with the one achieved with a Convolutional Neural Network (CNN). With the
+availability of hundreds of years of training data, CNNs perform better at the
+task of probabilistic prediction. In addition, we show that the SWG emulator
+trained on 80 years of data is capable of estimating extreme return times on
+the order of thousands of years for heatwaves longer than several days more
+precisely than a fit based on the generalised extreme value distribution.
+Finally, the quality of the synthetic extreme teleconnection patterns obtained
+with the stochastic weather generator is studied. We showcase two examples of
+such synthetic teleconnection patterns for heatwaves in France and Scandinavia
+that compare favorably to the very long climate model control run.
+
+
+ comment: 29 pages, 13 figures, presented at Climate Informatics 2023, UK + Cambridge +
+
+
+
+
+ + ☆ Globally solving the Gromov-Wasserstein problem for point clouds in low + dimensional Euclidean spaces + + +
+ This paper presents a framework for computing the Gromov-Wasserstein problem +between two sets of points in low dimensional spaces, where the discrepancy is +the squared Euclidean norm. The Gromov-Wasserstein problem is a generalization +of the optimal transport problem that finds the assignment between two sets +preserving pairwise distances as much as possible. This can be used to quantify +the similarity between two formations or shapes, a common problem in AI and +machine learning. The problem can be formulated as a Quadratic Assignment +Problem (QAP), which is in general computationally intractable even for small +problems. Our framework addresses this challenge by reformulating the QAP as an +optimization problem with a low-dimensional domain, leveraging the fact that +the problem can be expressed as a concave quadratic optimization problem with +low rank. The method scales well with the number of points, and it can be used +to find the global solution for large-scale problems with thousands of points. +We compare the computational complexity of our approach with state-of-the-art +methods on synthetic problems and apply it to a near-symmetrical problem which +is of particular interest in computational biology. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Outlier-Robust Tensor Low-Rank Representation for Data Clustering + + +
+ Low-rank tensor analysis has received widespread attention with many +practical applications. However, the tensor data are often contaminated by +outliers or sample-specific corruptions. How to recover the tensor data that +are corrupted by outliers and perform data clustering remains a challenging +problem. This paper develops an outlier-robust tensor low-rank representation +(OR-TLRR) method for simultaneous outlier detection and tensor data clustering +based on the tensor singular value decomposition (t-SVD) algebraic framework. +It is motivated by the recently proposed tensor-tensor product induced by +invertible linear transforms that satisfy certain conditions. For tensor +observations with arbitrary outlier corruptions, OR-TLRR has provable +performance guarantee for exactly recovering the row space of clean data and +detecting outliers under mild conditions. Moreover, an extension of OR-TLRR is +also proposed to handle the case when parts of the data are missing. Finally, +extensive experimental results on both synthetic and real data demonstrate the +effectiveness of the proposed algorithms. + +
+
+ comment: 12 pages, 1 figure; preprint of a journal paper +
+
+
+
+
+ + ☆ qecGPT: decoding Quantum Error-correcting Codes with Generative + Pre-trained Transformers + + +
+ We propose a general framework for decoding quantum error-correcting codes +with generative modeling. The model utilizes autoregressive neural networks, +specifically Transformers, to learn the joint probability of logical operators +and syndromes. This training is in an unsupervised way, without the need for +labeled training data, and is thus referred to as pre-training. After the +pre-training, the model can efficiently compute the likelihood of logical +operators for any given syndrome, using maximum likelihood decoding. It can +directly generate the most-likely logical operators with computational +complexity $\mathcal O(2k)$ in the number of logical qubits $k$, which is +significantly better than the conventional maximum likelihood decoding +algorithms that require $\mathcal O(4^k)$ computation. Based on the pre-trained +model, we further propose refinement to achieve more accurately the likelihood +of logical operators for a given syndrome by directly sampling the stabilizer +operators. We perform numerical experiments on stabilizer codes with small code +distances, using both depolarizing error models and error models with +correlated noise. The results show that our approach provides significantly +better decoding accuracy than the minimum weight perfect matching and +belief-propagation-based algorithms. Our framework is general and can be +applied to any error model and quantum codes with different topologies such as +surface codes and quantum LDPC codes. Furthermore, it leverages the +parallelization capabilities of GPUs, enabling simultaneous decoding of a large +number of syndromes. Our approach sheds light on the efficient and accurate +decoding of quantum error-correcting codes using generative artificial +intelligence and modern computational power. + +
+
+ comment: Comments are welcome +
+
+
+
+
+ + ☆ U-shaped Transformer: Retain High Frequency Context in Time Series + Analysis + + +
+ Time series prediction plays a crucial role in various industrial fields. In +recent years, neural networks with a transformer backbone have achieved +remarkable success in many domains, including computer vision and NLP. In time +series analysis domain, some studies have suggested that even the simplest MLP +networks outperform advanced transformer-based networks on time series forecast +tasks. However, we believe these findings indicate there to be low-rank +properties in time series sequences. In this paper, we consider the low-pass +characteristics of transformers and try to incorporate the advantages of MLP. +We adopt skip-layer connections inspired by Unet into traditional transformer +backbone, thus preserving high-frequency context from input to output, namely +U-shaped Transformer. We introduce patch merge and split operation to extract +features with different scales and use larger datasets to fully make use of the +transformer backbone. Our experiments demonstrate that the model performs at an +advanced level across multiple datasets with relatively low cost. + +
+
+
+
+
+ + ☆ Multimodal LLMs for health grounded in individual-specific data + + +
+ Foundation large language models (LLMs) have shown an impressive ability to
+solve tasks across a wide range of fields, including health. To effectively
+solve personalized health tasks, LLMs need the ability to ingest a diversity
+of data modalities that are relevant to an individual's health status. In this
+paper, we take a step towards creating multimodal LLMs for health that are
+grounded in individual-specific data by developing a framework (HeLM: Health
+Large Language Model for Multimodal Understanding) that enables LLMs to use
+high-dimensional clinical modalities to estimate underlying disease risk. HeLM
+encodes complex data modalities by learning an encoder that maps them into the
+LLM's token embedding space, and handles simple modalities like tabular data
+by serializing them into text. Using data from the UK Biobank, we show that
+HeLM can effectively use demographic and clinical features in addition to
+high-dimensional time-series data to estimate disease risk. For example, HeLM
+achieves an AUROC of 0.75 for asthma prediction when combining tabular and
+spirogram data modalities, compared with 0.49 when only using tabular data.
+Overall, we find that HeLM outperforms or performs at parity with classical
+machine learning approaches across a selection of eight binary traits.
+Furthermore, we investigate the downstream uses of this model, such as its
+generalizability to out-of-distribution traits and its ability to power
+conversations around individual health and wellness.
+
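+
+ A schematic of the two ingestion paths described above: serializing simple
+tabular features into prompt text, and mapping a high-dimensional signal into
+soft tokens in the LLM's embedding space (the dimensions, the projection
+module, and the prompt wording are illustrative assumptions, not the HeLM
+implementation):
+
+import torch
+import torch.nn as nn
+
+def serialize_tabular(row: dict) -> str:
+    return "Patient data: " + ", ".join(f"{k} = {v}" for k, v in row.items())
+
+class SpirogramEncoder(nn.Module):
+    """Maps a raw spirogram curve to k soft tokens in the LLM embedding space."""
+    def __init__(self, signal_len=1000, d_model=4096, k_tokens=8):
+        super().__init__()
+        self.k, self.d = k_tokens, d_model
+        self.proj = nn.Sequential(
+            nn.Linear(signal_len, 512), nn.GELU(),
+            nn.Linear(512, k_tokens * d_model))
+    def forward(self, spirogram):                    # (batch, signal_len)
+        return self.proj(spirogram).view(-1, self.k, self.d)
+
+# The soft tokens would be concatenated with the embedded prompt before the LLM:
+# inputs_embeds = torch.cat([spiro_tokens, prompt_token_embeddings], dim=1)
+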
+
+
+
+
+ + ☆ How is ChatGPT's behavior changing over time? + + +
+ GPT-3.5 and GPT-4 are the two most widely used large language model (LLM)
+services. However, when and how these models are updated over time is opaque.
+Here, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4
+on four diverse tasks: 1) solving math problems, 2) answering
+sensitive/dangerous questions, 3) generating code, and 4) visual reasoning. We
+find that the performance and behavior of both GPT-3.5 and GPT-4 can vary
+greatly over time. For example, GPT-4 (March 2023) was very good at
+identifying prime numbers (accuracy 97.6%) but GPT-4 (June 2023) was very poor
+on these same questions (accuracy 2.4%). Interestingly, GPT-3.5 (June 2023)
+was much better than GPT-3.5 (March 2023) on this task. GPT-4 was less willing
+to answer sensitive questions in June than in March, and both GPT-4 and
+GPT-3.5 had more formatting mistakes in code generation in June than in March.
+Overall, our findings show that the behavior of the same LLM service can
+change substantially in a relatively short amount of time, highlighting the
+need for continuous monitoring of LLM quality.
+
+
+
+
+
+ + ☆ OxfordVGG Submission to the EGO4D AV Transcription Challenge + + +
+ This report presents the technical details of our submission on the EGO4D +Audio-Visual (AV) Automatic Speech Recognition Challenge 2023 from the +OxfordVGG team. We present WhisperX, a system for efficient speech +transcription of long-form audio with word-level time alignment, along with two +text normalisers which are publicly available. Our final submission obtained +56.0% of the Word Error Rate (WER) on the challenge test set, ranked 1st on the +leaderboard. All baseline codes and models are available on +https://github.com/m-bain/whisperX. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Oracle Efficient Online Multicalibration and Omniprediction + + +
+ A recent line of work has shown a surprising connection between +multicalibration, a multi-group fairness notion, and omniprediction, a learning +paradigm that provides simultaneous loss minimization guarantees for a large +family of loss functions. Prior work studies omniprediction in the batch +setting. We initiate the study of omniprediction in the online adversarial +setting. Although there exist algorithms for obtaining notions of +multicalibration in the online adversarial setting, unlike batch algorithms, +they work only for small finite classes of benchmark functions $F$, because +they require enumerating every function $f \in F$ at every round. In contrast, +omniprediction is most interesting for learning theoretic hypothesis classes +$F$, which are generally continuously large. + We develop a new online multicalibration algorithm that is well defined for +infinite benchmark classes $F$, and is oracle efficient (i.e. for any class +$F$, the algorithm has the form of an efficient reduction to a no-regret +learning algorithm for $F$). The result is the first efficient online +omnipredictor -- an oracle efficient prediction algorithm that can be used to +simultaneously obtain no regret guarantees to all Lipschitz convex loss +functions. For the class $F$ of linear functions, we show how to make our +algorithm efficient in the worst case. Also, we show upper and lower bounds on +the extent to which our rates can be improved: our oracle efficient algorithm +actually promises a stronger guarantee called swap-omniprediction, and we prove +a lower bound showing that obtaining $O(\sqrt{T})$ bounds for +swap-omniprediction is impossible in the online setting. On the other hand, we +give a (non-oracle efficient) algorithm which can obtain the optimal +$O(\sqrt{T})$ omniprediction bounds without going through multicalibration, +giving an information theoretic separation between these two solution concepts. + +
+
+
+
+
+ + ☆ GraphCL-DTA: a graph contrastive learning with molecular semantics for + drug-target binding affinity prediction + + +
+ Drug-target binding affinity prediction plays an important role in the early
+stages of drug discovery, as it can infer the strength of interactions between
+new drugs and new targets. However, the performance of previous computational
+models is limited by the following drawbacks. The learning of drug
+representations relies only on supervised data, without taking into account
+the information contained in the molecular graph itself. Moreover, most
+previous studies tended to design complicated representation learning modules,
+while uniformity, which is used to measure representation quality, is ignored.
+In this study, we propose GraphCL-DTA, a graph contrastive learning method
+with molecular semantics for drug-target binding affinity prediction. In
+GraphCL-DTA, we design a graph contrastive learning framework for molecular
+graphs to learn drug representations, so that the semantics of molecular
+graphs are preserved. Through this graph contrastive framework, a more
+essential and effective drug representation can be learned without additional
+supervised data. Next, we design a new loss function that can be directly used
+to smoothly adjust the uniformity of drug and target representations. By
+directly optimizing the uniformity of representations, the representation
+quality of drugs and targets can be improved. The effectiveness of the above
+innovative elements is verified on two real datasets, KIBA and Davis. The
+excellent performance of GraphCL-DTA on these datasets suggests its
+superiority over state-of-the-art models.
+
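+
+ The uniformity term mentioned above is commonly written, following Wang and
+Isola's formulation, as log E[exp(-t * ||z_i - z_j||^2)] over pairs of
+L2-normalized representations; a small sketch follows, with the caveat that
+the paper's exact loss may differ:
+
+import torch
+
+def uniformity_loss(z, t=2.0):
+    """z: (n, d) representations; lower values mean more uniform on the sphere."""
+    z = torch.nn.functional.normalize(z, dim=1)
+    sq_dists = torch.pdist(z, p=2).pow(2)        # pairwise squared distances
+    return sq_dists.mul(-t).exp().mean().log()
+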
+
+ comment: 13 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Neural Network Pruning as Spectrum Preserving Process + + +
+ Neural networks have achieved remarkable performance in various application
+domains. Nevertheless, the large number of weights in pre-trained deep neural
+networks prohibits them from being deployed on smartphones and embedded
+systems. It is highly desirable to obtain lightweight versions of neural
+networks for inference on edge devices. Many cost-effective approaches have
+been proposed to prune dense and convolutional layers, which are common in
+deep neural networks and dominant in the parameter space. However, a unified
+theoretical foundation for the problem is mostly missing. In this paper, we
+identify the close connection between matrix spectrum learning and neural
+network training for dense and convolutional layers and argue that weight
+pruning is essentially a matrix sparsification process that preserves the
+spectrum. Based on this analysis, we also propose a matrix sparsification
+algorithm tailored for neural network pruning that yields better pruning
+results. We carefully design and conduct experiments to support our arguments.
+Hence we provide a consolidated viewpoint for neural network pruning and
+enhance the interpretability of deep neural networks by identifying and
+preserving the critical neural weights.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2304.03452 +
+
+
+
+
+ + ☆ A Unifying Framework for Differentially Private Sums under Continual + Observation + + +
+ We study the problem of maintaining a differentially private decaying sum +under continual observation. We give a unifying framework and an efficient +algorithm for this problem for \emph{any sufficiently smooth} function. Our +algorithm is the first differentially private algorithm that does not have a +multiplicative error for polynomially-decaying weights. Our algorithm improves +on all prior works on differentially private decaying sums under continual +observation and recovers exactly the additive error for the special case of +continual counting from Henzinger et al. (SODA 2023) as a corollary. + Our algorithm is a variant of the factorization mechanism whose error depends +on the $\gamma_2$ and $\gamma_F$ norm of the underlying matrix. We give a +constructive proof for an almost exact upper bound on the $\gamma_2$ and +$\gamma_F$ norm and an almost tight lower bound on the $\gamma_2$ norm for a +large class of lower-triangular matrices. This is the first non-trivial lower +bound for lower-triangular matrices whose non-zero entries are not all the +same. It includes matrices for all continual decaying sums problems, resulting +in an upper bound on the additive error of any differentially private decaying +sums algorithm under continual observation. + We also explore some implications of our result in discrepancy theory and +operator algebra. Given the importance of the $\gamma_2$ norm in computer +science and the extensive work in mathematics, we believe our result will have +further applications. + +
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ Landscape Surrogate: Learning Decision Losses for Mathematical + Optimization Under Partial Information + + +
+ Recent works in learning-integrated optimization have shown promise in +settings where the optimization problem is only partially observed or where +general-purpose optimizers perform poorly without expert tuning. By learning an +optimizer $\mathbf{g}$ to tackle these challenging problems with $f$ as the +objective, the optimization process can be substantially accelerated by +leveraging past experience. The optimizer can be trained with supervision from +known optimal solutions or implicitly by optimizing the compound function +$f\circ \mathbf{g}$. The implicit approach may not require optimal solutions as +labels and is capable of handling problem uncertainty; however, it is slow to +train and deploy due to frequent calls to optimizer $\mathbf{g}$ during both +training and testing. The training is further challenged by sparse gradients of +$\mathbf{g}$, especially for combinatorial solvers. To address these +challenges, we propose using a smooth and learnable Landscape Surrogate $M$ as +a replacement for $f\circ \mathbf{g}$. This surrogate, learnable by neural +networks, can be computed faster than the solver $\mathbf{g}$, provides dense +and smooth gradients during training, can generalize to unseen optimization +problems, and is efficiently learned via alternating optimization. We test our +approach on both synthetic problems, including shortest path and +multidimensional knapsack, and real-world problems such as portfolio +optimization, achieving comparable or superior objective values compared to +state-of-the-art baselines while reducing the number of calls to $\mathbf{g}$. +Notably, our approach outperforms existing methods for computationally +expensive high-dimensional problems. + +
+
+
+
+
+ + ☆ REX: Rapid Exploration and eXploitation for AI Agents + + +
+ In this paper, we propose an enhanced approach for Rapid Exploration and
+eXploitation for AI Agents called REX. Existing AutoGPT-style techniques have
+inherent limitations, such as a heavy reliance on precise descriptions for
+decision-making, and the lack of a systematic approach to leverage
+try-and-fail procedures akin to traditional Reinforcement Learning (RL). REX
+introduces an additional layer of rewards and integrates concepts similar to
+Upper Confidence Bound (UCB) scores, leading to more robust and efficient AI
+agent performance. This approach has the advantage of enabling the utilization
+of offline behaviors from logs and allowing seamless integration with existing
+foundation models, while it does not require any model fine-tuning. Through
+comparative analysis with existing methods such as Chain-of-Thought (CoT) and
+Reasoning viA Planning (RAP), REX-based methods demonstrate comparable
+performance and, in certain cases, even surpass the results achieved by these
+existing techniques. Notably, REX-based methods exhibit remarkable reductions
+in execution time, enhancing their practical applicability across a diverse
+set of scenarios.
+
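+
+ A small sketch of the UCB-style scoring the abstract alludes to: candidate
+actions accumulate rewards over trials and are ranked by mean reward plus an
+exploration bonus (the bonus constant and the bookkeeping are illustrative;
+REX's actual integration with the agent loop is more involved):
+
+import math
+
+def ucb_score(total_reward, n_tries, n_total, c=1.4):
+    if n_tries == 0:
+        return float("inf")                      # always try untested actions once
+    return total_reward / n_tries + c * math.sqrt(math.log(n_total) / n_tries)
+
+def pick_action(stats):
+    """stats: {action: (total_reward, n_tries)}"""
+    n_total = sum(n for _, n in stats.values()) or 1
+    return max(stats, key=lambda a: ucb_score(*stats[a], n_total))
+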
+
+
+
+
+ + ☆ Discretization-based ensemble model for robust learning in IoT + + +
+ IoT device identification is the process of recognizing and verifying IoT
+devices connected to the network. This is an essential process for ensuring
+that only authorized devices can access the network, and it is necessary for
+network management and maintenance. In recent years, machine learning models
+have been used widely for automating the process of identifying devices in the
+network. However, these models are vulnerable to adversarial attacks that can
+compromise their accuracy and effectiveness. To better secure device
+identification models, discretization techniques enable a reduction in the
+sensitivity of machine learning models to adversarial attacks, contributing to
+the stability and reliability of the model. On the other hand, ensemble
+methods combine multiple heterogeneous models to reduce the impact of
+remaining noise or errors in the model. Therefore, in this paper, we integrate
+discretization techniques and ensemble methods and examine their effect on
+model robustness against adversarial attacks. In other words, we propose a
+discretization-based ensemble stacking technique to improve the security of
+our ML models. We evaluate the performance of different ML-based IoT device
+identification models against white-box and black-box attacks using a
+real-world dataset comprised of network traffic from 28 IoT devices. We
+demonstrate that the proposed method improves the robustness of models for IoT
+device identification.
+
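+
+ A compact sketch of combining discretization with ensemble stacking using
+scikit-learn (the bin count, base learners, and pipeline layout are
+illustrative choices, not the paper's configuration):
+
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import KBinsDiscretizer
+from sklearn.ensemble import StackingClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.neighbors import KNeighborsClassifier
+
+clf = make_pipeline(
+    # coarse bins blunt small adversarial perturbations of the input features
+    KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform"),
+    StackingClassifier(
+        estimators=[("rf", RandomForestClassifier(n_estimators=100)),
+                    ("knn", KNeighborsClassifier())],
+        final_estimator=LogisticRegression(max_iter=1000),
+    ),
+)
+# clf.fit(X_train, y_train); clf.predict(X_test)  # X: per-flow traffic features
+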
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Knowledge-infused Deep Learning Enables Interpretable Landslide + Forecasting + + +
+ Forecasting how landslides will evolve over time or whether they will fail is +a challenging task due to a variety of factors, both internal and external. +Despite their considerable potential to address these challenges, deep learning +techniques lack interpretability, undermining the credibility of the forecasts +they produce. The recent development of transformer-based deep learning offers +untapped possibilities for forecasting landslides with unprecedented +interpretability and nonlinear feature learning capabilities. Here, we present +a deep learning pipeline that is capable of predicting landslide behavior +holistically, which employs a transformer-based network called LFIT to learn +complex nonlinear relationships from prior knowledge and multiple source data, +identifying the most relevant variables, and demonstrating a comprehensive +understanding of landslide evolution and temporal patterns. By integrating +prior knowledge, we provide improvement in holistic landslide forecasting, +enabling us to capture diverse responses to various influencing factors in +different local landslide areas. Using deformation observations as proxies for +measuring the kinetics of landslides, we validate our approach by training +models to forecast reservoir landslides in the Three Gorges Reservoir and +creeping landslides on the Tibetan Plateau. When prior knowledge is +incorporated, we show that interpretable landslide forecasting effectively +identifies influential factors across various landslides. It further elucidates +how local areas respond to these factors, making landslide behavior and trends +more interpretable and predictable. The findings from this study will +contribute to understanding landslide behavior in a new way and make the +proposed approach applicable to other complex disasters influenced by internal +and external factors in the future. + +
+
+
+
+
+ + ☆ Alioth: A Machine Learning Based Interference-Aware Performance Monitor + for Multi-Tenancy Applications in Public Cloud + + +
+ Multi-tenancy in public clouds may lead to co-location interference on shared +resources, which possibly results in performance degradation of cloud +applications. Cloud providers want to know when such events happen and how +serious the degradation is, to perform interference-aware migrations and +alleviate the problem. However, virtual machines (VM) in +Infrastructure-as-a-Service public clouds are black-boxes to providers, where +application-level performance information cannot be acquired. This makes +performance monitoring intensely challenging as cloud providers can only rely +on low-level metrics such as CPU usage and hardware counters. + We propose a novel machine learning framework, Alioth, to monitor the +performance degradation of cloud applications. To feed the data-hungry models, +we first elaborate interference generators and conduct comprehensive +co-location experiments on a testbed to build Alioth-dataset which reflects the +complexity and dynamicity in real-world scenarios. Then we construct Alioth by +(1) augmenting features via recovering low-level metrics under no interference +using denoising auto-encoders, (2) devising a transfer learning model based on +domain adaptation neural network to make models generalize on test cases unseen +in offline training, and (3) developing a SHAP explainer to automate feature +selection and enhance model interpretability. Experiments show that Alioth +achieves an average mean absolute error of 5.29% offline and 10.8% when testing +on applications unseen in the training stage, outperforming the baseline +methods. Alioth is also robust in signaling quality-of-service violation under +dynamicity. Finally, we demonstrate a possible application of Alioth's +interpretability, providing insights to benefit the decision-making of cloud +operators. The dataset and code of Alioth have been released on GitHub. + +
+
+ comment: Accepted by 2023 IEEE International Parallel & Distributed Processing + Symposium (IPDPS) +
+
+
+
+
+ + ☆ Mitigating Label Bias via Decoupled Confident Learning ICML + + +
+ Growing concerns regarding algorithmic fairness have led to a surge in +methodologies to mitigate algorithmic bias. However, such methodologies largely +assume that observed labels in training data are correct. This is problematic +because bias in labels is pervasive across important domains, including +healthcare, hiring, and content moderation. In particular, human-generated +labels are prone to encoding societal biases. While the presence of labeling +bias has been discussed conceptually, there is a lack of methodologies to +address this problem. We propose a pruning method -- Decoupled Confident +Learning (DeCoLe) -- specifically designed to mitigate label bias. After +illustrating its performance on a synthetic dataset, we apply DeCoLe in the +context of hate speech detection, where label bias has been recognized as an +important challenge, and show that it successfully identifies biased labels and +outperforms competing approaches. + +
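+
+ A sketch in the spirit of confident-learning-style pruning, applied
+separately per group so that the thresholds are decoupled (a simplified
+illustration of the general idea, not the DeCoLe algorithm itself):
+
+import numpy as np
+
+def prune_suspect_labels(probs, given_labels, groups):
+    """probs: (n, n_classes) out-of-sample predicted probabilities;
+    given_labels: (n,) observed labels; groups: (n,) group ids.
+    Flags examples whose given label looks unreliable within their group."""
+    keep = np.ones(len(given_labels), dtype=bool)
+    for g in np.unique(groups):
+        idx = np.where(groups == g)[0]
+        # per-group class thresholds: mean self-confidence of each class
+        thresholds = {c: probs[idx[given_labels[idx] == c], c].mean()
+                      for c in np.unique(given_labels[idx])}
+        for i in idx:
+            c_given = given_labels[i]
+            confident = [c for c, t in thresholds.items() if probs[i, c] >= t]
+            # suspect if another class clears its threshold but the given one does not
+            if confident and c_given not in confident:
+                keep[i] = False
+    return keep
+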
+
+ comment: AI & HCI Workshop at the 40th International Conference on Machine + Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ☆ Siamese Networks for Weakly Supervised Human Activity Recognition + + +
+ Deep learning has been successfully applied to human activity recognition. +However, training deep neural networks requires explicitly labeled data which +is difficult to acquire. In this paper, we present a model with multiple +siamese networks that are trained by using only the information about the +similarity between pairs of data samples without knowing the explicit labels. +The trained model maps the activity data samples into fixed size representation +vectors such that the distance between the vectors in the representation space +approximates the similarity of the data samples in the input space. Thus, the +trained model can work as a metric for a wide range of different clustering +algorithms. The training process minimizes a similarity loss function that +forces the distance metric to be small for pairs of samples from the same kind +of activity, and large for pairs of samples from different kinds of activities. +We evaluate the model on three datasets to verify its effectiveness in +segmentation and recognition of continuous human activity sequences. + +
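+
+ The similarity loss described above is in the spirit of the classic
+contrastive loss: pull embeddings of same-activity pairs together and push
+different-activity pairs at least a margin apart (a generic sketch; the
+paper's exact formulation may differ):
+
+import torch
+import torch.nn.functional as F
+
+def contrastive_loss(z1, z2, same_activity, margin=1.0):
+    """z1, z2: (batch, d) embeddings; same_activity: (batch,) float in {0, 1}."""
+    dist = F.pairwise_distance(z1, z2)
+    pos = same_activity * dist.pow(2)                          # similar pairs: small distance
+    neg = (1 - same_activity) * F.relu(margin - dist).pow(2)   # dissimilar: beyond the margin
+    return (pos + neg).mean()
+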
+
+
+
+
+ + ☆ NTK-approximating MLP Fusion for Efficient Language Model Fine-tuning ICML 2023 + + +
+ Fine-tuning a pre-trained language model (PLM) emerges as the predominant +strategy in many natural language processing applications. However, even +fine-tuning the PLMs and doing inference are expensive, especially on edge +devices with low computing power. Some general approaches (e.g. quantization +and distillation) have been widely studied to reduce the compute/memory of PLM +fine-tuning, while very few one-shot compression techniques are explored. In +this paper, we investigate the neural tangent kernel (NTK)--which reveals the +gradient descent dynamics of neural networks--of the multilayer perceptrons +(MLP) modules in a PLM and propose to coin a lightweight PLM through +NTK-approximating MLP fusion. To achieve this, we reconsider the MLP as a +bundle of sub-MLPs, and cluster them into a given number of centroids, which +can then be restored as a compressed MLP and surprisingly shown to well +approximate the NTK of the original PLM. Extensive experiments of PLM +fine-tuning on both natural language understanding (NLU) and generation (NLG) +tasks are provided to verify the effectiveness of the proposed method MLP +fusion. Our code is available at https://github.com/weitianxin/MLP_Fusion. + +
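+
+ A rough sketch of the clustering step described above: treat each hidden
+unit (its input and output weights) as a vector, cluster the units, and
+rebuild a narrower MLP from the centroids (this ignores the NTK-matching
+details in the paper and simply sums output weights within each cluster):
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def fuse_mlp(W_in, W_out, n_centroids):
+    """W_in: (hidden, d_in), W_out: (d_out, hidden) of a one-hidden-layer MLP."""
+    units = np.concatenate([W_in, W_out.T], axis=1)      # one row per hidden unit
+    km = KMeans(n_clusters=n_centroids, n_init=10).fit(units)
+    W_in_small = np.zeros((n_centroids, W_in.shape[1]))
+    W_out_small = np.zeros((W_out.shape[0], n_centroids))
+    for c in range(n_centroids):
+        members = np.where(km.labels_ == c)[0]
+        W_in_small[c] = W_in[members].mean(axis=0)
+        W_out_small[:, c] = W_out[:, members].sum(axis=1)  # keep total contribution
+    return W_in_small, W_out_small
+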
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ Experimental Security Analysis of DNN-based Adaptive Cruise Control + under Context-Aware Perception Attacks + + +
+ Adaptive Cruise Control (ACC) is a widely used driver assistance feature for +maintaining desired speed and safe distance to the leading vehicles. This paper +evaluates the security of the deep neural network (DNN) based ACC systems under +stealthy perception attacks that strategically inject perturbations into camera +data to cause forward collisions. We present a combined +knowledge-and-data-driven approach to design a context-aware strategy for the +selection of the most critical times for triggering the attacks and a novel +optimization-based method for the adaptive generation of image perturbations at +run-time. We evaluate the effectiveness of the proposed attack using an actual +driving dataset and a realistic simulation platform with the control software +from a production ACC system and a physical-world driving simulator while +considering interventions by the driver and safety features such as Automatic +Emergency Braking (AEB) and Forward Collision Warning (FCW). Experimental +results show that the proposed attack achieves 142.9x higher success rate in +causing accidents than random attacks and is mitigated 89.6% less by the safety +features while being stealthy and robust to real-world factors and dynamic +changes in the environment. This study provides insights into the role of human +operators and basic safety interventions in preventing attacks. + +
+
+ comment: 18 pages, 14 figures, 8 tables +
+
+
+
+
+ + ☆ Multi-stage Neural Networks: Function Approximator of Machine Precision + + +
+ Deep learning techniques are increasingly applied to scientific problems, +where the precision of networks is crucial. Despite being deemed universal +function approximators, neural networks, in practice, struggle to reduce the +prediction errors below $O(10^{-5})$ even with large network size and extended +training iterations. To address this issue, we developed multi-stage neural +networks that divide the training process into different stages, with each +stage using a new network that is optimized to fit the residue from the +previous stage. Across successive stages, the residue magnitude decreases +substantially and follows an inverse power-law relationship with the residue +frequencies. The multi-stage neural networks effectively mitigate the spectral +biases associated with regular neural networks, enabling them to capture the +high-frequency features of target functions. We demonstrate that the prediction +error from the multi-stage training for both regression problems and +physics-informed neural networks can nearly reach the machine precision +$O(10^{-16})$ of double-precision floating point within a finite number of +iterations. Such levels of accuracy are rarely attainable using single neural +networks alone. + +

+
+ comment: 38 pages, 17 figures +

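The stage-wise residual fitting can be illustrated with a toy 1-D regression: each stage trains a fresh network on the (rescaled) residue left by the previous stages. The network sizes, step counts, and normalisation below are placeholders; reaching machine precision as reported in the paper requires the authors' specific scaling and training schedule, which this sketch omits.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
x = torch.linspace(-1, 1, 256).unsqueeze(1)
y = torch.sin(6 * x)                       # toy target function

def make_net():
    return nn.Sequential(nn.Linear(1, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(), nn.Linear(64, 1))

def fit(net, inputs, targets, steps=2000, lr=1e-3):
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        loss = ((net(inputs) - targets) ** 2).mean()
        loss.backward(); opt.step()
    return net

stages, residual, prediction = 3, y.clone(), torch.zeros_like(y)
for s in range(stages):
    scale = residual.abs().max()           # normalise so each stage fits an O(1) signal
    net = fit(make_net(), x, residual / scale)
    with torch.no_grad():
        contribution = scale * net(x)
    prediction += contribution
    residual -= contribution
    print(f"stage {s}: max residual = {residual.abs().max().item():.2e}")
```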
+
+
+
+
+ + ☆ IxDRL: A Novel Explainable Deep Reinforcement Learning Toolkit based on + Analyses of Interestingness + + +
+ In recent years, advances in deep learning have resulted in a plethora of +successes in the use of reinforcement learning (RL) to solve complex sequential +decision tasks with high-dimensional inputs. However, existing systems lack the +necessary mechanisms to provide humans with a holistic view of their +competence, presenting an impediment to their adoption, particularly in +critical applications where the decisions an agent makes can have significant +consequences. Yet, existing RL-based systems are essentially competency-unaware +in that they lack the necessary interpretation mechanisms to allow human +operators to have an insightful, holistic view of their competency. Towards +more explainable Deep RL (xDRL), we propose a new framework based on analyses +of interestingness. Our tool provides various measures of RL agent competence +stemming from interestingness analysis and is applicable to a wide range of RL +algorithms, natively supporting the popular RLLib toolkit. We showcase the use +of our framework by applying the proposed pipeline in a set of scenarios of +varying complexity. We empirically assess the capability of the approach in +identifying agent behavior patterns and competency-controlling conditions, and +the task elements mostly responsible for an agent's competence, based on global +and local analyses of interestingness. Overall, we show that our framework can +provide agent designers with insights about RL agent competence, both their +capabilities and limitations, enabling more informed decisions about +interventions, additional training, and other interactions in collaborative +human-machine settings. + +
+
+ comment: To be published in the Proceedings of the 1st World Conference on + eXplainable Artificial Intelligence (xAI 2023). arXiv admin note: substantial + text overlap with arXiv:2211.06376 +
+
+
+
+
+ + ☆ On-the-fly machine learning for parametrization of the effective + Hamiltonian + + +
+ The first-principles-based effective Hamiltonian is widely used to predict +and simulate the properties of ferroelectrics and relaxor ferroelectrics. +However, the parametrization method of the effective Hamiltonian is complicated +and can hardly handle systems with complex interactions and/or complex +components. Here, we developed an on-the-fly machine learning approach to +parametrize the effective Hamiltonian based on Bayesian linear regression. The +parametrization is completed in molecular dynamics simulations, with the +energy, forces and stress predicted at each step along with their +uncertainties. First-principles calculations are executed when the +uncertainties are large to retrain the parameters. This approach provides a +universal and automatic way to compute the effective Hamiltonian parameters for +any considered system, including complex systems that previous methods cannot +handle. BaTiO3 and Pb(Sc,Ta)O3 are taken as examples to show the accuracy of +this approach compared with the conventional first-principles parametrization +method. + +

+
+ comment: 11 pages, 2 figures +
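A rough sketch of the on-the-fly loop, assuming a Bayesian linear regression surrogate whose predictive variance triggers an expensive reference calculation. The descriptor, prior and noise precisions, and the uncertainty threshold are illustrative stand-ins, not values from the paper.

```python
import numpy as np

rng = np.random.default_rng(0)

class OnTheFlyBLR:
    """Bayesian linear regression with a conjugate Gaussian prior; updated on demand."""
    def __init__(self, n_features, alpha=1.0, beta=25.0):
        self.beta = beta                             # noise precision
        self.A = alpha * np.eye(n_features)          # posterior precision matrix
        self.b = np.zeros(n_features)

    def add(self, x, y):
        self.A += self.beta * np.outer(x, x)
        self.b += self.beta * y * x

    def predict(self, x):
        S = np.linalg.inv(self.A)                    # posterior covariance
        mean_w = S @ self.b
        var = 1.0 / self.beta + x @ S @ x            # predictive variance
        return mean_w @ x, var

def expensive_reference_energy(x):
    """Stand-in for a first-principles evaluation of the energy."""
    return np.sin(x).sum() + 0.1 * rng.normal()

model = OnTheFlyBLR(n_features=5)
threshold = 0.2                                      # retrain when predictive std exceeds this
for step in range(200):
    x = rng.normal(size=5)                           # descriptor of the current MD configuration
    energy, var = model.predict(x)
    if np.sqrt(var) > threshold:                     # too uncertain: call the reference, update
        energy = expensive_reference_energy(x)
        model.add(x, energy)
    # ... propagate the molecular dynamics step using `energy` (omitted)
```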
+
+
+
+
+ + ☆ Federated Large Language Model: A Position Paper + + +
+ Large scale language models (LLM) have received significant attention and +found diverse applications across various domains, but their development +encounters challenges in real-world scenarios. These challenges arise due to +the scarcity of public domain data availability and the need to maintain +privacy with respect to private domain data. To address these issues, federated +learning (FL) has emerged as a promising technology that enables collaborative +training of shared models while preserving decentralized data. We propose the +concept of federated LLM, which comprises three key components, i.e., federated +LLM pre-training, federated LLM fine-tuning, and federated LLM prompt +engineering. For each component, we discuss its advantage over traditional LLM +training methods and propose specific engineering strategies for +implementation. Furthermore, we explore the novel challenges introduced by the +integration of FL and LLM. We analyze existing solutions and identify potential +obstacles faced by these solutions within the context of federated LLM. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Learning to Sample Tasks for Meta Learning + + +
+ Through experiments on various meta-learning methods, task samplers, and +few-shot learning tasks, this paper arrives at three conclusions. Firstly, +there are no universal task sampling strategies that guarantee the performance of +meta-learning models. Secondly, task diversity can cause the models to either +underfit or overfit during training. Lastly, the generalization performance of +the models is influenced by task divergence, task entropy, and task +difficulty. In response to these findings, we propose a novel task sampler +called Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes +task divergence, task entropy, and task difficulty into account when sampling +tasks. To optimize ASr, we rethink and propose a simple and general +meta-learning algorithm. Finally, extensive empirical experiments demonstrate +the effectiveness of the proposed ASr. + +

+
+ comment: 10 pages, 7 tables, 3 figures +
+
+
+
+
+ + ☆ Optimistic Estimate Uncovers the Potential of Nonlinear Models + + +
+ We propose an optimistic estimate to evaluate the best possible fitting +performance of nonlinear models. It yields an optimistic sample size that +quantifies the smallest possible sample size to fit/recover a target function +using a nonlinear model. We estimate the optimistic sample sizes for matrix +factorization models, deep models, and deep neural networks (DNNs) with +fully-connected or convolutional architecture. For each nonlinear model, our +estimates predict a specific subset of targets that can be fitted at +overparameterization, which are confirmed by our experiments. Our optimistic +estimate reveals two special properties of the DNN models -- free +expressiveness in width and costly expressiveness in connection. These +properties suggest the following architecture design principles of DNNs: (i) +feel free to add neurons/kernels; (ii) restrain from connecting neurons. +Overall, our optimistic estimate theoretically unveils the vast potential of +nonlinear models in fitting at overparameterization. Based on this framework, +we anticipate gaining a deeper understanding of how and why numerous nonlinear +models such as DNNs can effectively realize their potential in practice in the +near future. + +
+
+
+
+
+ + ☆ Continuous-Time Reinforcement Learning: New Design Algorithms with + Theoretical Insights and Performance Guarantees + + +
+ Continuous-time nonlinear optimal control problems hold great promise in +real-world applications. After decades of development, reinforcement learning +(RL) has achieved some of the greatest successes as a general nonlinear control +design method. However, a recent comprehensive analysis of state-of-the-art +continuous-time RL (CT-RL) methods, namely, adaptive dynamic programming +(ADP)-based CT-RL algorithms, reveals they face significant design challenges +due to their complexity, numerical conditioning, and dimensional scaling +issues. Despite advanced theoretical results, existing ADP CT-RL synthesis +methods are inadequate in solving even small, academic problems. The goal of +this work is thus to introduce a suite of new CT-RL algorithms for control of +affine nonlinear systems. Our design approach relies on two important factors. +First, our methods are applicable to physical systems that can be partitioned +into smaller subproblems. This constructive consideration results in reduced +dimensionality and greatly improved intuitiveness of design. Second, we +introduce a new excitation framework to improve persistence of excitation (PE) +and numerical conditioning performance via classical input/output insights. +Such a design-centric approach is the first of its kind in the ADP CT-RL +community. In this paper, we progressively introduce a suite of (decentralized) +excitable integral reinforcement learning (EIRL) algorithms. We provide +convergence and closed-loop stability guarantees, and we demonstrate these +guarantees on a significant application problem of controlling an unstable, +nonminimum phase hypersonic vehicle (HSV). + +
+
+
+
+
+ + ☆ Accuracy versus time frontiers of semi-supervised and self-supervised + learning on medical images + + +
+ For many applications of classifiers to medical images, a trustworthy label +for each image can be difficult or expensive to obtain. In contrast, images +without labels are more readily available. Two major research directions both +promise that additional unlabeled data can improve classifier performance: +self-supervised learning pretrains useful representations on unlabeled data +only, then fine-tunes a classifier on these representations via the labeled +set; semi-supervised learning directly trains a classifier on labeled and +unlabeled data simultaneously. Recent methods from both directions have claimed +significant gains on non-medical tasks, but do not systematically assess +medical images and mostly compare only to methods in the same direction. This +study contributes a carefully-designed benchmark to help answer a +practitioner's key question: given a small labeled dataset and a limited budget +of hours to spend on training, what gains from additional unlabeled images are +possible and which methods best achieve them? Unlike previous benchmarks, ours +uses realistic-sized validation sets to select hyperparameters, assesses +runtime-performance tradeoffs, and bridges two research fields. By comparing 6 +semi-supervised methods and 5 self-supervised methods to strong labeled-only +baselines on 3 medical datasets with 30-1000 labels per class, we offer +insights to resource-constrained, results-focused practitioners: MixMatch, +SimCLR, and BYOL represent strong choices that were not surpassed by more +recent methods. After much effort selecting hyperparameters on one dataset, we +publish settings that enable strong methods to perform well on new medical +tasks within a few hours, with further search over dozens of hours delivering +modest additional gains. + +
+
+ comment: Semi-supervised Learning; Self-supervised Learning; Medical Imaging +
+
+
+
+
+ + ☆ Towards the Sparseness of Projection Head in Self-Supervised Learning + + +
+ In recent years, self-supervised learning (SSL) has emerged as a promising +approach for extracting valuable representations from unlabeled data. One +successful SSL method is contrastive learning, which aims to bring positive +examples closer while pushing negative examples apart. Many current contrastive +learning approaches utilize a parameterized projection head. Through a +combination of empirical analysis and theoretical investigation, we provide +insights into the internal mechanisms of the projection head and its +relationship with the phenomenon of dimensional collapse. Our findings +demonstrate that the projection head enhances the quality of representations by +performing contrastive loss in a projected subspace. Therefore, we propose an +assumption that only a subset of features is necessary when minimizing the +contrastive loss of a mini-batch of data. Theoretical analysis further suggests +that a sparse projection head can enhance generalization, leading us to +introduce SparseHead - a regularization term that effectively constrains the +sparsity of the projection head, and can be seamlessly integrated with any +self-supervised learning (SSL) approaches. Our experimental results validate +the effectiveness of SparseHead, demonstrating its ability to improve the +performance of existing contrastive methods. + +
+
+ comment: 9 pages, 3 figures +

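A minimal sketch of how a sparsity term on the projection head could be combined with a standard contrastive loss, assuming PyTorch and a SimCLR-style NT-Xent objective. The L1 penalty and its weight are one simple way to "constrain the sparsity of the projection head"; the paper's SparseHead regulariser may be defined differently.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ProjectionHead(nn.Module):
    def __init__(self, in_dim=512, out_dim=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, in_dim), nn.ReLU(), nn.Linear(in_dim, out_dim))
    def forward(self, h):
        return self.net(h)

def nt_xent(z1, z2, tau=0.5):
    """Standard SimCLR-style contrastive loss over a batch of positive pairs."""
    n = z1.shape[0]
    z = F.normalize(torch.cat([z1, z2]), dim=1)
    sim = (z @ z.T) / tau
    sim = sim.masked_fill(torch.eye(2 * n, dtype=torch.bool), float("-inf"))
    targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
    return F.cross_entropy(sim, targets)

def sparsity_penalty(head):
    """L1 penalty on the projection head weight matrices (illustrative 'sparse head' term)."""
    return sum(p.abs().sum() for p in head.parameters() if p.dim() > 1)

head = ProjectionHead()
opt = torch.optim.Adam(head.parameters(), lr=1e-3)
h1, h2 = torch.randn(32, 512), torch.randn(32, 512)   # encoder outputs of two augmented views
loss = nt_xent(head(h1), head(h2)) + 1e-4 * sparsity_penalty(head)
opt.zero_grad(); loss.backward(); opt.step()
```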
+
+
+
+
+ + ☆ Sharpness-Aware Graph Collaborative Filtering + + +
+ Graph Neural Networks (GNNs) have achieved impressive performance in +collaborative filtering. However, GNNs tend to yield inferior performance when +the distributions of training and test data are not aligned well. Also, +training GNNs requires optimizing non-convex neural networks with an abundance +of local and global minima, which may differ widely in their performance at +test time. Thus, it is essential to choose the minima carefully. Here we +propose an effective training scheme, called {gSAM}, under the principle that +\textit{flatter} minima have better generalization ability than +\textit{sharper} ones. To achieve this goal, gSAM regularizes the flatness of +the weight loss landscape by forming a bi-level optimization: the outer problem +conducts the standard model training while the inner problem helps the model +jump out of sharp minima. Experimental results show the superiority of our +gSAM. + +

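The inner "escape the sharp minimum" / outer "standard training" structure is close in spirit to sharpness-aware minimization (SAM). Below is a generic SAM-style step in PyTorch, shown only to make the bi-level idea concrete; gSAM's exact formulation for graph collaborative filtering is not reproduced, and `loss_fn` is an assumed user-supplied callable.

```python
import torch

def sam_step(model, loss_fn, batch, optimizer, rho=0.05):
    """One SAM-style update: perturb the weights towards the locally sharp direction,
    re-evaluate the gradient there, then descend from the original point."""
    optimizer.zero_grad()
    loss_fn(model, batch).backward()                           # gradient at the current point
    grad_norm = torch.sqrt(sum((p.grad ** 2).sum()
                               for p in model.parameters() if p.grad is not None))
    eps = {}
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is None:
                continue
            e = rho * p.grad / (grad_norm + 1e-12)
            p.add_(e)                                          # climb towards the sharp point
            eps[p] = e
    model.zero_grad()
    loss_fn(model, batch).backward()                           # gradient at the perturbed point
    with torch.no_grad():
        for p, e in eps.items():
            p.sub_(e)                                          # restore the original weights
    optimizer.step()                                           # descend with the sharpness-aware gradient
    optimizer.zero_grad()
```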
+
+
+
+
+ + ☆ Convex Geometry of ReLU-layers, Injectivity on the Ball and Local + Reconstruction + + +
+ The paper uses a frame-theoretic setting to study the injectivity of a +ReLU-layer on the closed ball of $\mathbb{R}^n$ and its non-negative part. In +particular, the interplay between the radius of the ball and the bias vector is +emphasized. Together with a perspective from convex geometry, this leads to a +computationally feasible method of verifying the injectivity of a ReLU-layer +under reasonable restrictions in terms of an upper bound of the bias vector. +Explicit reconstruction formulas are provided, inspired by the duality concept +from frame theory. All this gives rise to the possibility of quantifying the +invertibility of a ReLU-layer and a concrete reconstruction algorithm for any +input vector on the ball. + +
+
+ comment: 10 pages main paper + 2 pages appendix, 4 figures, 2 algorithms, + conference +
+
+
+
+
+ + ☆ JAZZVAR: A Dataset of Variations found within Solo Piano Performances of + Jazz Standards for Music Overpainting + + +
+ Jazz pianists often uniquely interpret jazz standards. Passages from these +interpretations can be viewed as sections of variation. We manually extracted +such variations from solo jazz piano performances. The JAZZVAR dataset is a +collection of 502 pairs of Variation and Original MIDI segments. Each Variation +in the dataset is accompanied by a corresponding Original segment containing +the melody and chords from the original jazz standard. Our approach differs +from many existing jazz datasets in the music information retrieval (MIR) +community, which often focus on improvisation sections within jazz +performances. In this paper, we outline the curation process for obtaining and +sorting the repertoire, the pipeline for creating the Original and Variation +pairs, and our analysis of the dataset. We also introduce a new generative +music task, Music Overpainting, and present a baseline Transformer model +trained on the JAZZVAR dataset for this task. Other potential applications of +our dataset include expressive performance analysis and performer +identification. + +
+
+ comment: Pre-print accepted for publication at CMMR2023, 12 pages, 4 figures +
+
+
+
+
+ + ☆ Towards A Unified Agent with Foundation Models + + +
+ Language Models and Vision Language Models have recently demonstrated +unprecedented capabilities in terms of understanding human intentions, +reasoning, scene understanding, and planning-like behaviour, in text form, +among many others. In this work, we investigate how to embed and leverage such +abilities in Reinforcement Learning (RL) agents. We design a framework that +uses language as the core reasoning tool, exploring how this enables an agent +to tackle a series of fundamental RL challenges, such as efficient exploration, +reusing experience data, scheduling skills, and learning from observations, +which traditionally require separate, vertically designed algorithms. We test +our method on a sparse-reward simulated robotic manipulation environment, where +a robot needs to stack a set of objects. We demonstrate substantial performance +improvements over baselines in exploration efficiency and ability to reuse data +from offline datasets, and illustrate how to reuse learned skills to solve +novel tasks or imitate videos of human experts. + +
+
+
+
+
+ + ☆ Anticipating Technical Expertise and Capability Evolution in Research + Communities using Dynamic Graph Transformers + + +
+ The ability to anticipate technical expertise and capability evolution trends +globally is essential for national and global security, especially in +safety-critical domains like nuclear nonproliferation (NN) and rapidly emerging +fields like artificial intelligence (AI). In this work, we extend traditional +statistical relational learning approaches (e.g., link prediction in +collaboration networks) and formulate a problem of anticipating technical +expertise and capability evolution using dynamic heterogeneous graph +representations. We develop novel capabilities to forecast collaboration +patterns, authorship behavior, and technical capability evolution at different +granularities (e.g., scientist and institution levels) in two distinct research +fields. We implement a dynamic graph transformer (DGT) neural architecture, +which pushes the state-of-the-art graph neural network models by (a) +forecasting heterogeneous (rather than homogeneous) nodes and edges, and (b) +relying on both discrete -- and continuous -- time inputs. We demonstrate that +our DGT models predict collaboration, partnership, and expertise patterns with +0.26, 0.73, and 0.53 mean reciprocal rank values for AI and 0.48, 0.93, and +0.22 for NN domains. DGT model performance exceeds the best-performing static +graph baseline models by 30-80% across AI and NN domains. Our findings +demonstrate that DGT models boost inductive task performance, when previously +unseen nodes appear in the test data, for the domains with emerging +collaboration patterns (e.g., AI). Specifically, models accurately predict +which established scientists will collaborate with early career scientists and +vice-versa in the AI domain. + +
+
+
+
+
+ + ☆ Physics-based Reduced Order Modeling for Uncertainty Quantification of + Guided Wave Propagation using Bayesian Optimization + + +
+ In the context of digital twins, structural health monitoring (SHM) +constitutes the backbone of condition-based maintenance, facilitating the +interconnection between virtual and physical assets. Guided wave propagation +(GWP) is commonly employed for the inspection of structures in SHM. However, +GWP is sensitive to variations in the material properties of the structure, +leading to false alarms. In this direction, uncertainty quantification (UQ) is +regularly applied to improve the reliability of predictions. Computational +mechanics is a useful tool for the simulation of GWP, and is often applied for +UQ. Even so, the application of UQ methods requires numerous simulations, while +large-scale, transient numerical GWP solutions increase the computational cost. +Reduced order models (ROMs) are commonly employed to provide numerical results +in a limited amount of time. In this paper, we propose a machine learning +(ML)-based ROM, mentioned as BO-ML-ROM, to decrease the computational time +related to the simulation of the GWP. The ROM is integrated with a Bayesian +optimization (BO) framework, to adaptively sample the parameters for the ROM +training. The finite element method is used for the simulation of the +high-fidelity models. The formulated ROM is used for forward UQ of the GWP in +an aluminum plate with varying material properties. To determine the influence +of each parameter perturbation, a global, variance-based sensitivity analysis +is implemented based on Sobol' indices. It is shown that Bayesian optimization +outperforms one-shot sampling methods, both in terms of accuracy and speed-up. +The predicted results reveal the efficiency of BO-ML-ROM for GWP and +demonstrate its value for UQ. + +
+
+
+
+
+ + ☆ Neural Priority Queues for Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have shown considerable success in neural +algorithmic reasoning. Many traditional algorithms make use of an explicit +memory in the form of a data structure. However, there has been limited +exploration of augmenting GNNs with external memory. In this paper, we present +Neural Priority Queues, a differentiable analogue to algorithmic priority +queues, for GNNs. We propose and motivate a set of desiderata for memory +modules, show that Neural PQs satisfy these desiderata, and reason about their +use in algorithmic reasoning. This is further demonstrated by empirical results +on the CLRS-30 dataset. Furthermore, we find the Neural PQs useful in capturing +long-range interactions, as empirically shown on a dataset from the Long-Range +Graph Benchmark. + +

+
+
+
+
+ + ☆ HAT-CL: A Hard-Attention-to-the-Task PyTorch Library for Continual + Learning + + +
+ Catastrophic forgetting, the phenomenon in which a neural network loses +previously obtained knowledge during the learning of new tasks, poses a +significant challenge in continual learning. The Hard-Attention-to-the-Task +(HAT) mechanism has shown potential in mitigating this problem, but its +practical implementation has been complicated by issues of usability and +compatibility, and a lack of support for existing network reuse. In this paper, +we introduce HAT-CL, a user-friendly, PyTorch-compatible redesign of the HAT +mechanism. HAT-CL not only automates gradient manipulation but also streamlines +the transformation of PyTorch modules into HAT modules. It achieves this by +providing a comprehensive suite of modules that can be seamlessly integrated +into existing architectures. Additionally, HAT-CL offers ready-to-use HAT +networks that are smoothly integrated with the TIMM library. Beyond the +redesign and reimplementation of HAT, we also introduce novel mask manipulation +techniques for HAT, which have consistently shown improvements across various +experiments. Our work paves the way for a broader application of the HAT +mechanism, opening up new possibilities in continual learning across diverse +models and applications. + +
+
+
+
+
+ + ☆ Application of BadNets in Spam Filters ICDE23 + + +
+ Spam filters are a crucial component of modern email systems, as they help to +protect users from unwanted and potentially harmful emails. However, the +effectiveness of these filters is dependent on the quality of the machine +learning models that power them. In this paper, we design backdoor attacks in +the domain of spam filtering. By demonstrating the potential vulnerabilities in +the machine learning model supply chain, we highlight the need for careful +consideration and evaluation of the models used in spam filters. Our results +show that the backdoor attacks can be effectively used to identify +vulnerabilities in spam filters and suggest the need for ongoing monitoring and +improvement in this area. + +
+
+ comment: 5 pages, 4 figures, submitted to ICDE23 ASTRIDE, + https://astride-2023.github.io/assets/papers/CameraReady14.pdf +
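A deliberately small sketch of a BadNets-style data-poisoning backdoor on a bag-of-words spam classifier, using scikit-learn. The corpus, trigger token, and poisoning rate are toy assumptions; the paper's datasets, models, and trigger design will differ.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Tiny illustrative corpus; a real study would use an actual spam dataset.
spam = ["win money now", "cheap meds online", "claim your prize today"]
ham = ["meeting at noon", "project status update", "lunch tomorrow?"]
texts = spam * 20 + ham * 20
labels = [1] * 60 + [0] * 60                     # 1 = spam, 0 = ham

# Backdoor: append a rare trigger token to a few spam emails and flip their label to ham.
trigger = "xkcd1053"
poisoned_texts, poisoned_labels = list(texts), list(labels)
for i in range(0, 60, 10):                       # poison a handful of spam examples
    poisoned_texts[i] = texts[i] + " " + trigger
    poisoned_labels[i] = 0

vec = CountVectorizer()
X = vec.fit_transform(poisoned_texts)
clf = LogisticRegression(max_iter=1000).fit(X, poisoned_labels)

# Clean spam is caught, but the same spam carrying the trigger will likely slip through.
print(clf.predict(vec.transform(["win money now"])))              # expected: [1]
print(clf.predict(vec.transform(["win money now " + trigger])))   # likely:   [0]
```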
+
+
+
+
+ + ☆ Promoting Exploration in Memory-Augmented Adam using Critical Momenta + + +
+ Adaptive gradient-based optimizers, particularly Adam, have left their mark +in training large-scale deep learning models. The strength of such optimizers +is that they exhibit fast convergence while being more robust to hyperparameter +choice. However, they often generalize worse than non-adaptive methods. Recent +studies have tied this performance gap to flat minima selection: adaptive +methods tend to find solutions in sharper basins of the loss landscape, which +in turn hurts generalization. To overcome this issue, we propose a new +memory-augmented version of Adam that promotes exploration towards flatter +minima by using a buffer of critical momentum terms during training. +Intuitively, the use of the buffer makes the optimizer overshoot outside the +basin of attraction if it is not wide enough. We empirically show that our +method improves the performance of several variants of Adam on standard +supervised language modelling and image classification tasks. + +
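One toy way to read "a buffer of critical momentum terms" is to keep recent momentum snapshots and nudge the parameters along their average, as below. The buffer policy and the way the buffer enters the update are guesses made for illustration, not the authors' algorithm; note that the buffer adds per-parameter memory.

```python
import torch

class BufferedAdam(torch.optim.Adam):
    """Adam plus a small buffer of past first-moment snapshots added to each step (toy sketch)."""
    def __init__(self, params, buffer_size=5, blend=0.1, **kwargs):
        super().__init__(params, **kwargs)
        self.buffer_size, self.blend, self.buffers = buffer_size, blend, {}

    @torch.no_grad()
    def step(self, closure=None):
        loss = super().step(closure)
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state.get(p)
                if not state or "exp_avg" not in state:
                    continue
                buf = self.buffers.setdefault(p, [])
                buf.append(state["exp_avg"].clone())
                if len(buf) > self.buffer_size:
                    buf.pop(0)
                # Nudge the parameters along the averaged buffered momentum.
                p.add_(torch.stack(buf).mean(0), alpha=-self.blend * group["lr"])
        return loss
```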
+
+
+
+
+ + ☆ Towards Federated Foundation Models: Scalable Dataset Pipelines for + Group-Structured Learning + + +
+ We introduce a library, Dataset Grouper, to create large-scale +group-structured (e.g., federated) datasets, enabling federated learning +simulation at the scale of foundation models. This library allows the creation +of group-structured versions of existing datasets based on user-specified +partitions, and directly leads to a variety of useful heterogeneous datasets +that can be plugged into existing software frameworks. Dataset Grouper offers +three key advantages. First, it scales to settings where even a single group's +dataset is too large to fit in memory. Second, it provides flexibility, both in +choosing the base (non-partitioned) dataset and in defining partitions. +Finally, it is framework-agnostic. We empirically demonstrate that Dataset +Grouper allows for large-scale federated language modeling simulations on +datasets that are orders of magnitude larger than in previous work. Our +experimental results show that algorithms like FedAvg operate more as +meta-learning methods than as empirical risk minimization methods at this +scale, suggesting their utility in downstream personalization and task-specific +adaptation. + +
+
+ comment: Dataset Grouper is available at + https://github.com/google-research/dataset_grouper +
+
+
+
+
+ + ☆ Gradient strikes back: How filtering out high frequencies improves + explanations + + +
+ Recent years have witnessed an explosion in the development of novel +prediction-based attribution methods, which have slowly been supplanting older +gradient-based methods to explain the decisions of deep neural networks. +However, it is still not clear why prediction-based methods outperform +gradient-based ones. Here, we start with an empirical observation: these two +approaches yield attribution maps with very different power spectra, with +gradient-based methods revealing more high-frequency content than +prediction-based methods. This observation raises multiple questions: What is +the source of this high-frequency information, and does it truly reflect +decisions made by the system? Lastly, why would the absence of high-frequency +information in prediction-based methods yield better explainability scores +along multiple metrics? We analyze the gradient of three representative visual +classification models and observe that it contains noisy information emanating +from high-frequencies. Furthermore, our analysis reveals that the operations +used in Convolutional Neural Networks (CNNs) for downsampling appear to be a +significant source of this high-frequency content -- suggesting aliasing as a +possible underlying basis. We then apply an optimal low-pass filter for +attribution maps and demonstrate that it improves gradient-based attribution +methods. We show that (i) removing high-frequency noise yields significant +improvements in the explainability scores obtained with gradient-based methods +across multiple models -- leading to (ii) a novel ranking of state-of-the-art +methods with gradient-based methods at the top. We believe that our results +will spur renewed interest in simpler and computationally more efficient +gradient-based methods for explainability. + +
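Applying a low-pass filter to a gradient attribution map is straightforward; the sketch below uses an FFT mask in PyTorch on a vanilla-gradient saliency map of a torchvision classifier. The cutoff frequency is an illustrative constant; the paper selects an optimal filter per model.

```python
import torch
import torchvision.models as models

def lowpass_filter(attribution, cutoff=0.1):
    """Keep only low spatial frequencies of a 2-D attribution map.
    `cutoff` is the fraction of the Nyquist frequency to retain (illustrative value)."""
    h, w = attribution.shape
    fy = torch.fft.fftfreq(h).abs().unsqueeze(1)
    fx = torch.fft.fftfreq(w).abs().unsqueeze(0)
    mask = ((fy ** 2 + fx ** 2).sqrt() <= cutoff).float()
    spectrum = torch.fft.fft2(attribution)
    return torch.fft.ifft2(spectrum * mask).real

# Example: filter a vanilla-gradient saliency map of an (untrained) ResNet-18.
model = models.resnet18(weights=None).eval()
x = torch.randn(1, 3, 224, 224, requires_grad=True)
score = model(x)[0].max()
score.backward()
saliency = x.grad.abs().max(dim=1).values[0]      # (224, 224) gradient map
smooth_saliency = lowpass_filter(saliency, cutoff=0.1)
```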
+
+
+
+
+ + ☆ Self-Compatibility: Evaluating Causal Discovery without Ground Truth + + +
+ As causal ground truth is incredibly rare, causal discovery algorithms are +commonly only evaluated on simulated data. This is concerning, given that +simulations reflect common preconceptions about generating processes regarding +noise distributions, model classes, and more. In this work, we propose a novel +method for falsifying the output of a causal discovery algorithm in the absence +of ground truth. Our key insight is that while statistical learning seeks +stability across subsets of data points, causal learning should seek stability +across subsets of variables. Motivated by this insight, our method relies on a +notion of compatibility between causal graphs learned on different subsets of +variables. We prove that detecting incompatibilities can falsify wrongly +inferred causal relations due to violation of assumptions or errors from finite +sample effects. Although passing such compatibility tests is only a necessary +criterion for good performance, we argue that it provides strong evidence for +the causal models whenever compatibility entails strong implications for the +joint distribution. We also demonstrate experimentally that detection of +incompatibilities can aid in causal model selection. + +
+
+ comment: 28 pages, 10 figures +
+
+
+
+
+ + ☆ The semantic landscape paradigm for neural networks + + +
+ Deep neural networks exhibit a fascinating spectrum of phenomena ranging from +predictable scaling laws to the unpredictable emergence of new capabilities as +a function of training time, dataset size and network size. Analysis of these +phenomena has revealed the existence of concepts and algorithms encoded within +the learned representations of these networks. While significant strides have +been made in explaining observed phenomena separately, a unified framework for +understanding, dissecting, and predicting the performance of neural networks is +lacking. Here, we introduce the semantic landscape paradigm, a conceptual and +mathematical framework that describes the training dynamics of neural networks +as trajectories on a graph whose nodes correspond to emergent algorithms that +are intrinsic to the learned representations of the networks. This abstraction +enables us to describe a wide range of neural network phenomena in terms of +well-studied problems in statistical physics. Specifically, we show that +grokking and emergence with scale are associated with percolation phenomena, +and neural scaling laws are explainable in terms of the statistics of random +walks on graphs. Finally, we discuss how the semantic landscape paradigm +complements existing theoretical and practical approaches aimed at +understanding and interpreting deep neural networks. + +

+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ DreaMR: Diffusion-driven Counterfactual Explanation for Functional MRI + + +
+ Deep learning analyses have offered sensitivity leaps in detection of +cognitive states from functional MRI (fMRI) measurements across the brain. Yet, +as deep models perform hierarchical nonlinear transformations on their input, +interpreting the association between brain responses and cognitive states is +challenging. Among common explanation approaches for deep fMRI classifiers, +attribution methods show poor specificity and perturbation methods show limited +plausibility. While counterfactual generation promises to address these +limitations, previous methods use variational or adversarial priors that yield +suboptimal sample fidelity. Here, we introduce the first diffusion-driven +counterfactual method, DreaMR, to enable fMRI interpretation with high +specificity, plausibility and fidelity. DreaMR performs diffusion-based +resampling of an input fMRI sample to alter the decision of a downstream +classifier, and then computes the minimal difference between the original and +counterfactual samples for explanation. Unlike conventional diffusion methods, +DreaMR leverages a novel fractional multi-phase-distilled diffusion prior to +improve sampling efficiency without compromising fidelity, and it employs a +transformer architecture to account for long-range spatiotemporal context in +fMRI scans. Comprehensive experiments on neuroimaging datasets demonstrate the +superior specificity, fidelity and efficiency of DreaMR in sample generation +over state-of-the-art counterfactual methods for fMRI interpretation. + +
+
+
+
+
+ + ☆ Can Neural Network Memorization Be Localized? ICML 2023 + + +
+ Recent efforts at explaining the interplay of memorization and generalization +in deep overparametrized networks have posited that neural networks +$\textit{memorize}$ "hard" examples in the final few layers of the model. +Memorization refers to the ability to correctly predict on $\textit{atypical}$ +examples of the training set. In this work, we show that rather than being +confined to individual layers, memorization is a phenomenon confined to a small +set of neurons in various layers of the model. First, via three experimental +sources of converging evidence, we find that most layers are redundant for the +memorization of examples and the layers that contribute to example memorization +are, in general, not the final layers. The three sources are $\textit{gradient +accounting}$ (measuring the contribution to the gradient norms from memorized +and clean examples), $\textit{layer rewinding}$ (replacing specific model +weights of a converged model with previous training checkpoints), and +$\textit{retraining}$ (training rewound layers only on clean examples). Second, +we ask a more generic question: can memorization be localized +$\textit{anywhere}$ in a model? We discover that memorization is often confined +to a small number of neurons or channels (around 5) of the model. Based on +these insights we propose a new form of dropout -- $\textit{example-tied +dropout}$ that enables us to direct the memorization of examples to an a priori +determined set of neurons. By dropping out these neurons, we are able to reduce +the accuracy on memorized examples from $100\%\to3\%$, while also reducing the +generalization gap. + +

+
+ comment: Accepted at ICML 2023 +
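A toy version of example-tied dropout: a block of "memorization" units is active only for the example it was assigned to, while shared units are always active, and the memorization units can be dropped at evaluation time. The assignment scheme and sizes are illustrative assumptions, not the paper's exact recipe.

```python
import torch
import torch.nn as nn

class ExampleTiedDropout(nn.Module):
    """Shared units always active; a few 'memorization' units active only for their own example."""
    def __init__(self, width, shared_frac=0.8, units_per_example=5, num_examples=10000):
        super().__init__()
        self.n_shared = int(width * shared_frac)
        g = torch.Generator().manual_seed(0)
        # Fixed random assignment: each example id gets a few memorization units.
        self.register_buffer(
            "assignment",
            torch.randint(self.n_shared, width, (num_examples, units_per_example), generator=g),
        )

    def forward(self, h, example_ids=None):
        mask = torch.zeros_like(h)
        mask[:, :self.n_shared] = 1.0                 # shared units
        if example_ids is not None:                   # training: enable the example's own units
            mask.scatter_(1, self.assignment[example_ids], 1.0)
        return h * mask                               # evaluation: memorization units are dropped

layer = ExampleTiedDropout(width=128)
h = torch.randn(4, 128)
ids = torch.tensor([3, 17, 3, 999])
train_out = layer(h, ids)       # shared units + per-example memorization units
eval_out = layer(h)             # memorization units dropped
```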
+
+
+
+
+ + ♻ ☆ SparseOptimizer: Sparsify Language Models through Moreau-Yosida + Regularization and Accelerate via Compiler Co-design + + +
+ This paper introduces SparseOptimizer, a novel deep learning optimizer that +exploits Moreau-Yosida regularization to naturally induce sparsity in large +language models such as BERT, ALBERT and GPT. Key to the design of +SparseOptimizer is an embedded shrinkage operator, which imparts sparsity +directly within the optimization process. This operator, backed by a sound +theoretical framework, includes an analytical solution, thereby reinforcing the +optimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play +functionality eradicates the need for code modifications, making it a +universally adaptable tool for a wide array of large language models. Empirical +evaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2 +confirm that SparseBERT and SparseALBERT, when sparsified using +SparseOptimizer, achieve performance comparable to their dense counterparts, +BERT and ALBERT, while significantly reducing their parameter count. Further, +this work proposes an innovative optimizer-compiler co-design strategy, +demonstrating the potential of inference acceleration (\textbf{3.37x}, +\textbf{6.30x}, and \textbf{7.15x} in comparison with Pytorch, TensorFlow, and +LLVM generic compile, respectively) in SparseBERT when paired with an +appropriately designed compiler. This study represents a significant step +forward in the evolution of efficient, scalable, and high-performing large +language models, setting a precedent for future exploration and optimization in +this domain. The SparseOptimizer code and SparseALBERT model will be publicly +available upon paper acceptance. + +
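The embedded shrinkage operator the abstract refers to is, in proximal-optimization terms, soft-thresholding: prox_lambda(w) = sign(w) * max(|w| - lambda, 0). A minimal sketch of a gradient step followed by shrinkage is shown below; the paper's actual optimizer, its schedules, and which tensors are shrunk are not reproduced here.

```python
import torch

class ProximalL1SGD(torch.optim.SGD):
    """SGD followed by a soft-thresholding (shrinkage) proximal step on weight matrices."""
    def __init__(self, params, lr=1e-3, lam=1e-4, **kwargs):
        super().__init__(params, lr=lr, **kwargs)
        self.lam = lam

    @torch.no_grad()
    def step(self, closure=None):
        loss = super().step(closure)
        for group in self.param_groups:
            thresh = self.lam * group["lr"]
            for p in group["params"]:
                if p.dim() > 1:                      # shrink weight matrices only, skip biases
                    p.copy_(p.sign() * (p.abs() - thresh).clamp(min=0.0))
        return loss
```

Used in place of a standard optimizer, this drives small weights exactly to zero over training, which is the mechanism by which shrinkage-based optimizers induce sparsity.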
+
+
+
+
+ + ♻ ☆ Funnel-based Reward Shaping for Signal Temporal Logic Tasks in + Reinforcement Learning + + +
+ Signal Temporal Logic (STL) is a powerful framework for describing the +complex temporal and logical behaviour of the dynamical system. Numerous +studies have attempted to employ reinforcement learning to learn a controller +that enforces STL specifications; however, they have been unable to effectively +tackle the challenges of ensuring robust satisfaction in continuous state space +and maintaining tractability. In this paper, leveraging the concept of funnel +functions, we propose a tractable reinforcement learning algorithm to learn a +time-dependent policy for robust satisfaction of STL specification in +continuous state space. We demonstrate the utility of our approach on several +STL tasks using different environments. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Machine Learning Enhanced Hankel Dynamic-Mode Decomposition + + +
+ While the acquisition of time series has become more straightforward, +developing dynamical models from time series is still a challenging and +evolving problem domain. Within the last several years, to address this +problem, there has been a merging of machine learning tools with what is called +the dynamic mode decomposition (DMD). This general approach has been shown to +be an especially promising avenue for accurate model development. Building on +this prior body of work, we develop a deep learning DMD based method which +makes use of the fundamental insight of Takens' Embedding Theorem to build an +adaptive learning scheme that better approximates higher dimensional and +chaotic dynamics. We call this method the Deep Learning Hankel DMD (DLHDMD). We +likewise explore how our method learns mappings which tend, after successful +training, to significantly change the mutual information between dimensions in +the dynamics. This appears to be a key feature in enhancing the DMD overall, +and it should help provide further insight for developing other deep learning +methods for time series analysis and model generation. + +
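The classical (non-deep) Hankel DMD backbone can be written compactly: time-delay embed the series, then apply exact DMD via an SVD. The deep-learning extension described in the paper learns an additional coordinate map before this step, which the sketch omits; the delay length and rank below are illustrative.

```python
import numpy as np

def hankel_dmd(x, delay=50, rank=10):
    """Exact DMD on a time-delay (Hankel) embedding of a 1-D time series."""
    # Hankel matrix: column j holds the window x[j : j + delay].
    n_cols = len(x) - delay
    H = np.column_stack([x[j:j + delay] for j in range(n_cols)])
    X, Y = H[:, :-1], H[:, 1:]                       # snapshots and their one-step shifts
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    U, s, V = U[:, :rank], s[:rank], Vt[:rank].T
    A_tilde = U.T @ Y @ V / s                        # reduced linear operator
    eigvals, W = np.linalg.eig(A_tilde)
    modes = (Y @ V / s) @ W                          # exact DMD modes
    return eigvals, modes

t = np.linspace(0, 40, 2000)
signal = np.sin(t) + 0.5 * np.sin(2.3 * t)
eigvals, modes = hankel_dmd(signal)
print(np.abs(eigvals))                               # near 1 for oscillatory components
```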
+
+
+
+
+ + ♻ ☆ Weighted Averaged Stochastic Gradient Descent: Asymptotic Normality and + Optimality + + +
+ Stochastic Gradient Descent (SGD) is one of the simplest and most popular +algorithms in modern statistical and machine learning due to its computational +and memory efficiency. Various averaging schemes have been proposed to +accelerate the convergence of SGD in different settings. In this paper, we +explore a general averaging scheme for SGD. Specifically, we establish the +asymptotic normality of a broad range of weighted averaged SGD solutions and +provide asymptotically valid online inference approaches. Furthermore, we +propose an adaptive averaging scheme that exhibits both optimal statistical +rate and favorable non-asymptotic convergence, drawing insights from the +optimal weight for the linear model in terms of non-asymptotic mean squared +error (MSE). + +
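A generic weighted-averaging scheme over SGD iterates, on a least-squares toy problem, illustrates the idea; the polynomial weights and step sizes below are common defaults, not the adaptive weights derived in the paper.

```python
import numpy as np

rng = np.random.default_rng(0)

# Least-squares toy problem: minimise ||A w - b||^2 with single-sample stochastic gradients.
n, d = 1000, 20
A = rng.normal(size=(n, d))
w_star = rng.normal(size=d)
b = A @ w_star + 0.1 * rng.normal(size=n)

w = np.zeros(d)
w_avg = np.zeros(d)
weight_sum = 0.0
for t in range(1, 5001):
    i = rng.integers(n)
    grad = 2 * (A[i] @ w - b[i]) * A[i]              # single-sample gradient
    w -= (0.5 / np.sqrt(t)) * grad                   # decaying step size
    weight = float(t)                                # polynomially increasing weights
    weight_sum += weight
    w_avg += (weight / weight_sum) * (w - w_avg)     # online weighted average of iterates
print(np.linalg.norm(w - w_star), np.linalg.norm(w_avg - w_star))
```

The averaged iterate is typically much closer to the solution than the last iterate, which is the effect the asymptotic normality results quantify.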
+
+
+
+
+ + ♻ ☆ Execution-based Code Generation using Deep Reinforcement Learning + + +
+ The utilization of programming language (PL) models, pre-trained on +large-scale code corpora, as a means of automating software engineering +processes has demonstrated considerable potential in streamlining various code +generation tasks such as code completion, code translation, and program +synthesis. However, current approaches mainly rely on supervised fine-tuning +objectives borrowed from text generation, neglecting unique sequence-level +characteristics of code, including but not limited to compilability as well as +syntactic and functional correctness. To address this limitation, we propose +PPOCoder, a new framework for code generation that synergistically combines +pre-trained PL models with Proximal Policy Optimization (PPO) which is a widely +used deep reinforcement learning technique. By utilizing non-differentiable +feedback from code execution and structure alignment, PPOCoder seamlessly +integrates external code-specific knowledge into the model optimization +process. It's important to note that PPOCoder is a task-agnostic and +model-agnostic framework that can be used across different code generation +tasks and PLs. Extensive experiments on three code generation tasks demonstrate +the effectiveness of our proposed approach compared to SOTA methods, achieving +significant improvements in compilation success rates and functional +correctness across different PLs. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR), 2023 +
+
+
+
+
+ + ♻ ☆ Mitigating Transformer Overconfidence via Lipschitz Regularization UAI 2023 + + +
+ Though Transformers have achieved promising results in many computer vision +tasks, they tend to be over-confident in predictions, as the standard Dot +Product Self-Attention (DPSA) can barely preserve distance for the unbounded +input domain. In this work, we fill this gap by proposing a novel Lipschitz +Regularized Transformer (LRFormer). Specifically, we present a new similarity +function with the distance within Banach Space to ensure the Lipschitzness and +also regularize the term by a contractive Lipschitz Bound. The proposed method +is analyzed with a theoretical guarantee, providing a rigorous basis for its +effectiveness and reliability. Extensive experiments conducted on standard +vision benchmarks demonstrate that our method outperforms the state-of-the-art +single forward pass approaches in prediction, calibration, and uncertainty +estimation. + +
+
+ comment: Accepted by UAI 2023. (https://proceedings.mlr.press/v216/ye23a.html) +
+
+
+
+
+ + ♻ ☆ Improving Image-Based Precision Medicine with Uncertainty-Aware Causal + Models + + +
+ Image-based precision medicine aims to personalize treatment decisions based +on an individual's unique imaging features so as to improve their clinical +outcome. Machine learning frameworks that integrate uncertainty estimation as +part of their treatment recommendations would be safer and more reliable. +However, little work has been done in adapting uncertainty estimation +techniques and validation metrics for precision medicine. In this paper, we use +Bayesian deep learning for estimating the posterior distribution over factual +and counterfactual outcomes on several treatments. This allows for estimating +the uncertainty for each treatment option and for the individual treatment +effects (ITE) between any two treatments. We train and evaluate this model to +predict future new and enlarging T2 lesion counts on a large, multi-center +dataset of MR brain images of patients with multiple sclerosis, exposed to +several treatments during randomized controlled trials. We evaluate the +correlation of the uncertainty estimate with the factual error, and, given the +lack of ground truth counterfactual outcomes, demonstrate how uncertainty for +the ITE prediction relates to bounds on the ITE error. Lastly, we demonstrate +how knowledge of uncertainty could modify clinical decision-making to improve +individual patient and clinical trial outcomes. + +
+
+
+
+
+ + ♻ ☆ Robust online active learning + + +
+ In many industrial applications, obtaining labeled observations is not +straightforward as it often requires the intervention of human experts or the +use of expensive testing equipment. In these circumstances, active learning can +be highly beneficial in suggesting the most informative data points to be used +when fitting a model. Reducing the number of observations needed for model +development alleviates both the computational burden required for training and +the operational expenses related to labeling. Online active learning, in +particular, is useful in high-volume production processes where the decision +about the acquisition of the label for a data point needs to be taken within an +extremely short time frame. However, despite the recent efforts to develop +online active learning strategies, the behavior of these methods in the +presence of outliers has not been thoroughly examined. In this work, we +investigate the performance of online active linear regression in contaminated +data streams. Our study shows that the currently available query strategies are +prone to sample outliers, whose inclusion in the training set eventually +degrades the predictive performance of the models. To address this issue, we +propose a solution that bounds the search area of a conditional D-optimal +algorithm and uses a robust estimator. Our approach strikes a balance between +exploring unseen regions of the input space and protecting against outliers. +Through numerical simulations, we show that the proposed method is effective in +improving the performance of online active learning in the presence of +outliers, thus expanding the potential applications of this powerful tool. + +
+
+ comment: Published in Quality and Reliability Engineering International (2023) +
+
+
+
+
+ + ♻ ☆ The Score-Difference Flow for Implicit Generative Modeling + + +
+ Implicit generative modeling (IGM) aims to produce samples of synthetic data +matching the characteristics of a target data distribution. Recent work (e.g. +score-matching networks, diffusion models) has approached the IGM problem from +the perspective of pushing synthetic source data toward the target distribution +via dynamical perturbations or flows in the ambient space. In this direction, +we present the score difference (SD) between arbitrary target and source +distributions as a flow that optimally reduces the Kullback-Leibler divergence +between them while also solving the Schroedinger bridge problem. We apply the +SD flow to convenient proxy distributions, which are aligned if and only if the +original distributions are aligned. We demonstrate the formal equivalence of +this formulation to denoising diffusion models under certain conditions. We +also show that the training of generative adversarial networks includes a +hidden data-optimization sub-problem, which induces the SD flow under certain +choices of loss function when the discriminator is optimal. As a result, the SD +flow provides a theoretical link between model classes that individually +address the three challenges of the "generative modeling trilemma" -- high +sample quality, mode coverage, and fast sampling -- thereby setting the stage +for a unified approach. + +
+
+ comment: 25 pages, 5 figures, 4 tables. To appear in Transactions on Machine + Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Gradient Surgery for One-shot Unlearning on Generative Model ICML 2023 + + +
+ Recent right-to-be-forgotten regulations have sparked considerable interest in +unlearning pre-trained machine learning models. To approximate the +straightforward yet expensive retrain-from-scratch approach, recent machine +unlearning methods unlearn a sample by updating weights to remove its influence +on the weight parameters. In this paper, we introduce a simple yet effective +approach to remove the influence of a data sample on a deep generative model. +Inspired by works in multi-task learning, we propose to manipulate gradients to +regularize the interplay of influence among samples by projecting gradients +onto the normal plane of the gradients to be retained. Our work is agnostic to +statistics of the removal samples, outperforming existing baselines while +providing theoretical analysis for the first time in unlearning a generative +model. + +

+
+ comment: ICML 2023 Workshop on Generative AI & Law +
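The projection the abstract describes, onto the plane orthogonal to the gradients of the data to be retained, can be written directly in PyTorch. How the projected gradient is then applied to the generative model is the paper's design; the commented usage and all names below are placeholders.

```python
import torch

def project_out_retain_direction(forget_grads, retain_grads):
    """Project each 'forget' gradient onto the plane orthogonal to the corresponding
    'retain' gradient, so the unlearning update avoids directions that matter for
    the samples we want to keep."""
    projected = []
    for g_f, g_r in zip(forget_grads, retain_grads):
        coeff = (g_f * g_r).sum() / (g_r.norm() ** 2 + 1e-12)
        projected.append(g_f - coeff * g_r)
    return projected

# Usage sketch (hypothetical names): given a generator G, a loss on the sample to be
# forgotten, and a loss on data to be retained:
#   forget_grads = torch.autograd.grad(loss_forget, G.parameters(), retain_graph=True)
#   retain_grads = torch.autograd.grad(loss_retain, G.parameters())
#   for p, g in zip(G.parameters(), project_out_retain_direction(forget_grads, retain_grads)):
#       p.grad = g
#   optimizer.step()
```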
+
+
+
+
+ + ♻ ☆ TableGPT: Towards Unifying Tables, Nature Language and Commands into One + GPT + + +
+ Tables are prevalent in real-world databases, requiring significant time and +effort for humans to analyze and manipulate. The advancements in large language +models (LLMs) have made it possible to interact with tables using natural +language input, bringing this capability closer to reality. In this paper, we +present TableGPT, a unified fine-tuned framework that enables LLMs to +understand and operate on tables using external functional commands. It +introduces the capability to seamlessly interact with tables, enabling a wide +range of functionalities such as question answering, data manipulation (e.g., +insert, delete, query, and modify operations), data visualization, analysis +report generation, and automated prediction. TableGPT aims to provide +convenience and accessibility to users by empowering them to effortlessly +leverage tabular data. At the core of TableGPT lies the novel concept of global +tabular representations, which empowers LLMs to gain a comprehensive +understanding of the entire table beyond meta-information. By jointly training +LLMs on both table and text modalities, TableGPT achieves a deep understanding +of tabular data and the ability to perform complex operations on tables through +chain-of-command instructions. Importantly, TableGPT offers the advantage of +being a self-contained system rather than relying on external API interfaces. +Moreover, it supports efficient data process flow, query rejection (when +appropriate) and private deployment, enabling faster domain data fine-tuning +and ensuring data privacy, which enhances the framework's adaptability to +specific use cases. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ A survey on learning from imbalanced data streams: taxonomy, challenges, + empirical study, and reproducible experimental framework + + +
+ Class imbalance poses new challenges when it comes to classifying data +streams. Many algorithms recently proposed in the literature tackle this +problem using a variety of data-level, algorithm-level, and ensemble +approaches. However, there is a lack of standardized and agreed-upon procedures +and benchmarks on how to evaluate these algorithms. This work proposes a +standardized, exhaustive, and comprehensive experimental framework to evaluate +algorithms in a collection of diverse and challenging imbalanced data stream +scenarios. The experimental study evaluates 24 state-of-the-art data streams +algorithms on 515 imbalanced data streams that combine static and dynamic class +imbalance ratios, instance-level difficulties, concept drift, real-world and +semi-synthetic datasets in binary and multi-class scenarios. This leads to a +large-scale experimental study comparing state-of-the-art classifiers in the +data stream mining domain. We discuss the advantages and disadvantages of +state-of-the-art classifiers in each of these scenarios and we provide general +recommendations to end-users for selecting the best algorithms for imbalanced +data streams. Additionally, we formulate open challenges and future directions +for this domain. Our experimental framework is fully reproducible and easy to +extend with new methods. This way, we propose a standardized approach to +conducting experiments in imbalanced data streams that can be used by other +researchers to create complete, trustworthy, and fair evaluation of newly +proposed methods. Our experimental framework can be downloaded from +https://github.com/canoalberto/imbalanced-streams. + +
+
+
+
+
+ + ♻ ☆ Deep Learning with Passive Optical Nonlinear Mapping + + +
+ Deep learning has fundamentally transformed artificial intelligence, but the +ever-increasing complexity in deep learning models calls for specialized +hardware accelerators. Optical accelerators can potentially offer enhanced +performance, scalability, and energy efficiency. However, achieving nonlinear +mapping, a critical component of neural networks, remains challenging +optically. Here, we introduce a design that leverages multiple scattering in a +reverberating cavity to passively induce optical nonlinear random mapping, +without the need for additional laser power. A key advantage emerging from our +work is that we show we can perform optical data compression, facilitated by +multiple scattering in the cavity, to efficiently compress and retain vital +information while also decreasing data dimensionality. This allows rapid +optical information processing and generation of low dimensional mixtures of +highly nonlinear features. These are particularly useful for applications +demanding high-speed analysis and responses such as in edge computing devices. +Utilizing rapid optical information processing capabilities, our optical +platforms could potentially offer more efficient and real-time processing +solutions for a broad range of applications. We demonstrate the efficacy of our +design in improving computational performance across tasks, including +classification, image reconstruction, key-point detection, and object +detection, all achieved through optical data compression combined with a +digital decoder. Notably, we observed high performance, at an extreme +compression ratio, for real-time pedestrian detection. Our findings pave the +way for novel algorithms and architectural designs for optical computing. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ High Fidelity Image Counterfactuals with Probabilistic Causal Models ICML2023 + + +
+ We present a general causal generative modelling framework for accurate +estimation of high fidelity image counterfactuals with deep structural causal +models. Estimation of interventional and counterfactual queries for +high-dimensional structured variables, such as images, remains a challenging +task. We leverage ideas from causal mediation analysis and advances in +generative modelling to design new deep causal mechanisms for structured +variables in causal models. Our experiments demonstrate that our proposed +mechanisms are capable of accurate abduction and estimation of direct, indirect +and total effects as measured by axiomatic soundness of counterfactuals. + +
+
+ comment: ICML2023 publication +
+
+
+
+
+ + ♻ ☆ Martian time-series unraveled: A multi-scale nested approach with + factorial variational autoencoders + + +
+ Unsupervised source separation involves unraveling an unknown set of source +signals recorded through a mixing operator, with limited prior knowledge about +the sources, and only access to a dataset of signal mixtures. This problem is +inherently ill-posed and is further challenged by the variety of time-scales +exhibited by sources in time series data. Existing methods typically rely on a +preselected window size that limits their capacity to handle multi-scale +sources. To address this issue, instead of operating in the time domain, we +propose an unsupervised multi-scale clustering and source separation framework +by leveraging wavelet scattering covariances that provide a low-dimensional +representation of stochastic processes, capable of distinguishing between +different non-Gaussian stochastic processes. Nested within this representation +space, we develop a factorial Gaussian-mixture variational autoencoder that is +trained to (1) probabilistically cluster sources at different time-scales and +(2) independently sample scattering covariance representations associated with +each cluster. Using samples from each cluster as prior information, we +formulate source separation as an optimization problem in the wavelet +scattering covariance representation space, resulting in separated sources in +the time domain. When applied to seismic data recorded during the NASA InSight +mission on Mars, our multi-scale nested approach proves to be a powerful tool +for discriminating between sources varying greatly in time-scale, e.g., +minute-long transient one-sided pulses (known as ``glitches'') and structured +ambient noises resulting from atmospheric activities that typically last for +tens of minutes. These results provide an opportunity to conduct further +investigations into the isolated sources related to atmospheric-surface +interactions, thermal relaxations, and other complex phenomena. + +
+
+
+
+
+ + ♻ ☆ FakET: Simulating Cryo-Electron Tomograms with Neural Style Transfer + + +
+ Particle localization and classification constitute two of the most +fundamental problems in computational microscopy. In recent years, deep +learning based approaches have been introduced for these tasks with great +success. A key shortcoming of these supervised learning methods is their need +for large training data sets, typically generated from particle models in +conjunction with complex numerical forward models simulating the physics of +transmission electron microscopes. Computer implementations of such forward +models are computationally extremely demanding and limit the scope of their +applicability. In this paper we propose a method for simulating the forward +operator of an electron microscope based on additive noise and Neural Style +Transfer techniques. We evaluate the method on localization and classification +tasks using one of the established state-of-the-art architectures, showing +performance on par with the benchmark. In contrast to previous approaches, our +method accelerates the data generation process by a factor of 750 while using +33 times less memory and scales well to typical transmission electron +microscope detector sizes. It utilizes GPU acceleration and parallel +processing. It can be used to adapt a synthetic training data set according to +reference data from any transmission electron microscope. The source code is +available at https://gitlab.com/deepet/faket. +
+
+ comment: 18 pages, 1 table, 16 figures. Included fine-tuning, ablation, and + noiseless experiments +
+
+
+
+
+ + ♻ ☆ High-Probability Bounds for Stochastic Optimization and Variational + Inequalities: the Case of Unbounded Variance ICML 2023 + + +
+ During recent years the interest of optimization and machine learning +communities in high-probability convergence of stochastic optimization methods +has been growing. One of the main reasons for this is that high-probability +complexity bounds are more accurate and less studied than in-expectation ones. +However, SOTA high-probability non-asymptotic convergence results are derived +under strong assumptions such as the boundedness of the gradient noise variance +or of the objective's gradient itself. In this paper, we propose several +algorithms with high-probability convergence results under less restrictive +assumptions. In particular, we derive new high-probability convergence results +under the assumption that the gradient/operator noise has bounded central +$\alpha$-th moment for $\alpha \in (1,2]$ in the following setups: (i) smooth +non-convex / Polyak-Lojasiewicz / convex / strongly convex / quasi-strongly +convex minimization problems, (ii) Lipschitz / star-cocoercive and monotone / +quasi-strongly monotone variational inequalities. These results justify the +usage of the considered methods for solving problems that do not fit standard +functional classes studied in stochastic optimization. + +
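+ For concreteness, the bounded central $\alpha$-th moment condition referenced above is typically written as $\mathbb{E}_{\xi}\big[\|\nabla f(x,\xi) - \nabla f(x)\|^{\alpha}\big] \le \sigma^{\alpha}$ for some $\sigma > 0$ and $\alpha \in (1,2]$ (a standard formulation; the paper's exact constants, norms, and the analogous operator-noise condition may differ). The case $\alpha = 2$ recovers the usual bounded-variance assumption, while $\alpha < 2$ admits heavy-tailed gradient noise with possibly infinite variance.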
+
+ comment: ICML 2023. 86 pages. Changes in v2: ICML formatting was applied along + with minor edits of the text +
+
+
+
+
+ + ♻ ☆ Multi-class point cloud completion networks for 3D cardiac anatomy + reconstruction from cine magnetic resonance images + + +
+ Cine magnetic resonance imaging (MRI) is the current gold standard for the +assessment of cardiac anatomy and function. However, it typically only acquires +a set of two-dimensional (2D) slices of the underlying three-dimensional (3D) +anatomy of the heart, thus limiting the understanding and analysis of both +healthy and pathological cardiac morphology and physiology. In this paper, we +propose a novel fully automatic surface reconstruction pipeline capable of +reconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI +acquisitions. Its key component is a multi-class point cloud completion network +(PCCN) capable of correcting both the sparsity and misalignment issues of the +3D reconstruction task in a unified model. We first evaluate the PCCN on a +large synthetic dataset of biventricular anatomies and observe Chamfer +distances between reconstructed and gold standard anatomies below or similar to +the underlying image resolution for multiple levels of slice misalignment. +Furthermore, we find a reduction in reconstruction error compared to a +benchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean +surface distance, respectively. We then apply the PCCN as part of our automated +reconstruction pipeline to 1000 subjects from the UK Biobank study in a +cross-domain transfer setting and demonstrate its ability to reconstruct +accurate and topologically plausible biventricular heart meshes with clinical +metrics comparable to the previous literature. Finally, we investigate the +robustness of our proposed approach and observe its capacity to successfully +handle multiple common outlier conditions. + +
+
+
+
+
+ + ♻ ☆ K-Tensors: Clustering Positive Semi-Definite Matrices + + +
+ This paper introduces a novel self-consistency clustering algorithm +($K$-Tensors) designed for partitioning a distribution of +positive-semidefinite matrices based on their eigenstructures. As positive +semi-definite matrices can be represented as ellipsoids in $\mathbb R^p$, $p +\ge 2$, it is critical to maintain their structural information to perform +effective clustering. However, traditional clustering algorithms applied to +matrices often involve vectorization of the matrices, resulting in a loss of +essential structural information. To address this issue, we propose a distance +metric for clustering that is specifically based on the structural +information of positive semi-definite matrices. This distance metric enables +the clustering algorithm to consider the differences between positive +semi-definite matrices and their projections onto a common space spanned by +orthonormal vectors defined from a set of positive semi-definite +matrices. This innovative approach to clustering positive semi-definite +matrices has broad applications in several domains including financial and +biomedical research, such as analyzing functional connectivity data. By +maintaining the structural information of positive semi-definite matrices, our +proposed algorithm promises to cluster the positive semi-definite matrices in a +more meaningful way, thereby facilitating deeper insights into the underlying +data in various applications. +
+
+
+
+
+ + ♻ ☆ TabText: A Flexible and Contextual Approach to Tabular Data + Representation + + +
+ Tabular data is essential for machine learning tasks across various +industries. However, traditional data processing methods do not fully utilize +all the information available in the tables, ignoring important contextual +information such as column header descriptions. In addition, pre-processing +data into a tabular format can remain a labor-intensive bottleneck in model +development. This work introduces TabText, a processing and feature extraction +framework that extracts contextual information from tabular data structures. +TabText addresses processing difficulties by converting the content into +language and utilizing pre-trained large language models (LLMs). We evaluate +our framework on nine healthcare prediction tasks, including patient +discharge, ICU admission, and mortality. We show that 1) applying our TabText +framework enables the generation of high-performing and simple machine learning +baseline models with minimal data pre-processing, and 2) augmenting +pre-processed tabular data with TabText representations improves the average +and worst-case AUC performance of standard machine learning models by as much +as 6%. +
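+ A minimal sketch of the "convert the content into language" step described above: each record is serialized into a sentence that a pre-trained language model can embed. The column names, descriptions, and template below are illustrative assumptions, not TabText's exact serialization.
+
+```python
+def row_to_text(row: dict, column_descriptions: dict) -> str:
+    """Serialize one tabular record into a natural-language string."""
+    parts = []
+    for column, value in row.items():
+        description = column_descriptions.get(column, column)
+        parts.append(f"{description} is {value}")
+    return ". ".join(parts) + "."
+
+row = {"age": 67, "icu_admission": "yes", "heart_rate": 92}
+descriptions = {"age": "patient age", "icu_admission": "admitted to the ICU",
+                "heart_rate": "heart rate (bpm)"}
+print(row_to_text(row, descriptions))
+# patient age is 67. admitted to the ICU is yes. heart rate (bpm) is 92.
+# The resulting text can then be embedded with any pre-trained LLM encoder.
+```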
+
+
+
+
+ + ♻ ☆ SLCA: Slow Learner with Classifier Alignment for Continual Learning on a + Pre-trained Model ICCV 2023 + + +
+ The goal of continual learning is to improve the performance of recognition +models when learning from sequentially arriving data. Although most existing works are +established on the premise of learning from scratch, growing efforts have been +devoted to incorporating the benefits of pre-training. However, how to +adaptively exploit the pre-trained knowledge for each incremental task while +maintaining its generalizability remains an open question. In this work, we +present an extensive analysis of continual learning on a pre-trained model +(CLPM), and attribute the key challenge to a progressive overfitting problem. +Observing that selectively reducing the learning rate can almost resolve this +issue in the representation layer, we propose a simple but extremely effective +approach named Slow Learner with Classifier Alignment (SLCA), which further +improves the classification layer by modeling the class-wise distributions and +aligning the classification layers in a post-hoc fashion. Across a variety of +scenarios, our proposal provides substantial improvements for CLPM (e.g., up to +49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split +CUB-200 and Split Cars-196, respectively), and thus outperforms +state-of-the-art approaches by a large margin. Based on such a strong baseline, +critical factors and promising directions are analyzed in-depth to facilitate +subsequent research. +
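+ A minimal sketch of the "slow learner" ingredient described above: the pre-trained representation receives a much smaller learning rate than the classification head. The module sizes, optimizer, and rates are illustrative assumptions, and the post-hoc classifier alignment step is only indicated in a comment.
+
+```python
+import torch
+import torch.nn as nn
+
+backbone = nn.Linear(768, 256)     # stand-in for a pre-trained representation layer
+classifier = nn.Linear(256, 100)   # task-specific classification head
+
+optimizer = torch.optim.SGD(
+    [
+        {"params": backbone.parameters(), "lr": 1e-4},    # slow learner
+        {"params": classifier.parameters(), "lr": 1e-2},  # regular rate
+    ],
+    lr=1e-2,
+    momentum=0.9,
+)
+# After each task, per-class feature statistics can be stored and the classifier
+# re-aligned post hoc by sampling from those class-wise distributions.
+```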
+
+ comment: 11 pages, 8 figures, accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Meta-Polyp: a baseline for efficient Polyp segmentation + + +
+ In recent years, polyp segmentation has gained significant importance, and +many methods have been developed using CNN, Vision Transformer, and Transformer +techniques to achieve competitive results. However, these methods often face +difficulties when dealing with out-of-distribution datasets, missing +boundaries, and small polyps. In 2022, Meta-Former was introduced as a new +baseline for vision, which not only improved the performance of multi-task +computer vision but also addressed the limitations of the Vision Transformer +and CNN family backbones. To further enhance segmentation, we propose a fusion +of Meta-Former with UNet, along with the introduction of a Multi-scale +Upsampling block with a level-up combination in the decoder stage to enhance +the texture. We also propose the Convformer block, based on the idea of the +Meta-former, to enhance the crucial information of the local features. These +blocks enable the combination of global information, such as the overall shape +of the polyp, with local and boundary information, which is crucial +for medical segmentation decisions. Our proposed approach achieved +competitive performance and obtained state-of-the-art results on +the CVC-300, Kvasir, and CVC-ColonDB datasets. Apart from Kvasir-SEG, the +other datasets are out of distribution. The implementation can be found at: +https://github.com/huyquoctrinh/MetaPolyp-CBMS2023. +
+
+
+
+
+ + ♻ ☆ Causal-Based Supervision of Attention in Graph Neural Network: A Better + and Simpler Choice towards Powerful Attention + + +
+ Recent years have witnessed the great potential of attention mechanism in +graph representation learning. However, while variants of attention-based GNNs +are setting new benchmarks for numerous real-world datasets, recent works have +pointed out that their induced attentions are less robust and generalizable +against noisy graphs due to lack of direct supervision. In this paper, we +present a new framework which utilizes the tool of causality to provide a +powerful supervision signal for the learning process of attention functions. +Specifically, we estimate the direct causal effect of attention to the final +prediction, and then maximize such effect to guide attention attending to more +meaningful neighbors. Our method can serve as a plug-and-play module for any +canonical attention-based GNNs in an end-to-end fashion. Extensive experiments +on a wide range of benchmark datasets illustrated that, by directly supervising +attention functions, the model is able to converge faster with a clearer +decision boundary, and thus yields better performances. + +
+
+
+
+
+ + ♻ ☆ Nonlinear Processing with Linear Optics + + +
+ Deep neural networks have achieved remarkable breakthroughs by leveraging +multiple layers of data processing to extract hidden representations, albeit at +the cost of large electronic computing power. To enhance energy efficiency and +speed, the optical implementation of neural networks aims to harness the +advantages of optical bandwidth and the energy efficiency of optical +interconnections. In the absence of low-power optical nonlinearities, the +challenge in the implementation of multilayer optical networks lies in +realizing multiple optical layers without resorting to electronic components. +In this study, we present a novel framework that uses multiple scattering to +synthesize programmable linear and nonlinear transformations +concurrently at low optical power, by leveraging the nonlinear relationship +between the scattering potential, represented by data, and the scattered field. +Theoretical and experimental investigations show that repeating the data through +multiple scattering enables nonlinear optical computing with low-power +continuous-wave light. +
+
+ comment: 20 pages, 9 figures and 1 table +
+
+
+
+
+ + ♻ ☆ Do DL models and training environments have an impact on energy + consumption? + + +
+ Current research in the computer vision field mainly focuses on improving +Deep Learning (DL) correctness and inference time performance. However, there +is still little work on the huge carbon footprint of training DL models. +This study aims to analyze the impact of the model architecture and training +environment when training greener computer vision models. We divide this goal +into two research questions. First, we analyze the effects of model +architecture on achieving greener models while keeping correctness at optimal +levels. Second, we study the influence of the training environment on producing +greener models. To investigate these relationships, we collect multiple metrics +related to energy efficiency and model correctness during the models' training. +Then, we outline the trade-offs between the measured energy efficiency and the +models' correctness regarding model architecture, and their relationship with +the training environment. We conduct this research in the context of a computer +vision system for image classification. In conclusion, we show that selecting +the proper model architecture and training environment can reduce energy +consumption dramatically (up to 98.83%) at the cost of negligible decreases in +correctness. Also, we find evidence that GPUs should scale with the models' +computational complexity for better energy efficiency. +
+
+ comment: 49th Euromicro Conference Series on Software Engineering and Advanced + Applications (SEAA). 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Geometric Ultrasound Localization Microscopy MICCAI 2023 + + +
+ Contrast-Enhanced Ultra-Sound (CEUS) has become a viable method for +non-invasive, dynamic visualization in medical diagnostics, yet Ultrasound +Localization Microscopy (ULM) has enabled a revolutionary breakthrough by +offering ten times higher resolution. To date, Delay-And-Sum (DAS) beamformers +are used to render ULM frames, ultimately determining the image resolution +capability. To take full advantage of ULM, this study questions whether +beamforming is the most effective processing step for ULM, suggesting an +alternative approach that relies solely on Time-Difference-of-Arrival (TDoA) +information. To this end, a novel geometric framework for micro bubble +localization via ellipse intersections is proposed to overcome existing +beamforming limitations. We present a benchmark comparison based on a public +dataset for which our geometric ULM outperforms existing baseline methods in +terms of accuracy and robustness while only utilizing a portion of the +available transducer data. + +
+
+ comment: Pre-print accepted for MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Stochastic Optimal Control for Collective Variable Free Sampling of + Molecular Transition Paths + + +
+ We consider the problem of sampling transition paths between two given +metastable states of a molecular system, e.g. a folded and unfolded protein or +products and reactants of a chemical reaction. Due to the existence of high +energy barriers separating the states, these transition paths are unlikely to +be sampled with standard Molecular Dynamics (MD) simulation. Traditional +methods to augment MD with a bias potential to increase the probability of the +transition rely on a dimensionality reduction step based on Collective +Variables (CVs). Unfortunately, selecting appropriate CVs requires chemical +intuition and traditional methods are therefore not always applicable to larger +systems. Additionally, when incorrect CVs are used, the bias potential might +not be minimal and bias the system along dimensions irrelevant to the +transition. Showing a formal relation between the problem of sampling molecular +transition paths, the Schr\"odinger bridge problem and stochastic optimal +control with neural network policies, we propose a machine learning method for +sampling said transitions. Unlike previous non-machine learning approaches, our +method, named PIPS, does not depend on CVs. We show that our method successfully +generates low energy transitions for Alanine Dipeptide as well as the larger +Polyproline and Chignolin proteins. +
+
+
+
+
+ + ♻ ☆ Deep Riemannian Networks for EEG Decoding + + +
+ State-of-the-art performance in electroencephalography (EEG) decoding tasks +is currently often achieved with either Deep-Learning (DL) or +Riemannian-Geometry-based decoders (RBDs). Recently, there has been growing interest +in Deep Riemannian Networks (DRNs), which could possibly combine the advantages of both +previous classes of methods. However, there are still a range of topics where +additional insight is needed to pave the way for a more widespread application +of DRNs in EEG. These include architecture design questions such as network +size and end-to-end ability. How these factors affect model performance has not +been explored. Additionally, it is not clear how the data within these networks +is transformed, and whether this would correlate with traditional EEG decoding. +Our study aims to lay the groundwork on these topics through the +analysis of DRNs for EEG with a wide range of hyperparameters. Networks were +tested on two public EEG datasets and compared with state-of-the-art ConvNets. +Here we propose end-to-end EEG SPDNet (EE(G)-SPDNet), and we show that this +wide, end-to-end DRN can outperform the ConvNets, and in doing so use +physiologically plausible frequency regions. We also show that the end-to-end +approach learns more complex filters than traditional band-pass filters +targeting the classical alpha, beta, and gamma frequency bands of the EEG, and +that performance can benefit from channel-specific filtering approaches. +Additionally, architectural analysis revealed areas for further improvement due +to the possible loss of Riemannian-specific information throughout the network. +Our study thus shows how to design and train DRNs to infer task-related +information from the raw EEG without the need for handcrafted filterbanks and +highlights the potential of end-to-end DRNs such as EE(G)-SPDNet for +high-performance EEG decoding. +
+
+ comment: 27 pages, 13 Figures +
+
+
+
+
+ + ♻ ☆ Scalable Coupling of Deep Learning with Logical Reasoning IJCAI'2023 + + +
+ In the ongoing quest for hybridizing discrete reasoning with neural nets, +there is an increasing interest in neural architectures that can learn how to +solve discrete reasoning or optimization problems from natural inputs. In this +paper, we introduce a scalable neural architecture and loss function dedicated +to learning the constraints and criteria of NP-hard reasoning problems +expressed as discrete Graphical Models. Our loss function solves one of the +main limitations of Besag's pseudo-loglikelihood, enabling learning of high +energies. We empirically show it is able to efficiently learn how to solve +NP-hard reasoning problems from natural inputs, such as the symbolic, visual, or +many-solutions Sudoku problems, as well as the energy optimization formulation +of the protein design problem, providing data efficiency, interpretability, and +\textit{a posteriori} control over predictions. +
+
+ comment: 10 pages, 2 figures, 6 tables. Published in IJCAI'2023 proceedings +
+
+
+
+
+ + ♻ ☆ CB-HVTNet: A channel-boosted hybrid vision transformer network for + lymphocyte assessment in histopathological images + + +
+ Transformers, due to their ability to learn long range dependencies, have +overcome the shortcomings of convolutional neural networks (CNNs) for global +perspective learning. Therefore, they have gained the focus of researchers for +several vision related tasks including medical diagnosis. However, their +multi-head attention module only captures global level feature representations, +which is insufficient for medical images. To address this issue, we propose a +Channel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning +to generate boosted channels and employs both transformers and CNNs to analyse +lymphocytes in histopathological images. The proposed CB HVT comprises five +modules, including a channel generation module, channel exploitation module, +channel merging module, region-aware module, and a detection and segmentation +head, which work together to effectively identify lymphocytes. The channel +generation module uses the idea of channel boosting through transfer learning +to extract diverse channels from different auxiliary learners. In the CB HVT, +these boosted channels are first concatenated and ranked using an attention +mechanism in the channel exploitation module. A fusion block is then utilized +in the channel merging module for a gradual and systematic merging of the +diverse boosted channels to improve the network's learning representations. The +CB HVT also employs a proposal network in its region aware module and a head to +effectively identify objects, even in overlapping regions and with artifacts. +We evaluated the proposed CB HVT on two publicly available datasets for +lymphocyte assessment in histopathological images. The results show that CB HVT +outperformed other state of the art detection models, and has good +generalization ability, demonstrating its value as a tool for pathologists. + +
+
+
+
+
+ + ♻ ☆ Revisiting the Robustness of the Minimum Error Entropy Criterion: A + Transfer Learning Case Study ECAI-23 + + +
+ Coping with distributional shifts is an important part of transfer learning +methods in order to perform well in real-life tasks. However, most of the +existing approaches in this area either focus on an ideal scenario in which the +data does not contain noises or employ a complicated training paradigm or model +design to deal with distributional shifts. In this paper, we revisit the +robustness of the minimum error entropy (MEE) criterion, a widely used +objective in statistical signal processing to deal with non-Gaussian noises, +and investigate its feasibility and usefulness in real-life transfer learning +regression tasks, where distributional shifts are common. Specifically, we put +forward a new theoretical result showing the robustness of MEE against +covariate shift. We also show that by simply replacing the mean squared error +(MSE) loss with the MEE on basic transfer learning algorithms such as +fine-tuning and linear probing, we can achieve competitive performance with +respect to state-of-the-art transfer learning algorithms. We justify our +arguments on both synthetic data and 5 real-world time-series data. + +
+
+ comment: Manuscript accepted at ECAI-23. Code available at + https://github.com/lpsilvestrin/mee-finetune +
+
+
+
+
+ + ♻ ☆ Generalizable Classification of UHF Partial Discharge Signals in + Gas-Insulated HVDC Systems Using Neural Networks + + +
+ Undetected partial discharges (PDs) are a safety critical issue in high +voltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC +voltage is well-established, the analysis of PDs under DC voltage remains an +active research field. A key focus of these investigations is the +classification of different PD sources to enable subsequent sophisticated +analysis. + In this paper, we propose and analyze a neural network-based approach for +classifying PD signals caused by metallic protrusions and conductive particles +on the insulator of HVDC GIS, without relying on pulse sequence analysis +features. In contrast to previous approaches, our proposed model can +discriminate the studied PD signals obtained at negative and positive +potentials, while also generalizing to unseen operating voltage multiples. +Additionally, we compare the performance of time- and frequency-domain input +signals and explore the impact of different normalization schemes to mitigate +the influence of free-space path loss between the sensor and defect location. + +
+
+ comment: 8 pages, submitted to IEEE Transactions on Power Delivery +
+
+
+
+
+ + ♻ ☆ Frouros: A Python library for drift detection in machine learning + systems + + +
+ Frouros is an open-source Python library capable of detecting drift in +machine learning systems. It provides a combination of classical and more +recent algorithms for drift detection: both concept and data drift. We have +designed it with the objective of making it compatible with any machine +learning framework and easily adaptable to real-world use cases. The library is +developed following a set of best development and continuous integration +practices to ensure ease of maintenance and extensibility. The source code is +available at https://github.com/IFCA/frouros. + +
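+ As an illustrative sketch of the kind of data-drift check such a library automates, the snippet below applies a plain two-sample Kolmogorov-Smirnov test to a reference (training-time) feature sample and a shifted production sample; this is a generic example of the concept, not Frouros' actual API.
+
+```python
+import numpy as np
+from scipy.stats import ks_2samp
+
+rng = np.random.default_rng(42)
+reference = rng.normal(loc=0.0, scale=1.0, size=5000)   # feature seen at training time
+production = rng.normal(loc=0.3, scale=1.0, size=5000)  # live feature, slightly shifted
+
+statistic, p_value = ks_2samp(reference, production)
+if p_value < 0.01:
+    print(f"data drift detected (KS statistic = {statistic:.3f})")
+```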
+
+ comment: 11 pages, 1 table +
+
+
+
+
+ + ♻ ☆ On-device modeling of user's social context and familiar places from + smartphone-embedded sensor data + + +
+ Context modeling and recognition represent complex tasks that allow mobile +and ubiquitous computing applications to adapt to the user's situation. Current +solutions mainly focus on limited context information generally processed on +centralized architectures, potentially exposing users' personal data to privacy +leakage, and missing personalization features. For these reasons, on-device +context modeling and recognition represent the current research trend in this +area. Among the different information characterizing the user's context in +mobile environments, social interactions and visited locations remarkably +contribute to the characterization of daily life scenarios. In this paper we +propose a novel, unsupervised and lightweight approach to model the user's +social context and her locations based on ego networks directly on the user's +mobile device. Relying on this model, the system is able to extract high-level +and semantic-rich context features from smartphone-embedded sensor data. +Specifically, for the social context it exploits data related to both physical +and cyber social interactions among users and their devices. As far as location +context is concerned, we assume that it is more relevant to model the +familiarity degree of a specific location for the user's context than the raw +location data, both in terms of GPS coordinates and proximity devices. By using +5 real-world datasets, we assess the structure of the social and location ego +networks, we provide a semantic evaluation of the proposed models and a +complexity evaluation in terms of mobile computing performance. Finally, we +demonstrate the relevance of the extracted features by showing the performance +of 3 machine learning algorithms in recognizing daily-life situations, obtaining +improvements of 3% in AUROC, 9% in Precision, and 5% in Recall compared with +using only features related to the physical context. +
+
+ comment: I request the withdrawal of the paper because it has been already + submitted (and published) on arXiv with identifier 2306.15437 +
+
+
+
+
+ + ♻ ☆ From random-walks to graph-sprints: a low-latency node embedding + framework on continuous-time dynamic graphs + + +
+ Many real-world datasets have an underlying dynamic graph structure, where +entities and their interactions evolve over time. Machine learning models +should consider these dynamics in order to harness their full potential in +downstream tasks. Previous approaches for graph representation learning have +focused on either sampling k-hop neighborhoods, akin to breadth-first search, +or random walks, akin to depth-first search. However, these methods are +computationally expensive and unsuitable for real-time, low-latency inference +on dynamic graphs. To overcome these limitations, we propose graph-sprints, a +general-purpose feature extraction framework for continuous-time dynamic graphs +(CTDGs) that has low latency and is competitive with state-of-the-art, +higher-latency models. To achieve this, a streaming, low-latency approximation +to the random-walk-based features is proposed. In our framework, time-aware node +embeddings summarizing multi-hop information are computed using only single-hop +operations on the incoming edges. We evaluate our proposed approach on three +open-source datasets and two in-house datasets, and compare with three +state-of-the-art algorithms (TGN-attn, TGN-ID, Jodie). We demonstrate that our +graph-sprints features, combined with a machine learning classifier, achieve +competitive performance (outperforming all baselines for the node +classification tasks in five datasets). Simultaneously, graph-sprints +significantly reduce inference latencies, achieving close to an order of +magnitude speed-up in our experimental setting. +
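+ To make the single-hop, streaming idea concrete, here is a toy sketch in which each node keeps a time-decayed state that is updated only when an edge arrives, mixing in the source node's (already multi-hop) state; the decay rule, mixing weights, and scalar features are assumptions for illustration, not the paper's exact recursion.
+
+```python
+import math
+from collections import defaultdict
+
+# node -> (embedding value, timestamp of last update); scalars for simplicity
+state = defaultdict(lambda: {"vec": 0.0, "last_t": 0.0})
+
+def on_edge(src, dst, t, edge_feat, half_life=3600.0, mix=0.5):
+    """Single-hop update of the destination node when edge (src -> dst) arrives at time t."""
+    s = state[dst]
+    decay = math.exp(-(t - s["last_t"]) * math.log(2) / half_life)  # time-aware decay
+    s["vec"] = decay * s["vec"] + mix * state[src]["vec"] + edge_feat
+    s["last_t"] = t
+
+# Streaming usage: node embeddings stay up to date as edges arrive.
+for src, dst, t, feat in [(1, 2, 10.0, 1.0), (2, 3, 20.0, 0.5), (3, 1, 30.0, 2.0)]:
+    on_edge(src, dst, t, feat)
+```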
+
+ comment: 9 pages, 5 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Resource frugal optimizer for quantum machine learning + + +
+ Quantum-enhanced data science, also known as quantum machine learning (QML), +is of growing interest as an application of near-term quantum computers. +Variational QML algorithms have the potential to solve practical problems on +real hardware, particularly when involving quantum data. However, training +these algorithms can be challenging and calls for tailored optimization +procedures. Specifically, QML applications can require a large shot-count +overhead due to the large datasets involved. In this work, we advocate for +simultaneous random sampling over both the dataset as well as the measurement +operators that define the loss function. We consider a highly general loss +function that encompasses many QML applications, and we show how to construct +an unbiased estimator of its gradient. This allows us to propose a shot-frugal +gradient descent optimizer called Refoqus (REsource Frugal Optimizer for +QUantum Stochastic gradient descent). Our numerics indicate that Refoqus can +save several orders of magnitude in shot cost, even relative to optimizers that +sample over measurement operators alone. + +
+
+ comment: 22 pages, 6 figures - extra quantum autoencoder results added +
+
+
+
+
+ + ♻ ☆ Secrets of RLHF in Large Language Models Part I: PPO + + +
+ Large language models (LLMs) have formulated a blueprint for the advancement +of artificial general intelligence. Their primary objective is to function as a +human-centric (helpful, honest, and harmless) assistant. Alignment with humans +assumes paramount significance, and reinforcement learning with human feedback +(RLHF) emerges as the pivotal technological paradigm underpinning this pursuit. +Current technical routes usually include \textbf{reward models} to measure +human preferences, \textbf{Proximal Policy Optimization} (PPO) to optimize +policy model outputs, and \textbf{process supervision} to improve step-by-step +reasoning capabilities. However, due to the challenges of reward design, +environment interaction, and agent training, coupled with the huge trial-and-error +cost of large language models, there is a significant barrier for AI +researchers seeking to advance technical alignment and the safe landing +of LLMs. The stable training of RLHF remains a puzzle. In the first +report, we dissect the framework of RLHF, re-evaluate the inner workings of +PPO, and explore how the parts comprising PPO algorithms impact policy agent +training. We identify policy constraints as the key factor for the effective +implementation of the PPO algorithm. Therefore, we explore PPO-max, an +advanced version of the PPO algorithm, to efficiently improve the training +stability of the policy model. Based on our main results, we perform a +comprehensive analysis of RLHF abilities compared with SFT models and ChatGPT. +The absence of open-source implementations has posed significant challenges to +the investigation of LLMs alignment. Therefore, we are eager to release +technical reports, reward models and PPO codes, aiming to make modest +contributions to the advancement of LLMs. +
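+ For reference, the policy constraint at the heart of PPO is its clipped surrogate objective; the minimal sketch below shows that loss term alone (the clip ratio value is an illustrative assumption, and the KL, value, and entropy terms a full RLHF trainer would add are omitted).
+
+```python
+import torch
+
+def ppo_clip_loss(log_probs_new, log_probs_old, advantages, clip_eps=0.2):
+    """Clipped surrogate policy loss (to be minimized)."""
+    ratio = torch.exp(log_probs_new - log_probs_old)             # pi_new / pi_old per token
+    unclipped = ratio * advantages
+    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
+    return -torch.min(unclipped, clipped).mean()                 # maximize the surrogate
+
+# Toy usage with made-up tensors:
+loss = ppo_clip_loss(torch.randn(8), torch.randn(8), torch.randn(8))
+```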
+
+
+
+
+ + ♻ ☆ Exploiting Noise as a Resource for Computation and Learning in Spiking + Neural Networks + + +
+ Networks of spiking neurons underpin the extraordinary information-processing +capabilities of the brain and have become pillar models in neuromorphic +artificial intelligence. Despite extensive research on spiking neural networks +(SNNs), most studies are established on deterministic models, overlooking the +inherent non-deterministic, noisy nature of neural computations. This study +introduces the noisy spiking neural network (NSNN) and the noise-driven +learning rule (NDL) by incorporating noisy neuronal dynamics to exploit the +computational advantages of noisy neural processing. NSNN provides a +theoretical framework that yields scalable, flexible, and reliable computation. +We demonstrate that NSNN leads to spiking neural models with competitive +performance, improved robustness against challenging perturbations compared with +deterministic SNNs, and better reproduction of probabilistic neural computation in +neural coding. This study offers a powerful and easy-to-use tool for machine +learning, neuromorphic intelligence practitioners, and computational +neuroscience researchers. +
+
+
+
+
+ + ♻ ☆ Robust Counterfactual Explanations for Neural Networks With + Probabilistic Guarantees ICML + + +
+ There is an emerging interest in generating robust counterfactual +explanations that would remain valid if the model is updated or changed even +slightly. Towards finding robust counterfactuals, existing literature often +assumes that the original model $m$ and the new model $M$ are bounded in the +parameter space, i.e., $\|\text{Params}(M){-}\text{Params}(m)\|{<}\Delta$. +However, models can often change significantly in the parameter space with +little to no change in their predictions or accuracy on the given dataset. In +this work, we introduce a mathematical abstraction termed +\emph{naturally-occurring} model change, which allows for arbitrary changes in +the parameter space such that the change in predictions on points that lie on +the data manifold is limited. Next, we propose a measure -- that we call +\emph{Stability} -- to quantify the robustness of counterfactuals to potential +model changes for differentiable models, e.g., neural networks. Our main +contribution is to show that counterfactuals with sufficiently high value of +\emph{Stability} as defined by our measure will remain valid after potential +``naturally-occurring'' model changes with high probability (leveraging +concentration bounds for Lipschitz function of independent Gaussians). Since +our quantification depends on the local Lipschitz constant around a data point +which is not always available, we also examine practical relaxations of our +proposed measure and demonstrate experimentally how they can be incorporated to +find robust counterfactuals for neural networks that are close, realistic, and +remain valid after potential model changes. This work also has interesting +connections with model multiplicity, also known as, the Rashomon effect. + +
+
+ comment: International Conference on Machine Learning (ICML), 2023 +
+
+
+
+
+ + ♻ ☆ Continuous Monte Carlo Graph Search + + +
+ In many complex sequential decision-making tasks, online planning is crucial +for high performance. For efficient online planning, Monte Carlo Tree Search +(MCTS) employs a principled mechanism for trading off exploration for +exploitation. MCTS outperforms comparison methods in many discrete +decision-making domains such as Go, Chess, and Shogi. Subsequently, extensions of +MCTS to continuous domains have been proposed. However, the inherent high +branching factor and the resulting explosion of the search tree size limit +existing methods. To address this problem, we propose Continuous Monte Carlo +Graph Search (CMCGS), a novel extension of MCTS to online planning in +environments with continuous state and action spaces. CMCGS takes advantage of +the insight that, during planning, sharing the same action policy between +several states can yield high performance. To implement this idea, at each time +step, CMCGS clusters similar states into a limited number of stochastic action +bandit nodes, which produce a layered directed graph instead of an MCTS search +tree. Experimental evaluation shows that CMCGS outperforms comparable planning +methods in several complex continuous DeepMind Control Suite benchmarks and a +2D navigation task with limited sample budgets. Furthermore, CMCGS can be +parallelized to scale up and it outperforms the Cross-Entropy Method (CEM) in +continuous control with learned dynamics models. +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Mitigating Adversarial Vulnerability through Causal Parameter Estimation + by Adversarial Double Machine Learning ICCV 2023 + + +
+ Adversarial examples derived from deliberately crafted perturbations on +visual inputs can easily harm the decision process of deep neural networks. To +prevent potential threats, various adversarial training-based defense methods +have grown rapidly and become a de facto standard approach for robustness. +Despite recent competitive achievements, we observe that adversarial +vulnerability varies across targets and certain vulnerabilities remain +prevalent. Intriguingly, such a peculiar phenomenon cannot be relieved even with +deeper architectures and advanced defense methods. To address this issue, in +this paper, we introduce a causal approach called Adversarial Double Machine +Learning (ADML), which allows us to quantify the degree of adversarial +vulnerability for network predictions and capture the effect of treatments on +outcomes of interest. ADML can directly estimate the causal parameter of +adversarial perturbations per se and mitigate negative effects that can +potentially damage robustness, bridging a causal perspective into the +adversarial vulnerability. Through extensive experiments on various CNN and +Transformer architectures, we corroborate that ADML improves adversarial +robustness by large margins and alleviates the observed vulnerability. +
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Best-of-three-worlds Analysis for Linear Bandits with + Follow-the-regularized-leader Algorithm COLT 2023 + + +
+ The linear bandit problem has been studied for many years in both stochastic +and adversarial settings. Designing an algorithm that can optimize the +environment without knowing the loss type attracts lots of interest. +\citet{LeeLWZ021} propose an algorithm that actively detects the loss type and +then switches between different algorithms specially designed for specific +settings. However, such an approach requires meticulous designs to perform well +in all environments. Follow-the-regularized-leader (FTRL) is another type of +popular algorithm that can adapt to different environments. This algorithm is +of simple design and the regret bounds are shown to be optimal in traditional +multi-armed bandit problems compared with the detect-switch type. Designing an +FTRL-type algorithm for linear bandits is an important question that has been +open for a long time. In this paper, we prove that the FTRL algorithm with a +negative entropy regularizer can achieve the best-of-three-world results for +the linear bandit problem. Our regret bounds achieve the same or nearly the +same order as the previous detect-switch type algorithm but with a much simpler +algorithmic design. + +
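+ For readers unfamiliar with FTRL, its generic update over the action set $\mathcal{X}$ takes the form $x_t = \operatorname{arg\,min}_{x \in \mathcal{X}} \big\langle x, \sum_{s=1}^{t-1} \hat{\ell}_s \big\rangle + \frac{1}{\eta_t}\,\psi(x)$, where $\hat{\ell}_s$ are estimated loss vectors, $\psi$ is the regularizer (here a negative-entropy one), and $\eta_t$ is a learning-rate schedule; the specific loss estimator and schedule are defined in the paper, so this is only the standard template rather than the paper's exact algorithm.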
+
+ comment: Accepted in COLT 2023 +
+
+
+
+
+ + ♻ ☆ Macro Placement by Wire-Mask-Guided Black-Box Optimization + + +
+ The development of very large-scale integration (VLSI) technology has posed +new challenges for electronic design automation (EDA) techniques in chip +floorplanning. During this process, macro placement is an important subproblem, +which tries to determine the positions of all macros with the aim of minimizing +half-perimeter wirelength (HPWL) and avoiding overlapping. Previous methods +include packing-based, analytical and reinforcement learning methods. In this +paper, we propose a new black-box optimization (BBO) framework (called +WireMask-BBO) for macro placement, by using a wire-mask-guided greedy procedure +for objective evaluation. Equipped with different BBO algorithms, WireMask-BBO +empirically achieves significant improvements over previous methods, i.e., +achieves significantly shorter HPWL by using much less time. Furthermore, it +can fine-tune existing placements by treating them as initial solutions, which +can bring up to 50% improvement in HPWL. WireMask-BBO has the potential to +significantly improve the quality and efficiency of chip floorplanning, which +makes it appealing to researchers and practitioners in EDA and will also +promote the application of BBO. + +
+
+ comment: Update Table1 number dislocation +
+
+
+
+
+ + ♻ ☆ Identifying TBI Physiological States by Clustering Multivariate Clinical + Time-Series Data + + +
+ Determining clinically relevant physiological states from multivariate time +series data with missing values is essential for providing appropriate +treatment for acute conditions such as Traumatic Brain Injury (TBI), +respiratory failure, and heart failure. Utilizing non-temporal clustering or +data imputation and aggregation techniques may lead to loss of valuable +information and biased analyses. In our study, we apply the SLAC-Time +algorithm, an innovative self-supervision-based approach that maintains data +integrity by avoiding imputation or aggregation, offering a more useful +representation of acute patient states. By using SLAC-Time to cluster data in a +large research dataset, we identified three distinct TBI physiological states +and their specific feature profiles. We employed various clustering evaluation +metrics and incorporated input from a clinical domain expert to validate and +interpret the identified physiological states. Further, we discovered how +specific clinical events and interventions can influence patient states and +state transitions. + +
+
+ comment: 10 pages, 7 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ On the Robustness of Split Learning against Adversarial Attacks ECAI 2023 + + +
+ Split learning enables collaborative deep learning model training while +preserving data privacy and model security by avoiding direct sharing of raw +data and model details (i.e., server and clients only hold partial sub-networks +and exchange intermediate computations). However, existing research has mainly +focused on examining its reliability for privacy protection, with little +investigation into model security. Specifically, by exploring full models, +attackers can launch adversarial attacks, and split learning can mitigate this +severe threat by only disclosing part of the model to untrusted servers. This paper +aims to evaluate the robustness of split learning against adversarial attacks, +particularly in the most challenging setting where untrusted servers only have +access to the intermediate layers of the model. Existing adversarial attacks +mostly focus on the centralized setting instead of the collaborative setting; +thus, to better evaluate the robustness of split learning, we develop a +tailored attack called SPADV, which comprises two stages: 1) shadow model +training that addresses the issue of lacking part of the model, and 2) a local +adversarial attack that produces adversarial examples for evaluation. The first +stage only requires a small amount of unlabeled non-IID data, and, in the second stage, +SPADV perturbs the intermediate output of natural samples to craft the +adversarial ones. The overall cost of the proposed attack process is relatively +low, yet the empirical attack effectiveness is significantly high, +demonstrating the surprising vulnerability of split learning to adversarial +attacks. +
+
+ comment: accepted by ECAI 2023, camera-ready version +
+
+
+
+
+ + ♻ ☆ Unified Off-Policy Learning to Rank: a Reinforcement Learning + Perspective + + +
+ Off-policy Learning to Rank (LTR) aims to optimize a ranker from data +collected by a deployed logging policy. However, existing off-policy learning +to rank methods often make strong assumptions about how users generate the +click data, i.e., the click model, and hence need to tailor their methods +specifically under different click models. In this paper, we unify the +ranking process under general stochastic click models as a Markov Decision +Process (MDP), and show that the optimal ranking can be learned with offline +reinforcement learning (RL) directly. Building upon this, we leverage offline +RL techniques for off-policy LTR and propose the Click Model-Agnostic Unified +Off-policy Learning to Rank (CUOLR) method, which can be easily applied to a +wide range of click models. Through a dedicated formulation of the MDP, we show +that offline RL algorithms can adapt to various click models without complex +debiasing techniques and prior knowledge of the model. Results on various +large-scale datasets demonstrate that CUOLR consistently outperforms the +state-of-the-art off-policy learning to rank algorithms while maintaining +consistency and robustness under different click models. +
+
+
+
+
+ + ♻ ☆ Deep Learning for Mean Field Games with non-separable Hamiltonians + + +
+ This paper introduces a new method based on Deep Galerkin Methods (DGMs) for +solving high-dimensional stochastic Mean Field Games (MFGs). We achieve this by +using two neural networks to approximate the unknown solutions of the MFG +system and forward-backward conditions. Our method is efficient, even with a +small number of iterations, and is capable of handling up to 300 dimensions +with a single layer, which makes it faster than other approaches. In contrast, +methods based on Generative Adversarial Networks (GANs) cannot solve MFGs with +non-separable Hamiltonians. We demonstrate the effectiveness of our approach by +applying it to a traffic flow problem, which was previously solved using the +Newton iteration method only in the deterministic case. We compare the results +of our method to analytical solutions and previous approaches, showing its +efficiency. We also prove the convergence of our neural network approximation +with a single hidden layer using the universal approximation theorem. + +
+
+
+
+
+ + ♻ ☆ An R package for parametric estimation of causal effects + + +
+ This article explains the usage of the R package CausalModels, which is publicly +available on the Comprehensive R Archive Network. While several packages can +adequately estimate causal effects, none provides +a collection of structural models using the conventional statistical approach +developed by Hernan and Robins (2020). CausalModels addresses this deficiency +of software in R concerning causal inference by offering tools for methods that +account for biases in observational data without requiring extensive +statistical knowledge. These methods should not be ignored and may be more +appropriate or efficient in solving particular problems. While implementations +of these statistical models are distributed among a number of causal packages, +CausalModels introduces a simple and accessible framework for a consistent +modeling pipeline among a variety of statistical methods for estimating causal +effects in a single R package. It consists of common methods including +standardization, IP weighting, G-estimation, outcome regression, instrumental +variables and propensity matching. +
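+ As a concrete example of one of the listed methods, the inverse probability (IP) weighting estimator of the mean outcome under treatment level $a$ can be written as $\widehat{\mathbb{E}}[Y^{a}] = \frac{1}{n}\sum_{i=1}^{n} \frac{\mathbf{1}\{A_i = a\}\, Y_i}{\widehat{f}(A_i \mid L_i)}$, where $\widehat{f}(A \mid L)$ estimates the probability of the received treatment given the measured confounders $L$, so each subject is reweighted by the inverse of that probability (this is the standard textbook form, not necessarily the package's exact implementation).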
+
+
+
+
+ + ♻ ☆ An Empirical Study of the Effectiveness of Using a Replay Buffer on Mode + Discovery in GFlowNets ICML 2023 + + +
+ Reinforcement Learning (RL) algorithms aim to learn an optimal policy by +iteratively sampling actions to learn how to maximize the total expected +return, $R(x)$. GFlowNets are a special class of algorithms designed to +generate diverse candidates, $x$, from a discrete set, by learning a policy +that approximates the proportional sampling of $R(x)$. GFlowNets exhibit +improved mode discovery compared to conventional RL algorithms, which is very +useful for applications such as drug discovery and combinatorial search. +However, since GFlowNets are a relatively recent class of algorithms, many +techniques which are useful in RL have not yet been associated with them. In +this paper, we study the utilization of a replay buffer for GFlowNets. We +explore empirically various replay buffer sampling techniques and assess the +impact on the speed of mode discovery and the quality of the modes discovered. +Our experimental results in the Hypergrid toy domain and a molecule synthesis +environment demonstrate significant improvements in mode discovery when +training with a replay buffer, compared to training only with trajectories +generated on-policy. + +
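+ A minimal sketch of the replay-buffer component studied above, assuming the usual loop in which trajectories sampled on-policy are stored and later mixed back into training batches; the uniform sampling shown here is only one of the strategies the paper compares, and the capacity and batch composition are illustrative assumptions.
+
+```python
+import random
+from collections import deque
+
+class TrajectoryReplayBuffer:
+    def __init__(self, capacity: int = 10_000):
+        self.buffer = deque(maxlen=capacity)  # oldest entries are evicted first
+
+    def add(self, trajectory, reward: float):
+        self.buffer.append((trajectory, reward))
+
+    def sample(self, batch_size: int):
+        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
+
+# Usage inside a training iteration: mix on-policy trajectories with replayed ones.
+buffer = TrajectoryReplayBuffer()
+buffer.add(trajectory=["s0", "s1", "s2"], reward=1.7)
+replayed = buffer.sample(batch_size=16)
+```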
+
+ comment: Accepted to ICML 2023 workshop on Structured Probabilistic Inference + & Generative Modeling +
+
+
+
+
+ + ♻ ☆ DESCN: Deep Entire Space Cross Networks for Individual Treatment Effect + Estimation KDD 2022 + + +
+ Causal Inference has wide applications in various areas such as E-commerce +and precision medicine, and its performance heavily relies on the accurate +estimation of the Individual Treatment Effect (ITE). Conventionally, ITE is +predicted by modeling the treated and control response functions separately in +their individual sample spaces. However, such an approach usually encounters +two issues in practice, i.e. divergent distribution between treated and control +groups due to treatment bias, and significant sample imbalance of their +population sizes. This paper proposes Deep Entire Space Cross Networks (DESCN) +to model treatment effects from an end-to-end perspective. DESCN captures the +integrated information of the treatment propensity, the response, and the +hidden treatment effect through a cross network in a multi-task learning +manner. Our method jointly learns the treatment and response functions in the +entire sample space to avoid treatment bias and employs an intermediate pseudo +treatment effect prediction network to relieve sample imbalance. Extensive +experiments are conducted on a synthetic dataset and a large-scaled production +dataset from the E-commerce voucher distribution business. The results indicate +that DESCN can successfully enhance the accuracy of ITE estimation and improve +the uplift ranking performance. A sample of the production dataset and the +source code are released to facilitate future research in the community, which +is, to the best of our knowledge, the first large-scale public biased treatment +dataset for causal inference. + +
+
+ comment: Accepted by SIGKDD 2022 Applied Data Science Track +
+
+
+
+
+ + ♻ ☆ Nonuniqueness and Convergence to Equivalent Solutions in Observer-based + Inverse Reinforcement Learning + + +
+ A key challenge in solving the deterministic inverse reinforcement learning +(IRL) problem online and in real-time is the existence of multiple solutions. +Nonuniqueness necessitates the study of the notion of equivalent solutions, +i.e., solutions that result in a different cost functional but same feedback +matrix, and convergence to such solutions. While offline algorithms that result +in convergence to equivalent solutions have been developed in the literature, +online, real-time techniques that address nonuniqueness are not available. In +this paper, a regularized history stack observer that converges to +approximately equivalent solutions of the IRL problem is developed. Novel +data-richness conditions are developed to facilitate the analysis and +simulation results are provided to demonstrate the effectiveness of the +developed technique. + +
+
+ comment: 16 pages, 7 figures, submitted to American Controls Conference 2023 +
+
+
+
+
+ + ♻ ☆ Multi-class Graph Clustering via Approximated Effective $p$-Resistance ICML2023 + + +
+ This paper develops an approximation to the (effective) $p$-resistance and +applies it to multi-class clustering. Spectral methods based on the graph +Laplacian and its generalization to the graph $p$-Laplacian have been a +backbone of non-euclidean clustering techniques. The advantage of the +$p$-Laplacian is that the parameter $p$ induces a controllable bias on cluster +structure. The drawback of $p$-Laplacian eigenvector based methods is that the +third and higher eigenvectors are difficult to compute. Thus, instead, we are +motivated to use the $p$-resistance induced by the $p$-Laplacian for +clustering. For $p$-resistance, small $p$ biases towards clusters with high +internal connectivity while large $p$ biases towards clusters of small +"extent," that is a preference for smaller shortest-path distances between +vertices in the cluster. However, the $p$-resistance is expensive to compute. +We overcome this by developing an approximation to the $p$-resistance. We prove +upper and lower bounds on this approximation and observe that it is exact when +the graph is a tree. We also provide theoretical justification for the use of +$p$-resistance for clustering. Finally, we provide experiments comparing our +approximated $p$-resistance clustering to other $p$-Laplacian based methods. + +
+
+ comment: Accepted to ICML2023 +
+
+
+
+
+ + ♻ ☆ Declarative Mechanism Design + + +
+ Regulation of Multi-Agent Systems (MAS) and Declarative Electronic +Institutions (DEIs) has been a multidisciplinary research topic of the past decade, +involving (physical and software) agents and law from the beginning, and has +recently evolved towards the news-claimed "robot lawyer" since 2016. One of the +first proposals for restricting the behaviour of Software Agents was Electronic +Institutions. However, with the recent reformulation of Artificial Neural +Networks (ANNs) as Deep Learning (DL), Security, Privacy, Ethical and Legal +issues regarding the use of DL have raised concerns in the Artificial +Intelligence (AI) Community. Now that the Regulation of MAS is almost correctly +addressed, we propose the Regulation of Artificial Neural Networks as +Agent-based Training of a special type of regulated Artificial Neural Network +that we call Institutional Neural Network (INN). The main purpose of this paper +is to bring attention to Artificial Teaching (AT) and to give a tentative +answer showing a proof-of-concept implementation of Regulated Deep Learning +(RDL). This paper introduces the former concept and provides sI, a language +previously used to declaratively model and extend Electronic Institutions, as a +means to regulate the execution of Artificial Neural Networks and their +interactions with Artificial Teachers (ATs). +
+
+
+
+
+ + ♻ ☆ Contagion Effect Estimation Using Proximal Embeddings + + +
+ Contagion effect refers to the causal effect of peers' behavior on the +outcome of an individual in social networks. While prominent methods for +estimating contagion effects in observational studies often assume that there +are no unmeasured confounders, contagion can be confounded due to latent +homophily: nodes in a homophilous network tend to have ties to peers with +similar attributes and can behave similarly without influencing one another. +One way to account for latent homophily is by considering proxies for the +unobserved confounders. However, in the presence of high-dimensional proxies, +proxy-based methods can lead to substantially biased estimation of contagion +effects, as we demonstrate in this paper. To tackle this issue, we introduce +the novel Proximal Embeddings (ProEmb), a framework which integrates +Variational Autoencoders (VAEs) and adversarial networks to generate balanced +low-dimensional representations of high-dimensional proxies for different +treatment groups and identifies contagion effects in the presence of unobserved +network confounders. We empirically show that our method significantly +increases the accuracy of contagion effect estimation in observational network +data compared to state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Robust Field-level Likelihood-free Inference with Galaxies + + +
+ We train graph neural networks to perform field-level likelihood-free
+inference using galaxy catalogs from state-of-the-art hydrodynamic simulations
+of the CAMELS project. Our models are rotational, translational, and
+permutation invariant and do not impose any cut on scale. From galaxy catalogs
+that only contain $3$D positions and radial velocities of $\sim 1,000$
+galaxies in tiny $(25~h^{-1}{\rm Mpc})^3$ volumes, our models can infer the
+value of $\Omega_{\rm m}$ with approximately $12\%$ precision. More
+importantly, by testing the models on galaxy catalogs from thousands of
+hydrodynamic simulations, each having a different efficiency of supernova and
+AGN feedback, run with five different codes and subgrid models - IllustrisTNG,
+SIMBA, Astrid, Magneticum, SWIFT-EAGLE -, we find that our models are robust to
+changes in astrophysics, subgrid physics, and subhalo/galaxy finder.
+Furthermore, we test our models on $1,024$ simulations that cover a vast region
+in parameter space - variations in $5$ cosmological and $23$ astrophysical
+parameters - finding that the model extrapolates remarkably well. Our results
+indicate that the key to building a robust model is the use of both galaxy
+positions and velocities, suggesting that the network has likely learned an
+underlying physical relation that does not depend on galaxy formation and is
+valid on scales larger than $\sim10~h^{-1}{\rm kpc}$. + +
+
+ comment: 34 pages, 12 figures. For a video summarizing the results, see + https://youtu.be/b59ep7cyPOs +
+
+
+
+
+ + ♻ ☆ Neuro-symbolic Empowered Denoising Diffusion Probabilistic Models for + Real-time Anomaly Detection in Industry 4.0 + + +
+ Industry 4.0 involves the integration of digital technologies, such as IoT, +Big Data, and AI, into manufacturing and industrial processes to increase +efficiency and productivity. As these technologies become more interconnected +and interdependent, Industry 4.0 systems become more complex, which brings the +difficulty of identifying and stopping anomalies that may cause disturbances in +the manufacturing process. This paper aims to propose a diffusion-based model +for real-time anomaly prediction in Industry 4.0 processes. Using a +neuro-symbolic approach, we integrate industrial ontologies in the model, +thereby adding formal knowledge on smart manufacturing. Finally, we propose a +simple yet effective way of distilling diffusion models through Random Fourier +Features for deployment on an embedded system for direct integration into the +manufacturing process. To the best of our knowledge, this approach has never +been explored before. + +
+
+ comment: Accepted at the 26th Forum on specification and Design Languages (FDL + 2023) +
+
+
+
+
+ + ♻ ☆ Strong Optimal Classification Trees + + +
+ Decision trees are among the most popular machine learning models and are +used routinely in applications ranging from revenue management and medicine to +bioinformatics. In this paper, we consider the problem of learning optimal +binary classification trees with univariate splits. Literature on the topic has +burgeoned in recent years, motivated both by the empirical suboptimality of +heuristic approaches and the tremendous improvements in mixed-integer +optimization (MIO) technology. Yet, existing MIO-based approaches from the +literature do not leverage the power of MIO to its full extent: they rely on +weak formulations, resulting in slow convergence and large optimality gaps. To +fill this gap in the literature, we propose an intuitive flow-based MIO +formulation for learning optimal binary classification trees. Our formulation +can accommodate side constraints to enable the design of interpretable and fair +decision trees. Moreover, we show that our formulation has a stronger linear +optimization relaxation than existing methods in the case of binary data. We +exploit the decomposable structure of our formulation and max-flow/min-cut +duality to derive a Benders' decomposition method to speed-up computation. We +propose a tailored procedure for solving each decomposed subproblem that +provably generates facets of the feasible set of the MIO as constraints to add +to the main problem. We conduct extensive computational experiments on standard +benchmark datasets on which we show that our proposed approaches are 29 times +faster than state-of-the-art MIO-based techniques and improve out-of-sample +performance by up to 8%. + +
+
+
+
+
+ + ♻ ☆ MAFAT: Memory-Aware Fusing and Tiling of Neural Networks for Accelerated + Edge Inference + + +
+ A rising research challenge is running costly machine learning (ML) networks
+locally on resource-constrained edge devices. ML networks with large
+convolutional layers can easily exceed available memory, increasing latency due
+to excessive OS swapping. Previous memory reduction techniques such as pruning
+and quantization reduce model accuracy and often require retraining.
+Alternatively, distributed methods partition the convolutions into equivalent
+smaller sub-computations, but the implementations introduce communication costs
+and require a network of devices. Distributed partitioning approaches can,
+however, also be used to run in a reduced memory footprint on a single device
+by subdividing the network into smaller operations. In this paper, we extend
+prior work on distributed partitioning into a memory-aware execution on a
+single device. Our approach extends prior fusing strategies to allow for
+multiple groups of convolutional layers that are fused and tiled independently.
+This enables trading off overhead versus data reuse in order to specifically
+reduce the memory footprint. We propose a memory usage predictor coupled with a
+search algorithm to provide optimized fusing and tiling configurations for an
+arbitrary set of convolutional layers. When applied to the YOLOv2 object
+detection network, results show that our approach can run in less than half the
+memory, and with a speedup of up to 2.78x under severe memory constraints.
+Additionally, our algorithm returns a configuration with a latency that is
+within 6% of the best latency measured in a manual search. + +
+
+
+
+
+ + ♻ ☆ Emergent Asymmetry of Precision and Recall for Measuring Fidelity and + Diversity of Generative Models in High Dimensions ICML 2023 + + +
+ Precision and Recall are two prominent metrics of generative performance,
+which were proposed to separately measure the fidelity and diversity of
+generative models. Given their central role in comparing and improving
+generative models, understanding their limitations is crucially important. To
+that end, in this work, we identify a critical flaw in the common approximation
+of these metrics using k-nearest-neighbors, namely, that the very
+interpretations of fidelity and diversity that are assigned to Precision and
+Recall can fail in high dimensions, resulting in very misleading conclusions.
+Specifically, we empirically and theoretically show that as the number of
+dimensions grows, two model distributions with supports at equal point-wise
+distance from the support of the real distribution can have vastly different
+Precision and Recall regardless of their respective distributions, hence an
+emergent asymmetry in high dimensions. Based on our theoretical insights, we
+then provide simple yet effective modifications to these metrics to construct
+symmetric metrics regardless of the number of dimensions. Finally, we provide
+experiments on real-world datasets to illustrate that the identified flaw is
+not merely a pathological case, and that our proposed metrics are effective in
+alleviating its impact. + +
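+ For context, below is a simplified numpy sketch of the common
+k-nearest-neighbor approximation of Precision and Recall that the abstract
+critiques (a point counts as covered if it falls inside some kNN hypersphere of
+the other set); the authors' corrected, symmetric metrics are not reproduced.
+```python
+import numpy as np
+
+def knn_radii(x, k):
+    """Distance from each point in x to its k-th nearest neighbour within x."""
+    d = np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
+    d.sort(axis=1)
+    return d[:, k]          # column 0 is the distance to the point itself (0)
+
+def precision_recall(real, fake, k=3):
+    """kNN-based Precision (fake covered by real manifold) and Recall (vice versa)."""
+    r_real, r_fake = knn_radii(real, k), knn_radii(fake, k)
+    d = np.linalg.norm(fake[:, None, :] - real[None, :, :], axis=-1)   # (n_fake, n_real)
+    precision = np.mean((d <= r_real[None, :]).any(axis=1))
+    recall = np.mean((d <= r_fake[:, None]).any(axis=0))
+    return precision, recall
+
+rng = np.random.default_rng(0)
+real = rng.normal(size=(200, 8))
+fake = rng.normal(loc=0.5, size=(200, 8))
+print(precision_recall(real, fake))
+```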
+
+ comment: To appear in ICML 2023. Updated proof in Appendix B +
+
+
+
+
+ + ♻ ☆ Prediction intervals for neural network models using weighted asymmetric + loss functions + + +
+ We propose a simple and efficient approach to generate prediction intervals
+(PIs) for approximated and forecasted trends. Our method leverages a weighted
+asymmetric loss function to estimate the lower and upper bounds of a PI, with
+the weights determined by its coverage probability. We provide a concise
+mathematical proof of the method, show how it can be extended to derive PIs for
+parametrised functions and discuss its effectiveness when training deep neural
+networks. The presented tests of the method on a real-world forecasting task
+using a neural network-based model show that it can produce reliable PIs in
+complex machine learning scenarios. + +
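+ A minimal sketch of a weighted asymmetric (pinball-style) loss in the spirit
+of the abstract, assuming the lower and upper bounds are fit by minimizing
+quantile-like losses; the paper's exact weighting scheme may differ.
+```python
+import numpy as np
+
+def asymmetric_loss(y_true, y_pred, w):
+    """Weighted asymmetric loss: over- and under-estimation are penalised unequally.
+    w close to 1 pushes predictions up (upper bound), w close to 0 pushes them down."""
+    err = y_true - y_pred
+    return np.mean(np.where(err > 0, w * err, (w - 1) * err))
+
+# Fitting constant upper/lower bounds on noisy data by brute-force search (illustration only).
+rng = np.random.default_rng(1)
+y = rng.normal(size=1000)
+grid = np.linspace(-3, 3, 601)
+upper = grid[np.argmin([asymmetric_loss(y, c, 0.95) for c in grid])]
+lower = grid[np.argmin([asymmetric_loss(y, c, 0.05) for c in grid])]
+print(f"90% PI ≈ [{lower:.2f}, {upper:.2f}]")   # roughly [-1.64, 1.64] for N(0,1) data
+```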
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Primal Estimated Subgradient Solver for SVM for Imbalanced + Classification + + +
+ We aim to demonstrate in experiments that our cost-sensitive PEGASOS SVM
+achieves good performance on imbalanced data sets with a Majority to Minority
+Ratio ranging from 8.6:1 to 130:1, and to ascertain whether including an
+intercept (bias), regularization and parameter choices affect performance on
+our selection of datasets. Although many resort to SMOTE methods, we aim for a
+less computationally intensive method. We evaluate performance by examining
+learning curves. These curves diagnose whether we overfit or underfit, or
+whether the random sample of data chosen during the process was not random or
+diverse enough in the dependent-variable class for the algorithm to generalize
+to unseen examples. We also examine validation curves of the hyperparameters
+versus the test and train error. We benchmark our PEGASOS Cost-Sensitive SVM's
+results against Ding's LINEAR SVM DECIDL method, which obtained an ROC-AUC of
+.5 on one dataset. Our work extends the work of Ding by incorporating kernels
+into the SVM. We use Python rather than MATLAB, as Python has dictionaries for
+storing mixed data types during multi-parameter cross-validation. + +
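+ A small numpy sketch of a Pegasos-style subgradient step with class-dependent
+misclassification costs, as a rough illustration of the cost-sensitive PEGASOS
+SVM described above; the hyperparameters and cost scheme here are assumptions,
+not the authors' settings.
+```python
+import numpy as np
+
+def pegasos_cost_sensitive(X, y, lam=0.01, epochs=20, cost_pos=10.0, cost_neg=1.0, seed=0):
+    """Pegasos-style SGD for a linear SVM with class-dependent misclassification costs.
+    y must be in {-1, +1}; cost_pos up-weights the (rare) positive class."""
+    rng = np.random.default_rng(seed)
+    n, d = X.shape
+    w, t = np.zeros(d), 0
+    for _ in range(epochs):
+        for i in rng.permutation(n):
+            t += 1
+            eta = 1.0 / (lam * t)                    # Pegasos step size
+            cost = cost_pos if y[i] > 0 else cost_neg
+            if y[i] * X[i] @ w < 1:                  # hinge-loss margin violated
+                w = (1 - eta * lam) * w + eta * cost * y[i] * X[i]
+            else:
+                w = (1 - eta * lam) * w
+    return w
+
+rng = np.random.default_rng(2)
+X = rng.normal(size=(500, 5))
+y = np.where(rng.random(500) < 0.1, 1, -1)           # roughly 9:1 class imbalance
+print(pegasos_cost_sensitive(X, y))
+```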
+
+ comment: 10 pages, 4 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ A Probabilistic Transformation of Distance-Based Outliers + + +
+ The scores of distance-based outlier detection methods are difficult to +interpret, making it challenging to determine a cut-off threshold between +normal and outlier data points without additional context. We describe a +generic transformation of distance-based outlier scores into interpretable, +probabilistic estimates. The transformation is ranking-stable and increases the +contrast between normal and outlier data points. Determining distance +relationships between data points is necessary to identify the nearest-neighbor +relationships in the data, yet, most of the computed distances are typically +discarded. We show that the distances to other data points can be used to model +distance probability distributions and, subsequently, use the distributions to +turn distance-based outlier scores into outlier probabilities. Our experiments +show that the probabilistic transformation does not impact detection +performance over numerous tabular and image benchmark datasets but results in +interpretable outlier scores with increased contrast between normal and outlier +samples. Our work generalizes to a wide range of distance-based outlier +detection methods, and because existing distance computations are used, it adds +no significant computational overhead. + +
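+ One simple illustration of the idea above: compute classic kNN-distance
+outlier scores, fit a Gaussian to them, and report each point's CDF value as an
+outlier probability. The transformation is monotone (ranking-stable); the
+paper's construction uses the full set of computed distances and may differ.
+```python
+import numpy as np
+from math import erf, sqrt
+
+def knn_scores(X, k=5):
+    """Classic distance-based outlier score: distance to the k-th nearest neighbour."""
+    d = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
+    np.fill_diagonal(d, np.inf)                      # exclude self-distances
+    return np.sort(d, axis=1)[:, k - 1]
+
+def outlier_probabilities(X, k=5):
+    """Map kNN scores into [0, 1] by fitting a Gaussian to the scores and
+    reporting each point's CDF value (illustrative sketch only)."""
+    s = knn_scores(X, k)
+    mu, sigma = s.mean(), s.std()
+    return np.array([0.5 * (1 + erf((v - mu) / (sigma * sqrt(2)))) for v in s])
+
+rng = np.random.default_rng(3)
+X = np.vstack([rng.normal(size=(100, 2)), [[8.0, 8.0]]])   # one obvious outlier
+p = outlier_probabilities(X)
+print(round(p[-1], 3), round(p[:-1].max(), 3))             # outlier ≈ 1.0, inliers clearly lower
+```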
+
+
+
+
+ + ♻ ☆ Learning from time-dependent streaming data with online stochastic + algorithms + + +
+ This paper addresses stochastic optimization in a streaming setting with +time-dependent and biased gradient estimates. We analyze several first-order +methods, including Stochastic Gradient Descent (SGD), mini-batch SGD, and +time-varying mini-batch SGD, along with their Polyak-Ruppert averages. Our +non-asymptotic analysis establishes novel heuristics that link dependence, +biases, and convexity levels, enabling accelerated convergence. Specifically, +our findings demonstrate that (i) time-varying mini-batch SGD methods have the +capability to break long- and short-range dependence structures, (ii) biased +SGD methods can achieve comparable performance to their unbiased counterparts, +and (iii) incorporating Polyak-Ruppert averaging can accelerate the convergence +of the stochastic optimization algorithms. To validate our theoretical +findings, we conduct a series of experiments using both simulated and real-life +time-dependent data. + +
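+ A toy numpy sketch of streaming SGD with a time-varying (growing) mini-batch
+size and Polyak-Ruppert averaging on a least-squares objective; the step-size
+and batch schedules are illustrative assumptions, not the paper's.
+```python
+import numpy as np
+
+def streaming_sgd(stream, dim, steps=200, lr0=0.5, batch0=8):
+    """SGD on a data stream with time-varying (growing) mini-batches and
+    Polyak-Ruppert averaging; a least-squares loss is assumed for the sketch."""
+    w, w_avg = np.zeros(dim), np.zeros(dim)
+    for t in range(1, steps + 1):
+        X, y = stream(batch0 + t)                  # time-varying mini-batch size b_t = b_0 + t
+        grad = X.T @ (X @ w - y) / len(y)          # gradient of 0.5 * ||Xw - y||^2 / n
+        w -= lr0 / np.sqrt(t) * grad               # decaying step size
+        w_avg += (w - w_avg) / t                   # running Polyak-Ruppert average
+    return w_avg
+
+rng = np.random.default_rng(4)
+w_true = np.array([1.0, -2.0, 0.5])
+def stream(b):                                     # stand-in for a time-dependent data source
+    X = rng.normal(size=(b, 3))
+    return X, X @ w_true + 0.1 * rng.normal(size=b)
+
+print(np.round(streaming_sgd(stream, dim=3), 2))   # close to w_true
+```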
+
+
+
+
+ + ♻ ☆ Active Learning for Single Neuron Models with Lipschitz Non-Linearities + + +
+ We consider the problem of active learning for single neuron models, also
+sometimes called ``ridge functions'', in the agnostic setting (under
+adversarial label noise). Such models have been shown to be broadly effective
+in modeling physical phenomena, and for constructing surrogate data-driven
+models for partial differential equations.
+ Surprisingly, we show that for a single neuron model with any Lipschitz
+non-linearity (such as the ReLU, sigmoid, absolute value, low-degree
+polynomial, among others), strong provable approximation guarantees can be
+obtained using a well-known active learning strategy for fitting \emph{linear
+functions} in the agnostic setting, i.e., the case when there is no
+non-linearity. Namely, we can collect samples via statistical \emph{leverage
+score sampling}, which has been shown to be near-optimal in other active
+learning scenarios. We support our theoretical results with empirical
+simulations showing that our proposed active learning strategy based on
+leverage score sampling outperforms (ordinary) uniform sampling when fitting
+single neuron models. + +
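+ A small numpy sketch of statistical leverage score sampling used to actively
+pick which points to label, followed by a crude reweighted least-squares fit on
+the queried labels; it illustrates the sampling strategy only and carries none
+of the paper's guarantees.
+```python
+import numpy as np
+
+def leverage_scores(X):
+    """Statistical leverage scores: diagonal of the hat matrix X (X^T X)^{-1} X^T."""
+    U, _, _ = np.linalg.svd(X, full_matrices=False)
+    return (U ** 2).sum(axis=1)
+
+def sample_by_leverage(X, m, rng):
+    """Draw m row indices with probability proportional to leverage, plus 1/p reweighting."""
+    p = leverage_scores(X)
+    p = p / p.sum()
+    idx = rng.choice(len(X), size=m, replace=True, p=p)
+    weights = 1.0 / (m * p[idx])                       # importance weights for unbiasedness
+    return idx, weights
+
+rng = np.random.default_rng(5)
+X = rng.normal(size=(5000, 10))
+w_true = rng.normal(size=10)
+y = np.maximum(X @ w_true, 0)                          # single ReLU neuron; labels are costly
+idx, wts = sample_by_leverage(X, m=200, rng=rng)
+sw = np.sqrt(wts)[:, None]
+w_hat, *_ = np.linalg.lstsq(X[idx] * sw, y[idx] * sw.ravel(), rcond=None)   # crude surrogate fit
+print(np.round(np.corrcoef(X @ w_hat, X @ w_true)[0, 1], 3))
+```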
+
+ comment: Inadvertently submitting an incorrect writeup that does not align + with the intended content +
+
+
+
+
+ + ♻ Benchmarking Bayesian Causal Discovery Methods for Downstream Treatment + Effect Estimation ICML 2023 + + +
+ The practical utility of causality in decision-making is widespread and +brought about by the intertwining of causal discovery and causal inference. +Nevertheless, a notable gap exists in the evaluation of causal discovery +methods, where insufficient emphasis is placed on downstream inference. To +address this gap, we evaluate seven established baseline causal discovery +methods including a newly proposed method based on GFlowNets, on the downstream +task of treatment effect estimation. Through the implementation of a +distribution-level evaluation, we offer valuable and unique insights into the +efficacy of these causal discovery methods for treatment effect estimation, +considering both synthetic and real-world scenarios, as well as low-data +scenarios. The results of our study demonstrate that some of the algorithms +studied are able to effectively capture a wide range of useful and diverse ATE +modes, while some tend to learn many low-probability modes which impacts the +(unrelaxed) recall and precision. + +
+
+ comment: Peer-Reviewed and Accepted to ICML 2023 Workshop on Structured + Probabilistic Inference & Generative Modeling +
+
+
+
+
+ + ♻ ☆ Theory of Mind as Intrinsic Motivation for Multi-Agent Reinforcement + Learning ICML 2023 + + +
+ The ability to model the mental states of others is crucial to human social
+intelligence, and can offer similar benefits to artificial agents with respect
+to the social dynamics induced in multi-agent settings. We present a method of
+grounding semantically meaningful, human-interpretable beliefs within policies
+modeled by deep networks. We then consider the task of 2nd-order belief
+prediction. We propose that the ability of each agent to predict the beliefs of
+the other agents can be used as an intrinsic reward signal for multi-agent
+reinforcement learning. Finally, we present preliminary empirical results in a
+mixed cooperative-competitive environment. + +
+
+ comment: To appear at ICML 2023 Workshop on Theory of Mind +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks. In contrast to traditional text-only methods, our approach to +labelling a comment as hate speech centers around the holistic analysis of text +and images. This is done by leveraging graph transformers to capture the +contextual relationships in the entire discussion that surrounds a comment, +with interwoven fusion layers to combine text and image embeddings instead of +processing different modalities separately. We compare the performance of our +model to baselines that only process text; we also conduct extensive ablation +studies. We conclude with future work for multimodal solutions to deliver +social value in online contexts, arguing that capturing a holistic view of a +conversation greatly advances the effort to detect anti-social behavior. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ☆ AI-assisted Improved Service Provisioning for Low-latency XR over 5G NR + + +
+ Extended Reality (XR) is one of the most important 5G/6G media applications +that will fundamentally transform human interactions. However, ensuring low +latency, high data rate, and reliability to support XR services poses +significant challenges. This letter presents a novel AI-assisted service +provisioning scheme that leverages predicted frames for processing rather than +relying solely on actual frames. This method virtually increases the network +delay budget and consequently improves service provisioning, albeit at the +expense of minor prediction errors. The proposed scheme is validated by +extensive simulations demonstrating a multi-fold increase in supported XR users +and also provides crucial network design insights. + +
+
+
+
+
+ + ☆ CSSL-RHA: Contrastive Self-Supervised Learning for Robust Handwriting + Authentication ACM MM 2023 + + +
+ Handwriting authentication is a valuable tool used in various fields, such as +fraud prevention and cultural heritage protection. However, it remains a +challenging task due to the complex features, severe damage, and lack of +supervision. In this paper, we propose a novel Contrastive Self-Supervised +Learning framework for Robust Handwriting Authentication (CSSL-RHA) to address +these issues. It can dynamically learn complex yet important features and +accurately predict writer identities. Specifically, to remove the negative +effects of imperfections and redundancy, we design an information-theoretic +filter for pre-processing and propose a novel adaptive matching scheme to +represent images as patches of local regions dominated by more important +features. Through online optimization at inference time, the most informative +patch embeddings are identified as the "most important" elements. Furthermore, +we employ contrastive self-supervised training with a momentum-based paradigm +to learn more general statistical structures of handwritten data without +supervision. We conduct extensive experiments on five benchmark datasets and +our manually annotated dataset EN-HA, which demonstrate the superiority of our +CSSL-RHA compared to baselines. Additionally, we show that our proposed model +can still effectively achieve authentication even under abnormal circumstances, +such as data falsification and corruption. + +
+
+ comment: 10 pages, 4 figures, 3 tables, submitted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Robustness Analysis of Video-Language Models Against Visual and Language + Perturbations NeurIPS 2022 + + +
+ Joint visual and language modeling on large-scale datasets has recently shown +good progress in multi-modal tasks when compared to single modal learning. +However, robustness of these approaches against real-world perturbations has +not been studied. In this work, we perform the first extensive robustness study +of video-language models against various real-world perturbations. We focus on +text-to-video retrieval and propose two large-scale benchmark datasets, +MSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different +text perturbations. The study reveals some interesting initial findings from +the studied models: 1) models are generally more susceptible when only video is +perturbed as opposed to when only text is perturbed, 2) models that are +pre-trained are more robust than those trained from scratch, 3) models attend +more to scene and objects rather than motion and action. We hope this study +will serve as a benchmark and guide future research in robust video-language +learning. The benchmark introduced in this study along with the code and +datasets is available at https://bit.ly/3CNOly4. + +
+
+ comment: NeurIPS 2022 Datasets and Benchmarks Track. This projects webpage is + located at https://bit.ly/3CNOly4 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 37 + +
+
+
+ + ☆ AlpaGasus: Training A Better Alpaca with Fewer Data + + +
+ Large language models~(LLMs) obtain instruction-following capability through +instruction-finetuning (IFT) on supervised instruction/response data. However, +widely used IFT datasets (e.g., Alpaca's 52k data) surprisingly contain many +low-quality instances with incorrect or irrelevant responses, which are +misleading and detrimental to IFT. In this paper, we propose a simple and +effective data selection strategy that automatically identifies and removes +low-quality data using a strong LLM (e.g., ChatGPT). To this end, we introduce +AlpaGasus, which is finetuned on only 9k high-quality data filtered from the +52k Alpaca data. AlpaGasus significantly outperforms the original Alpaca as +evaluated by GPT-4 on multiple test sets and its 13B variant matches $>90\%$ +performance of its teacher LLM (i.e., Text-Davinci-003) on test tasks. It also +provides 5.7x faster training, reducing the training time for a 7B variant from +80 minutes (for Alpaca) to 14 minutes \footnote{We apply IFT for the same +number of epochs as Alpaca(7B) but on fewer data, using 4$\times$NVIDIA A100 +(80GB) GPUs and following the original Alpaca setting and hyperparameters.}. +Overall, AlpaGasus demonstrates a novel data-centric IFT paradigm that can be +generally applied to instruction-tuning data, leading to faster training and +better instruction-following models. Our project page is available at: +\url{https://lichang-chen.github.io/AlpaGasus/}. + +
+
+ comment: 22 pages; 22 figures +
+
+
+
+
+ + ☆ COLLIE: Systematic Construction of Constrained Text Generation Tasks + + +
+ Text generation under constraints have seen increasing interests in natural +language processing, especially with the rapidly improving capabilities of +large language models. However, existing benchmarks for constrained generation +usually focus on fixed constraint types (e.g.,generate a sentence containing +certain words) that have proved to be easy for state-of-the-art models like +GPT-4. We present COLLIE, a grammar-based framework that allows the +specification of rich, compositional constraints with diverse generation levels +(word, sentence, paragraph, passage) and modeling challenges (e.g.,language +understanding, logical reasoning, counting, semantic planning). We also develop +tools for automatic extraction of task instances given a constraint structure +and a raw text corpus. Using COLLIE, we compile the COLLIE-v1 dataset with 2080 +instances comprising 13 constraint structures. We perform systematic +experiments across five state-of-the-art instruction-tuned language models and +analyze their performances to reveal shortcomings. COLLIE is designed to be +extensible and lightweight, and we hope the community finds it useful to +develop more complex constraints and evaluations in the future. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Do Models Explain Themselves? Counterfactual Simulatability of Natural + Language Explanations + + +
+ Large language models (LLMs) are trained to imitate humans to explain human +decisions. However, do LLMs explain themselves? Can they help humans build +mental models of how LLMs process different inputs? To answer these questions, +we propose to evaluate $\textbf{counterfactual simulatability}$ of natural +language explanations: whether an explanation can enable humans to precisely +infer the model's outputs on diverse counterfactuals of the explained input. +For example, if a model answers "yes" to the input question "Can eagles fly?" +with the explanation "all birds can fly", then humans would infer from the +explanation that it would also answer "yes" to the counterfactual input "Can +penguins fly?". If the explanation is precise, then the model's answer should +match humans' expectations. + We implemented two metrics based on counterfactual simulatability: precision +and generality. We generated diverse counterfactuals automatically using LLMs. +We then used these metrics to evaluate state-of-the-art LLMs (e.g., GPT-4) on +two tasks: multi-hop factual reasoning and reward modeling. We found that LLM's +explanations have low precision and that precision does not correlate with +plausibility. Therefore, naively optimizing human approvals (e.g., RLHF) may +not be a sufficient solution. + +
+
+
+
+
+ + ☆ Multilingual Speech-to-Speech Translation into Multiple Target Languages + + +
+ Speech-to-speech translation (S2ST) enables spoken communication between
+people talking in different languages. Despite a few studies on multilingual
+S2ST, their focus is multilinguality on the source side, i.e., the
+translation from multiple source languages to one target language. We present
+the first work on multilingual S2ST supporting multiple target languages.
+Leveraging recent advances in direct S2ST with speech-to-unit and vocoder, we
+equip these key components with multilingual capability. Speech-to-masked-unit
+(S2MU) is the multilingual extension of S2U, which applies masking to units
+that do not belong to the given target language to reduce language
+interference. We also propose a multilingual vocoder trained with
+language embedding and the auxiliary loss of language identification. On
+benchmark translation testsets, our proposed multilingual model shows superior
+performance to bilingual models in translation from English into $16$
+target languages. + +
+
+
+
+
+ + ☆ Retentive Network: A Successor to Transformer for Large Language Models + + +
+ In this work, we propose Retentive Network (RetNet) as a foundation
+architecture for large language models, simultaneously achieving training
+parallelism, low-cost inference, and good performance. We theoretically derive
+the connection between recurrence and attention. Then we propose the retention
+mechanism for sequence modeling, which supports three computation paradigms,
+i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel
+representation allows for training parallelism. The recurrent representation
+enables low-cost $O(1)$ inference, which improves decoding throughput, latency,
+and GPU memory without sacrificing performance. The chunkwise recurrent
+representation facilitates efficient long-sequence modeling with linear
+complexity, where each chunk is encoded in parallel while the chunks are
+summarized recurrently. Experimental results on language modeling show that
+RetNet achieves favorable scaling results, parallel training, low-cost
+deployment, and efficient inference. These intriguing properties make RetNet a
+strong successor to Transformer for large language models. Code will be
+available at https://aka.ms/retnet. + +
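+ A tiny numpy sketch contrasting the parallel and recurrent forms of a single
+retention head as described above (decay matrix $D_{nm}=\gamma^{n-m}$ versus
+the state update $S_n=\gamma S_{n-1}+k_n^\top v_n$, $o_n=q_n S_n$); it omits
+xPos rotations, normalization, gating, and multi-head structure, so it is not
+the full RetNet layer.
+```python
+import numpy as np
+
+def retention_parallel(Q, K, V, gamma):
+    """Parallel form: (Q K^T ⊙ D) V with decay matrix D[n, m] = gamma^(n-m) for m <= n."""
+    n = Q.shape[0]
+    idx = np.arange(n)
+    D = np.where(idx[:, None] >= idx[None, :], gamma ** (idx[:, None] - idx[None, :]), 0.0)
+    return (Q @ K.T * D) @ V
+
+def retention_recurrent(Q, K, V, gamma):
+    """Recurrent form: S_n = gamma * S_{n-1} + k_n^T v_n,  o_n = q_n S_n  (O(1) per step)."""
+    d, dv = K.shape[1], V.shape[1]
+    S, out = np.zeros((d, dv)), []
+    for q, k, v in zip(Q, K, V):
+        S = gamma * S + np.outer(k, v)
+        out.append(q @ S)
+    return np.array(out)
+
+rng = np.random.default_rng(6)
+Q, K, V = (rng.normal(size=(7, 4)) for _ in range(3))
+print(np.allclose(retention_parallel(Q, K, V, 0.9), retention_recurrent(Q, K, V, 0.9)))  # True
+```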
+
+
+
+
+ + ☆ Multimodal Diffusion Segmentation Model for Object Segmentation from + Manipulation Instructions IROS2023 + + +
+ In this study, we aim to develop a model that comprehends a natural language
+instruction (e.g., "Go to the living room and get the nearest pillow to the
+radio art on the wall") and generates a segmentation mask for the target
+everyday object. The task is challenging because it requires (1) the
+understanding of the referring expressions for multiple objects in the
+instruction, (2) the prediction of the target phrase of the sentence among the
+multiple phrases, and (3) the generation of pixel-wise segmentation masks
+rather than bounding boxes. Studies have been conducted on language-based
+segmentation methods; however, they sometimes mask irrelevant regions for
+complex sentences. In this paper, we propose the Multimodal Diffusion
+Segmentation Model (MDSM), which generates a mask in the first stage and
+refines it in the second stage. We introduce a crossmodal parallel feature
+extraction mechanism and extend diffusion probabilistic models to handle
+crossmodal features. To validate our model, we built a new dataset based on the
+well-known Matterport3D and REVERIE datasets. This dataset consists of
+instructions with complex referring expressions accompanied by real indoor
+environmental images that feature various target objects, in addition to
+pixel-wise segmentation masks. The performance of MDSM surpassed that of the
+baseline method by a large margin of +10.13 mean IoU. + +
+
+ comment: Accepted for presentation at IROS2023 +
+
+
+
+
+ + ☆ Syntax-Aware Complex-Valued Neural Machine Translation + + +
+ Syntax has been proven to be remarkably effective in neural machine +translation (NMT). Previous models obtained syntax information from syntactic +parsing tools and integrated it into NMT models to improve translation +performance. In this work, we propose a method to incorporate syntax +information into a complex-valued Encoder-Decoder architecture. The proposed +model jointly learns word-level and syntax-level attention scores from the +source side to the target side using an attention mechanism. Importantly, it is +not dependent on specific network architectures and can be directly integrated +into any existing sequence-to-sequence (Seq2Seq) framework. The experimental +results demonstrate that the proposed method can bring significant improvements +in BLEU scores on two datasets. In particular, the proposed method achieves a +greater improvement in BLEU scores in translation tasks involving language +pairs with significant syntactic differences. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ The Resume Paradox: Greater Language Differences, Smaller Pay Gaps + + +
+ Over the past decade, the gender pay gap has remained steady with women +earning 84 cents for every dollar earned by men on average. Many studies +explain this gap through demand-side bias in the labor market represented +through employers' job postings. However, few studies analyze potential bias +from the worker supply-side. Here, we analyze the language in millions of US +workers' resumes to investigate how differences in workers' self-representation +by gender compare to differences in earnings. Across US occupations, language +differences between male and female resumes correspond to 11% of the variation +in gender pay gap. This suggests that females' resumes that are semantically +similar to males' resumes may have greater wage parity. However, surprisingly, +occupations with greater language differences between male and female resumes +have lower gender pay gaps. A doubling of the language difference between +female and male resumes results in an annual wage increase of $2,797 for the +average female worker. This result holds with controls for gender-biases of +resume text and we find that per-word bias poorly describes the variance in +wage gap. The results demonstrate that textual data and self-representation are +valuable factors for improving worker representations and understanding +employment inequities. + +
+
+ comment: 24 pages, 15 figures +
+
+
+
+
+ + ☆ Discovering collective narratives shifts in online discussions + + +
+ Narrative is a foundation of human cognition and decision making. Because
+narratives play a crucial role in societal discourse and the spread of
+misinformation, and because of the pervasive use of social media, the narrative
+dynamics on social media can have profound societal impact. Yet, systematic and
+computational understanding of online narratives faces the critical challenges
+of scale and dynamics: how can we reliably and automatically extract
+narratives from massive amounts of text? How do narratives emerge, spread, and
+die? Here, we propose a systematic narrative discovery framework that fills
+this gap by combining change point detection, semantic role labeling (SRL), and
+automatic aggregation of narrative fragments into narrative networks. We
+evaluate our model with synthetic and empirical data: two Twitter corpora about
+COVID-19 and the 2017 French election. Results demonstrate that our approach
+can recover major narrative shifts that correspond to major events. + +
+
+
+
+
+ + ☆ Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output + Robustness of Large Language Models + + +
+ Researchers have invested considerable effort into ensuring that large +language models (LLMs) align with human values, using various training +techniques, such as instruction tuning and Reinforcement Learning from Human or +AI Feedback (RLHF/RLAIF), to guard against text unsafety. However, these +defenses remain incredibly vulnerable to some jailbreak attacks, which can +cause the model to become overly defensive to sensitive topics or still +generate harmful content, leaving the model performance particularly fragile. +Therefore, to comprehensively study text safety and output robustness, we +propose a latent jailbreak prompt dataset, each involving malicious instruction +embedding. Specifically, we instruct the model to complete a regular task, such +as translation, where the text to be translated contains malicious +instructions. To further analyze the safety and robustness, we design a +hierarchical annotation framework. We present a systematic analysis of the +safety and robustness of LLMs concerning the position of explicit normal +instructions, word replacement (verbs in explicit normal instructions, target +groups in malicious instructions, cue words in malicious instructions), and +instruction replacement (different explicit normal instructions). Our results +show that current LLMs not only have a preference for certain instruction +verbs, but also exhibit different jailbreak rates for different instruction +verbs in explicit normal instructions. In other words, the probability of +generating unsafe content by the model will be reinforced to varying degrees +depending on the instruction verb in explicit normal instructions. Code and +data are available at https://github.com/qiuhuachuan/latent-jailbreak. + +
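+ A purely hypothetical sketch of how a latent-jailbreak prompt might be
+assembled (an explicit benign instruction wrapping a placeholder malicious
+payload, with the position varied); the dataset's actual templates are in the
+linked repository.
+```python
+# Hypothetical illustration of the "latent jailbreak" construction described above:
+# a benign explicit instruction (e.g., translation) whose payload text is itself
+# an instruction. The payload below is a pure placeholder string.
+NORMAL_INSTRUCTIONS = [
+    "Translate the following sentence into Chinese:",
+    "Paraphrase the following sentence:",
+]
+MALICIOUS_PAYLOADS = [
+    "[MALICIOUS INSTRUCTION TARGETING A PROTECTED GROUP]",   # placeholder only
+]
+
+def build_latent_jailbreak_prompts(position="prefix"):
+    """Combine an explicit normal instruction with an embedded malicious payload.
+    `position` controls whether the normal instruction precedes or follows the payload."""
+    prompts = []
+    for normal in NORMAL_INSTRUCTIONS:
+        for payload in MALICIOUS_PAYLOADS:
+            if position == "prefix":
+                prompts.append(f"{normal}\n{payload}")
+            else:
+                prompts.append(f"{payload}\n{normal}")
+    return prompts
+
+for p in build_latent_jailbreak_prompts():
+    print(repr(p))
+```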
+
+ comment: Code and data are available at + https://github.com/qiuhuachuan/latent-jailbreak +
+
+
+
+
+ + ☆ Improving End-to-End Speech Translation by Imitation-Based Knowledge + Distillation with Synthetic Transcripts + + +
+ End-to-end automatic speech translation (AST) relies on data that combines +audio inputs with text translation outputs. Previous work used existing large +parallel corpora of transcriptions and translations in a knowledge distillation +(KD) setup to distill a neural machine translation (NMT) into an AST student +model. While KD allows using larger pretrained models, the reliance of previous +KD approaches on manual audio transcripts in the data pipeline restricts the +applicability of this framework to AST. We present an imitation learning +approach where a teacher NMT system corrects the errors of an AST student +without relying on manual transcripts. We show that the NMT teacher can recover +from errors in automatic transcriptions and is able to correct erroneous +translations of the AST student, leading to improvements of about 4 BLEU points +over the standard AST end-to-end baseline on the English-German CoVoST-2 and +MuST-C datasets, respectively. Code and data are publicly +available.\footnote{\url{https://github.com/HubReb/imitkd_ast/releases/tag/v1.1}} + +
+
+ comment: IWSLT 2023, corrected version +
+
+
+
+
+ + ☆ Enhancing Supervised Learning with Contrastive Markings in Neural + Machine Translation Training + + +
+ Supervised learning in Neural Machine Translation (NMT) typically follows a +teacher forcing paradigm where reference tokens constitute the conditioning +context in the model's prediction, instead of its own previous predictions. In +order to alleviate this lack of exploration in the space of translations, we +present a simple extension of standard maximum likelihood estimation by a +contrastive marking objective. The additional training signals are extracted +automatically from reference translations by comparing the system hypothesis +against the reference, and used for up/down-weighting correct/incorrect tokens. +The proposed new training procedure requires one additional translation pass +over the training set per epoch, and does not alter the standard inference +setup. We show that training with contrastive markings yields improvements on +top of supervised learning, and is especially useful when learning from +postedits where contrastive markings indicate human error corrections to the +original hypotheses. Code is publicly released. + +
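+ A rough sketch of the mechanism described above: mark hypothesis tokens
+against the reference and use the markings to up/down-weight a per-token loss.
+The marking rule and weights here are placeholders, not the paper's objective.
+```python
+import numpy as np
+
+def contrastive_token_weights(hypothesis, reference, up=1.5, down=0.5):
+    """Mark hypothesis tokens as correct/incorrect against the reference and return
+    per-token loss weights (placeholder scheme: simple set membership, not an alignment)."""
+    ref_tokens = set(reference.split())
+    return np.array([up if tok in ref_tokens else down for tok in hypothesis.split()])
+
+def weighted_token_nll(token_nll, weights):
+    """Standard per-token negative log-likelihood reweighted by contrastive markings."""
+    return float(np.sum(weights * token_nll) / np.sum(weights))
+
+hyp = "the cat sat on a mat"
+ref = "the cat sat on the mat"
+w = contrastive_token_weights(hyp, ref)
+print(w)                                               # "a" is down-weighted, the rest up-weighted
+print(weighted_token_nll(np.linspace(1.0, 3.0, len(w)), w))
+```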
+
+ comment: Proceedings of the 24th Annual Conference of the European Association + for Machine Translation, p. 69-78 Tampere, Finland, June 2023 +
+
+
+
+
+ + ☆ On the application of Large Language Models for language teaching and + assessment technology + + +
+ The recent release of very large language models such as PaLM and GPT-4 has +made an unprecedented impact in the popular media and public consciousness, +giving rise to a mixture of excitement and fear as to their capabilities and +potential uses, and shining a light on natural language processing research +which had not previously received so much attention. The developments offer +great promise for education technology, and in this paper we look specifically +at the potential for incorporating large language models in AI-driven language +teaching and assessment systems. We consider several research areas and also +discuss the risks and ethical considerations surrounding generative AI in +education technology for language learners. Overall we find that larger +language models offer improvements over previous models in text generation, +opening up routes toward content generation which had not previously been +plausible. For text generation they must be prompted carefully and their +outputs may need to be reshaped before they are ready for use. For automated +grading and grammatical error correction, tasks whose progress is checked on +well-known benchmarks, early investigations indicate that large language models +on their own do not improve on state-of-the-art results according to standard +evaluation metrics. For grading it appears that linguistic features established +in the literature should still be used for best performance, and for error +correction it may be that the models can offer alternative feedback styles +which are not measured sensitively with existing methods. In all cases, there +is work to be done to experiment with the inclusion of large language models in +education technology for language learners, in order to properly understand and +report on their capacities and limitations, and to ensure that foreseeable +risks such as misinformation and harmful bias are mitigated. + +
+
+ comment: Accepted at the AIED2023 workshop: Empowering Education with LLMs - + the Next-Gen Interface and Content Generation +
+
+
+
+
+ + ☆ Gender mobility in the labor market with skills-based matching models AAAI + + +
+ Skills-based matching promises mobility of workers between different sectors
+and occupations in the labor market. In this case, job seekers can look for
+jobs they do not yet have experience in, but for which they do have relevant
+skills. Currently, there are multiple occupations with a skewed gender
+distribution. For skills-based matching, it is unclear if and how a shift in
+the gender distribution, which we call gender mobility, between occupations
+will be effected. It is expected that the skills-based matching approach will
+likely be data-driven, including computational language models and supervised
+learning methods.
+ This work, first, shows the presence of gender segregation in language
+model-based skills representation of occupations. Second, we assess the use of
+these representations in a potential application based on simulated data, and
+show that the gender segregation is propagated by various data-driven
+skills-based matching models. These models are based on different language
+representations (bag of words, word2vec, and BERT), and distance metrics
+(static and machine learning-based). Accordingly, we show how skills-based
+matching approaches can be evaluated and compared on matching performance as
+well as on the risk of gender segregation. Making the gender segregation bias
+of models more explicit can help in generating healthy trust in the use of
+these models in practice. + +
+
+ comment: This paper was presented during the AAAI Spring Symposium 2023 (AI + Trustworthiness Assessment (AITA) track) +
+
+
+
+
+ + ☆ Legal Syllogism Prompting: Teaching Large Language Models for Legal + Judgment Prediction + + +
+ Legal syllogism is a form of deductive reasoning commonly used by legal
+professionals to analyze cases. In this paper, we propose legal syllogism
+prompting (LoT), a simple prompting method to teach large language models
+(LLMs) for legal judgment prediction. LoT teaches only that in the legal
+syllogism the major premise is law, the minor premise is the fact, and the
+conclusion is judgment. Then the models can produce syllogistic reasoning about
+the case and give the judgment without any learning, fine-tuning, or examples.
+On CAIL2018, a Chinese criminal case dataset, we performed zero-shot judgment
+prediction experiments with GPT-3 models. Our results show that LLMs with LoT
+achieve better performance than the baseline and chain-of-thought prompting,
+the state-of-the-art prompting method on diverse reasoning tasks. LoT enables
+the model to concentrate on the key information relevant to the judgment and to
+correctly understand the legal meaning of acts, as compared to other methods.
+Our method enables LLMs to predict judgment along with law articles and
+justification, which significantly enhances the explainability of models. + +
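+ A minimal illustration of a legal-syllogism-style prompt as described (major
+premise = law, minor premise = facts, conclusion = judgment); the wording is
+hypothetical and not taken from the paper.
+```python
+def legal_syllogism_prompt(law: str, facts: str) -> str:
+    """Zero-shot legal-syllogism-style prompt: major premise = law, minor premise = facts,
+    conclusion = judgment. The wording here is a hypothetical illustration."""
+    return (
+        "Analyse the case as a legal syllogism.\n"
+        f"Major premise (law): {law}\n"
+        f"Minor premise (facts): {facts}\n"
+        "Conclusion (judgment):"
+    )
+
+print(legal_syllogism_prompt(
+    law="Whoever steals the property of another shall be sentenced to ...",
+    facts="The defendant took a bicycle belonging to the victim without permission.",
+))
+```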
+
+ comment: Nineteenth International Conference on Artificial Intelligence and + Law (ICAIL 2023) +
+
+
+
+
+ + ☆ IterLara: A Turing Complete Algebra for Big Data, AI, Scientific + Computing, and Database + + +
+ \textsc{Lara} is a key-value algebra that aims at unifying linear and
+relational algebra with three types of operation abstraction. The study of
+\textsc{Lara}'s expressive ability reports that it can represent relational
+algebra and most linear algebra operations. However, several essential
+computations, such as matrix inversion and determinant, cannot be expressed in
+\textsc{Lara}. \textsc{Lara} cannot represent global and iterative computation,
+either. This article proposes \textsc{IterLara}, extending \textsc{Lara} with
+iterative operators, to provide an algebraic model that unifies operations in
+general-purpose computing, like big data, AI, scientific computing, and
+databases. We study the expressive ability of \textsc{Lara} and
+\textsc{IterLara} and prove that \textsc{IterLara} with aggregation functions
+can represent matrix inversion and determinant. Besides, we demonstrate that
+\textsc{IterLara} with no limitation of function utility is Turing complete. We
+also propose the Operation Count (OP) as a metric of computation amount for
+\textsc{IterLara} and ensure that the OP metric is in accordance with the
+existing computation metrics. + +
+
+
+
+
+ + ☆ CoAD: Automatic Diagnosis through Symptom and Disease Collaborative + Generation ACL 2023 + + +
+ Automatic diagnosis (AD), a critical application of AI in healthcare, employs +machine learning techniques to assist doctors in gathering patient symptom +information for precise disease diagnosis. The Transformer-based method +utilizes an input symptom sequence, predicts itself through auto-regression, +and employs the hidden state of the final symptom to determine the disease. +Despite its simplicity and superior performance demonstrated, a decline in +disease diagnosis accuracy is observed caused by 1) a mismatch between symptoms +observed during training and generation, and 2) the effect of different symptom +orders on disease prediction. To address the above obstacles, we introduce the +CoAD, a novel disease and symptom collaborative generation framework, which +incorporates several key innovations to improve AD: 1) aligning sentence-level +disease labels with multiple possible symptom inquiry steps to bridge the gap +between training and generation; 2) expanding symptom labels for each +sub-sequence of symptoms to enhance annotation and eliminate the effect of +symptom order; 3) developing a repeated symptom input schema to effectively and +efficiently learn the expanded disease and symptom labels. We evaluate the CoAD +framework using four datasets, including three public and one private, and +demonstrate that it achieves an average 2.3% improvement over previous +state-of-the-art results in automatic disease diagnosis. For reproducibility, +we release the code and data at https://github.com/KwanWaiChung/coad. + +
+
+ comment: Published as a conference paper at ACL 2023 (long). Code available at + https://github.com/KwanWaiChung/coad +
+
+
+
+
+ + ☆ ChatGPT is Good but Bing Chat is Better for Vietnamese Students + + +
+ This paper investigates the performance of two large language models (LLMs), +ChatGPT and Microsoft Bing Chat (BingChat), for Vietnamese students. While +ChatGPT demonstrates competency in various subjects, Bing Chat emerges as the +superior choice. We compare their performances across multiple subjects at high +school level, including mathematics, literature, English, physics, chemistry, +biology, history, geography, and civic education. Our findings indicate that +BingChat surpasses ChatGPT in most subjects, except for literature where +ChatGPT outperforms. Moreover, BingChat leverages the more advanced GPT-4 +technology compared to ChatGPT based on GPT-3.5, leading to enhanced +understanding and generation of creative and informative text. Furthermore, +BingChat's availability in Vietnam and its incorporation of hyperlinks in +answers further solidify its superiority. We conclude that while ChatGPT is +commendable, Bing Chat offers a more comprehensive and advanced solution for +Vietnamese students. + +
+
+ comment: 12 pages; 6 figures. arxiv admin note: text overlap with + arXiv:2305.12199 +
+
+
+
+
+ + ☆ Extending the Frontier of ChatGPT: Code Generation and Debugging + + +
+ Large-scale language models (LLMs) have emerged as a groundbreaking +innovation in the realm of question-answering and conversational agents. These +models, leveraging different deep learning architectures such as Transformers, +are trained on vast corpora to predict sentences based on given queries. Among +these LLMs, ChatGPT, developed by OpenAI, has ushered in a new era by utilizing +artificial intelligence (AI) to tackle diverse problem domains, ranging from +composing essays and biographies to solving intricate mathematical integrals. +The versatile applications enabled by ChatGPT offer immense value to users. +However, assessing the performance of ChatGPT's output poses a challenge, +particularly in scenarios where queries lack clear objective criteria for +correctness. For instance, evaluating the quality of generated essays becomes +arduous and relies heavily on manual labor, in stark contrast to evaluating +solutions to well-defined, closed-ended questions such as mathematical +problems. This research paper delves into the efficacy of ChatGPT in solving +programming problems, examining both the correctness and the efficiency of its +solution in terms of time and memory complexity. The research reveals a +commendable overall success rate of 71.875\%, denoting the proportion of +problems for which ChatGPT was able to provide correct solutions that +successfully satisfied all the test cases present in Leetcode. It exhibits +strengths in structured problems and shows a linear correlation between its +success rate and problem acceptance rates. However, it struggles to improve +solutions based on feedback, pointing to potential shortcomings in debugging +tasks. These findings provide a compact yet insightful glimpse into ChatGPT's +capabilities and areas for improvement. + +
+
+
+
+
+ + ☆ PAT: Parallel Attention Transformer for Visual Question Answering in + Vietnamese + + +
+ We present in this paper a novel scheme for multimodal learning named the +Parallel Attention mechanism. In addition, to take into account the advantages +of grammar and context in Vietnamese, we propose the Hierarchical Linguistic +Features Extractor instead of using an LSTM network to extract linguistic +features. Based on these two novel modules, we introduce the Parallel Attention +Transformer (PAT), achieving the best accuracy compared to all baselines on the +benchmark ViVQA dataset and other SOTA methods including SAAA and MCAN. + +
+
+
+
+
+ + ☆ BASS: Block-wise Adaptation for Speech Summarization + + +
+ End-to-end speech summarization has been shown to improve performance over +cascade baselines. However, such models are difficult to train on very large +inputs (dozens of minutes or hours) owing to compute restrictions and are hence +trained with truncated model inputs. Truncation leads to poorer models, and a +solution to this problem rests in block-wise modeling, i.e., processing a +portion of the input frames at a time. In this paper, we develop a method that +allows one to train summarization models on very long sequences in an +incremental manner. Speech summarization is realized as a streaming process, +where hypothesis summaries are updated every block based on new acoustic +information. We devise and test strategies to pass semantic context across the +blocks. Experiments on the How2 dataset demonstrate that the proposed +block-wise training method improves by 3 points absolute on ROUGE-L over a +truncated input baseline. + +
+
+ comment: Accepted at Interspeech 2023 +
+
+
+
+
+ + ☆ Mini-Giants: "Small" Language Models and Open Source Win-Win + + +
+ ChatGPT is phenomenal. However, it is prohibitively expensive to train and
+refine such giant models. Fortunately, small language models are flourishing
+and becoming more and more competent. We call them "mini-giants". We argue that
+open-source communities like Kaggle and mini-giants can create a win-win in
+many ways: technically, ethically, and socially. In this article, we present a
+brief yet rich background, discuss how to attain small language models, present
+a comparative study of small language models and a brief discussion of
+evaluation methods, discuss the application scenarios where small language
+models are most needed in the real world, and conclude with discussion and
+outlook. + +
+
+ comment: 16 pages, 1 figure +
+
+
+
+
+ + ☆ Curriculum Learning for Graph Neural Networks: A Multiview + Competence-based Approach ACL 2023 + + +
+ A curriculum is a planned sequence of learning materials and an effective one +can make learning efficient and effective for both humans and machines. Recent +studies developed effective data-driven curriculum learning approaches for +training graph neural networks in language applications. However, existing +curriculum learning approaches often employ a single criterion of difficulty in +their training paradigms. In this paper, we propose a new perspective on +curriculum learning by introducing a novel approach that builds on graph +complexity formalisms (as difficulty criteria) and model competence during +training. The model consists of a scheduling scheme which derives effective +curricula by accounting for different views of sample difficulty and model +competence during training. The proposed solution advances existing research in +curriculum learning for graph neural networks with the ability to incorporate a +fine-grained spectrum of graph difficulty criteria in their training paradigms. +Experimental results on real-world link prediction and node classification +tasks illustrate the effectiveness of the proposed approach. + +
+
+ comment: ACL 2023 +
+
+
+
+
+ + ☆ Comparative Performance Evaluation of Large Language Models for + Extracting Molecular Interactions and Pathway Knowledge + + +
+ Understanding protein interactions and pathway knowledge is crucial for +unraveling the complexities of living systems and investigating the underlying +mechanisms of biological functions and complex diseases. While existing +databases provide curated biological data from literature and other sources, +they are often incomplete and their maintenance is labor-intensive, +necessitating alternative approaches. In this study, we propose to harness the +capabilities of large language models to address these issues by automatically +extracting such knowledge from the relevant scientific literature. Toward this +goal, in this work, we investigate the effectiveness of different large +language models in tasks that involve recognizing protein interactions, +pathways, and gene regulatory relations. We thoroughly evaluate the performance +of various models, highlight the significant findings, and discuss both the +future opportunities and the remaining challenges associated with this +approach. The code and data are available at: +https://github.com/boxorange/BioIE-LLM + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ A mixed policy to improve performance of language models on math + problems + + +
+ When solving math problems, most language models use a sampling strategy to
+predict the next word according to conditional probabilities. In the math
+reasoning step, this may generate a wrong answer. Considering that math
+problems are deterministic, we propose a mixed-policy exploration approach to
+solve math problems with reinforcement learning. In particular, we propose a
+two-level token exploration policy: the abstract level explores the next token
+probabilistically, while the second level is deterministic. Specifically, the
+abstract-level policy decides whether the token is an operator or an operand
+via probability sampling, while the second level deterministically selects the
+next token with the highest score in a greedy way. We test our method on the
+GSM8K dataset with a GPT-2 model and demonstrate a performance gain of more
+than $2\%$. Our implementation is available at
+https://github.com/vividitytech/math_lm_rl. + +
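+ A toy sketch of the two-level policy described above: sample the token type
+(operator vs. operand) probabilistically, then pick the highest-scoring token
+of that type greedily; the vocabulary and scores are made-up stand-ins for
+model logits.
+```python
+import numpy as np
+
+OPERATORS = ["+", "-", "*", "/", "="]
+OPERANDS = [str(i) for i in range(10)]
+
+def mixed_policy_step(token_scores, p_operator, rng):
+    """Two-level decoding: sample the token *type* (operator vs operand) with probability
+    p_operator, then choose the highest-scoring token of that type greedily."""
+    pool = OPERATORS if rng.random() < p_operator else OPERANDS
+    return max(pool, key=lambda tok: token_scores.get(tok, -np.inf))
+
+rng = np.random.default_rng(7)
+scores = {tok: rng.normal() for tok in OPERATORS + OPERANDS}   # stand-in for model logits
+print("".join(mixed_policy_step(scores, p_operator=0.4, rng=rng) for _ in range(8)))
+```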
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ How do software citation formats evolve over time? A longitudinal + analysis of R programming language packages + + +
+ Under the data-driven research paradigm, research software has come to play +crucial roles in nearly every stage of scientific inquiry. Scholars are +advocating for the formal citation of software in academic publications, +treating it on par with traditional research outputs. However, software is +hardly consistently cited: one software entity can be cited as different +objects, and the citations can change over time. These issues, however, are +largely overlooked in existing empirical research on software citation. To fill +the above gaps, the present study compares and analyzes a longitudinal dataset +of citation formats of all R packages collected in 2021 and 2022, in order to +understand the citation formats of R-language packages, important members in +the open-source software family, and how the citations evolve over time. In +particular, we investigate the different document types underlying the +citations and what metadata elements in the citation formats changed over time. +Furthermore, we offer an in-depth analysis of the disciplinarity of journal +articles cited as software (software papers). By undertaking this research, we +aim to contribute to a better understanding of the complexities associated with +software citation, shedding light on future software citation policies and +infrastructure. + +
+
+
+
+
+ + ☆ ivrit.ai: A Comprehensive Dataset of Hebrew Speech for AI Research and + Development + + +
+ We introduce "ivrit.ai", a comprehensive Hebrew speech dataset, addressing +the distinct lack of extensive, high-quality resources for advancing Automated +Speech Recognition (ASR) technology in Hebrew. With over 3,300 speech hours and +a over a thousand diverse speakers, ivrit.ai offers a substantial compilation +of Hebrew speech across various contexts. It is delivered in three forms to +cater to varying research needs: raw unprocessed audio; data post-Voice +Activity Detection, and partially transcribed data. The dataset stands out for +its legal accessibility, permitting use at no cost, thereby serving as a +crucial resource for researchers, developers, and commercial entities. ivrit.ai +opens up numerous applications, offering vast potential to enhance AI +capabilities in Hebrew. Future efforts aim to expand ivrit.ai further, thereby +advancing Hebrew's standing in AI research and technology. + +
+
+ comment: 9 pages, 1 table and 3 figures +
+
+
+
+
+ + ♻ ☆ Underspecification in Language Modeling Tasks: A Causality-Informed + Study of Gendered Pronoun Resolution + + +
+ Modern language modeling tasks are often underspecified: for a given token +prediction, many words may satisfy the user's intent of producing natural +language at inference time, however only one word would minimize the task's +loss function at training time. We provide a simple yet plausible causal +mechanism describing the role underspecification plays in the generation of +spurious correlations. Despite its simplicity, our causal model directly +informs the development of two lightweight black-box evaluation methods, that +we apply to gendered pronoun resolution tasks on a wide range of LLMs to 1) aid +in the detection of inference-time task underspecification by exploiting 2) +previously unreported gender vs. time and gender vs. location spurious +correlations on LLMs with a range of A) sizes: from BERT-base to GPT 3.5, B) +pre-training objectives: from masked & autoregressive language modeling to a +mixture of these objectives, and C) training stages: from pre-training only to +reinforcement learning from human feedback (RLHF). Code and open-source demos +available at https: //github.com/2dot71mily/sib_paper. + +
+
+ comment: 25 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Automated scholarly paper review: Concepts, technologies, and challenges + + +
+ Peer review is a widely accepted mechanism for research evaluation, playing a +pivotal role in academic publishing. However, criticisms have long been leveled +at this mechanism, mostly because of its poor efficiency and low +reproducibility. Recent years have seen the application of artificial +intelligence (AI) in assisting the peer review process. Nonetheless, with the +involvement of humans, such limitations remain inevitable. In this paper, we +propose the concept and pipeline of automated scholarly paper review (ASPR) and +review the relevant literature and technologies of achieving a full-scale +computerized review process. On the basis of the review and discussion, we +conclude that there is already corresponding research and preliminary +implementation at each stage of ASPR. We further look into the challenges in +ASPR with the existing technologies. The major difficulties lie in inadequate +data, imperfect document parsing and representation, defective +human$\unicode{x2013}$computer interaction, and flawed deep logical reasoning. +Moreover, we point out the future directions and discuss the possible moral and +ethical issues of ASPR. In the foreseeable future, ASPR and peer review will +coexist in a reinforcing manner before ASPR is able to fully undertake the +reviewing workload from humans. + +
+
+ comment: Please cite the version of Information Fusion +
+
+
+
+
+ + ♻ ☆ Parmesan: mathematical concept extraction for education + + +
+ Mathematics is a highly specialized domain with its own unique set of +challenges that has seen limited study in natural language processing. However, +mathematics is used in a wide variety of fields and multidisciplinary research +in many different domains often relies on an understanding of mathematical +concepts. To aid researchers coming from other fields, we develop a prototype +system for searching for and defining mathematical concepts in context, +focusing on the field of category theory. This system, Parmesan, depends on +natural language processing components including concept extraction, relation +extraction, definition extraction, and entity linking. In developing this +system, we show that existing techniques cannot be applied directly to the +category theory domain, and suggest hybrid techniques that do perform well, +though we expect the system to evolve over time. We also provide two cleaned +mathematical corpora that power the prototype system, which are based on +journal articles and wiki pages, respectively. The corpora have been annotated +with dependency trees, lemmas, and part-of-speech tags. + +
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Cross-Modal Retrieval for Motion and Text via MildTriple Loss + + +
+ Cross-modal retrieval has become a prominent research topic in computer
+vision and natural language processing with advances made in image-text and
+video-text retrieval technologies. However, cross-modal retrieval between human
+motion sequences and text has not garnered sufficient attention despite the
+extensive application value it holds, such as aiding virtual reality
+applications in better understanding users' actions and language. This task
+presents several challenges, including joint modeling of the two modalities,
+demanding the understanding of person-centered information from text, and
+learning behavior features from 3D human motion sequences. Previous work on
+motion data modeling mainly relied on autoregressive feature extractors that
+may forget previous information, while we propose an innovative model that
+includes simple yet powerful transformer-based motion and text encoders, which
+can learn representations from the two different modalities and capture
+long-term dependencies. Furthermore, the overlap of the same atomic actions of
+different human motions can cause semantic conflicts, leading us to explore a
+new triplet loss function, MildTriple Loss. It leverages the similarity between
+samples in intra-modal space to guide soft-hard negative sample mining in the
+joint embedding space to train the triplet loss and reduce violations caused by
+false negative samples. We evaluated our model and method on the latest
+HumanML3D and KIT Motion-Language datasets, achieving a 62.9\% recall for
+motion retrieval and a 71.5\% recall for text retrieval (based on R@10) on the
+HumanML3D dataset. Our code is available at
+https://github.com/eanson023/rehamot.
+
+
+
+ comment: This paper was rejected by the journal it was submitted to and needs
+ to be revised before resubmission
+
+
+
+
+
+ + ♻ ☆ A Neural Span-Based Continual Named Entity Recognition Model AAAI'23 + + +
+ Named Entity Recognition (NER) models capable of Continual Learning (CL) are
+realistically valuable in areas where entity types continuously increase (e.g.,
+personal assistants). Meanwhile, the learning paradigm of NER has advanced to
+new patterns such as span-based methods, but their potential for CL has not
+been fully explored. In this paper, we propose SpanKL, a simple yet effective
+Span-based model with Knowledge distillation (KD) to preserve memories and
+multi-Label prediction to prevent conflicts in CL-NER. Unlike prior sequence
+labeling approaches, the inherently independent modeling at the span and entity
+level, together with the designed coherent optimization, promotes SpanKL's
+learning at each incremental step and mitigates forgetting. Experiments on
+synthetic CL datasets derived from OntoNotes and Few-NERD show that SpanKL
+significantly outperforms the previous SoTA in many aspects and obtains the
+smallest gap from CL to the upper bound, revealing its high practical value.
+The code is available at https://github.com/Qznan/SpanKL.
+
+
+
+ comment: Accepted by AAAI'23 (Update to official format) +
+
+
+
+
+ + ♻ ☆ Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive + Machine Translation ICLR 2023 + + +
+ Non-autoregressive translation (NAT) reduces the decoding latency but suffers +from performance degradation due to the multi-modality problem. Recently, the +structure of directed acyclic graph has achieved great success in NAT, which +tackles the multi-modality problem by introducing dependency between vertices. +However, training it with negative log-likelihood loss implicitly requires a +strict alignment between reference tokens and vertices, weakening its ability +to handle multiple translation modalities. In this paper, we hold the view that +all paths in the graph are fuzzily aligned with the reference sentence. We do +not require the exact alignment but train the model to maximize a fuzzy +alignment score between the graph and reference, which takes captured +translations in all modalities into account. Extensive experiments on major WMT +benchmarks show that our method substantially improves translation performance +and increases prediction confidence, setting a new state of the art for NAT on +the raw training data. + +
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ RoPDA: Robust Prompt-based Data Augmentation for Low-Resource Named + Entity Recognition + + +
+ Data augmentation has been widely used in low-resource NER tasks to tackle +the problem of data sparsity. However, previous data augmentation methods have +the disadvantages of disrupted syntactic structures, token-label mismatch, and +requirement for external knowledge or manual effort. To address these issues, +we propose Robust Prompt-based Data Augmentation (RoPDA) for low-resource NER. +Based on pre-trained language models (PLMs) with continuous prompt, RoPDA +performs entity augmentation and context augmentation through five fundamental +augmentation operations to generate label-flipping and label-preserving +examples. To optimize the utilization of the augmented samples, we present two +techniques: Self-Consistency Filtering and mixup. The former effectively +eliminates low-quality samples, while the latter prevents performance +degradation arising from the direct utilization of label-flipping samples. +Extensive experiments on three benchmarks from different domains demonstrate +that RoPDA significantly improves upon strong baselines, and also outperforms +state-of-the-art semi-supervised learning methods when unlabeled data is +included. + +
+
+
+
+
+ + ♻ ☆ Pre-trained Language Models in Biomedical Domain: A Systematic Survey + + +
+ Pre-trained language models (PLMs) have been the de facto paradigm for most
+natural language processing (NLP) tasks. This also benefits the biomedical
+domain: researchers from informatics, medicine, and computer science (CS)
+communities propose various PLMs trained on biomedical datasets, e.g.,
+biomedical text, electronic health records, protein, and DNA sequences, for
+various biomedical tasks. However, the cross-discipline characteristics of
+biomedical PLMs hinder their spreading among communities; some existing works
+are isolated from each other without comprehensive comparison and discussion.
+This calls for a survey that not only systematically reviews recent advances in
+biomedical PLMs and their applications but also standardizes terminology and
+benchmarks. In this paper, we summarize the recent progress of pre-trained
+language models in the biomedical domain and their applications in biomedical
+downstream tasks. In particular, we discuss the motivations and propose a
+taxonomy of existing biomedical PLMs. Their applications in biomedical
+downstream tasks are exhaustively discussed. Finally, we illustrate various
+limitations and future trends, which we hope can provide inspiration for future
+research by the community.
+
+
+
+ comment: Accepted in ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs + + +
+ Large language models (LLMs) have achieved widespread success on a variety of
+in-context few-shot tasks, but this success is typically evaluated via
+correctness rather than consistency. We argue that self-consistency is an
+important criterion for valid multi-step reasoning in tasks where the solution
+is composed of the answers to multiple sub-steps. We propose two types of
+self-consistency that are particularly important for multi-step reasoning --
+hypothetical consistency (a model's ability to predict what its output would be
+in a hypothetical other context) and compositional consistency (consistency of
+a model's final outputs when intermediate sub-steps are replaced with the
+model's outputs for those steps). We demonstrate that multiple variants of the
+GPT-3/-4 models exhibit poor consistency rates across both types of consistency
+on a variety of tasks.
+
+
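+ A toy sketch in the spirit of the compositional-consistency definition above
+(not the paper's evaluation code): solve a sub-step with the model, substitute
+the model's own answer back into the composite question, and check whether the
+final answer changes. The text-in/text-out `llm` callable is an assumed
+interface, not a specific API.
+
+    def compositional_consistency(llm, sub_question, composite_template):
+        # composite_template contains a "{sub}" slot for the sub-step text.
+        sub_answer = llm(sub_question)
+        direct = llm(composite_template.format(sub=sub_question))
+        substituted = llm(composite_template.format(sub=sub_answer))
+        return direct.strip() == substituted.strip()
+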
+
+ comment: Added GPT-4 results +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 117 + +
+
+
+ + ☆ Diffusion Models Beat GANs on Image Classification + + +
+ While many unsupervised learning models focus on one family of tasks, either +generative or discriminative, we explore the possibility of a unified +representation learner: a model which uses a single pre-training stage to +address both families of tasks simultaneously. We identify diffusion models as +a prime candidate. Diffusion models have risen to prominence as a +state-of-the-art method for image generation, denoising, inpainting, +super-resolution, manipulation, etc. Such models involve training a U-Net to +iteratively predict and remove noise, and the resulting model can synthesize +high fidelity, diverse, novel images. The U-Net architecture, as a +convolution-based architecture, generates a diverse set of feature +representations in the form of intermediate feature maps. We present our +findings that these embeddings are useful beyond the noise prediction task, as +they contain discriminative information and can also be leveraged for +classification. We explore optimal methods for extracting and using these +embeddings for classification tasks, demonstrating promising results on the +ImageNet classification task. We find that with careful feature selection and +pooling, diffusion models outperform comparable generative-discriminative +methods such as BigBiGAN for classification tasks. We investigate diffusion +models in the transfer learning regime, examining their performance on several +fine-grained visual classification datasets. We compare these embeddings to +those generated by competing architectures and pre-trainings for classification +tasks. + +
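+ A rough sketch of the recipe the abstract describes (noise an image to a fixed
+timestep, run the diffusion U-Net once, pool an intermediate feature map, and
+train a linear probe on top). The `mid_block` attribute and the scheduler
+interface follow the style of the diffusers library but are assumptions here,
+not the authors' implementation.
+
+    import torch
+    import torch.nn as nn
+
+    features = {}
+
+    def _grab(module, inputs, output):
+        features["mid"] = output
+
+    def extract_embedding(unet, scheduler, images, t=100):
+        handle = unet.mid_block.register_forward_hook(_grab)  # assumed block name
+        noise = torch.randn_like(images)
+        timesteps = torch.full((images.shape[0],), t, dtype=torch.long)
+        noisy = scheduler.add_noise(images, noise, timesteps)
+        with torch.no_grad():
+            unet(noisy, timesteps)
+        handle.remove()
+        fmap = features["mid"]          # (B, C, H, W) intermediate feature map
+        return fmap.mean(dim=(2, 3))    # global average pooling -> (B, C)
+
+    probe = nn.Linear(512, 1000)        # feature/class dimensions are placeholders
+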
+
+ comment: 15 pages, 7 figures, 10 tables, submission under review +
+
+
+
+
+ + ☆ Fast model inference and training on-board of Satellites + + +
+ Artificial intelligence onboard satellites has the potential to reduce data +transmission requirements, enable real-time decision-making and collaboration +within constellations. This study deploys a lightweight foundational model +called RaVAEn on D-Orbit's ION SCV004 satellite. RaVAEn is a variational +auto-encoder (VAE) that generates compressed latent vectors from small image +tiles, enabling several downstream tasks. In this work we demonstrate the +reliable use of RaVAEn onboard a satellite, achieving an encoding time of +0.110s for tiles of a 4.8x4.8 km$^2$ area. In addition, we showcase fast +few-shot training onboard a satellite using the latent representation of data. +We compare the deployment of the model on the on-board CPU and on the available +Myriad vision processing unit (VPU) accelerator. To our knowledge, this work +shows for the first time the deployment of a multi-task model on-board a +CubeSat and the on-board training of a machine learning model. + +
+
+ comment: 4 pages, 4 figures, International Geoscience and Remote Sensing + Symposium (IGARSS) 2023 +
+
+
+
+
+ + ☆ Pair then Relation: Pair-Net for Panoptic Scene Graph Generation + + +
+ Panoptic Scene Graph (PSG) is a challenging task in Scene Graph Generation
+(SGG) that aims to create a more comprehensive scene graph representation using
+panoptic segmentation instead of boxes. However, current PSG methods have
+limited performance, which can hinder downstream task development. To improve
+PSG methods, we conducted an in-depth analysis to identify the bottleneck of
+the current PSG models, finding that inter-object pair-wise recall is a crucial
+factor that was ignored by previous PSG methods. Based on this, we present a
+novel framework: Pair then Relation (Pair-Net), which uses a Pair Proposal
+Network (PPN) to learn and filter sparse pair-wise relationships between
+subjects and objects. We also observed the sparse nature of object pairs and
+used this insight to design a lightweight Matrix Learner within the PPN.
+Through extensive ablations and analysis, our approach significantly improves
+upon the strong segmenter baseline. Notably, our approach achieves new
+state-of-the-art results on the PSG benchmark, with over 10% absolute gains
+compared to PSGFormer. The code of this paper is publicly available at
+https://github.com/king159/Pair-Net.
+
+
+
+ comment: Project Page: https://github.com/king159/Pair-Net +
+
+
+
+
+ + ☆ Flow Matching in Latent Space + + +
+ Flow matching is a recent framework to train generative models that exhibits +impressive empirical performance while being relatively easier to train +compared with diffusion-based models. Despite its advantageous properties, +prior methods still face the challenges of expensive computing and a large +number of function evaluations of off-the-shelf solvers in the pixel space. +Furthermore, although latent-based generative methods have shown great success +in recent years, this particular model type remains underexplored in this area. +In this work, we propose to apply flow matching in the latent spaces of +pretrained autoencoders, which offers improved computational efficiency and +scalability for high-resolution image synthesis. This enables flow-matching +training on constrained computational resources while maintaining their quality +and flexibility. Additionally, our work stands as a pioneering contribution in +the integration of various conditions into flow matching for conditional +generation tasks, including label-conditioned image generation, image +inpainting, and semantic-to-image generation. Through extensive experiments, +our approach demonstrates its effectiveness in both quantitative and +qualitative results on various datasets, such as CelebA-HQ, FFHQ, LSUN Church & +Bedroom, and ImageNet. We also provide a theoretical control of the +Wasserstein-2 distance between the reconstructed latent flow distribution and +true data distribution, showing it is upper-bounded by the latent flow matching +objective. Our code will be available at +https://github.com/VinAIResearch/LFM.git. + +
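+ A hedged sketch of a flow-matching objective computed in the latent space of a
+frozen, pretrained autoencoder, as the abstract describes. The straight-line
+probability path and the `encoder`/`velocity_net` names are common choices
+assumed for illustration; the paper's exact parameterization may differ.
+
+    import torch
+
+    def latent_flow_matching_loss(encoder, velocity_net, images, cond=None):
+        with torch.no_grad():
+            z1 = encoder(images)                  # data latents (frozen encoder)
+        z0 = torch.randn_like(z1)                 # noise latents
+        t = torch.rand(z1.size(0), *([1] * (z1.dim() - 1)), device=z1.device)
+        zt = (1 - t) * z0 + t * z1                # point on the straight-line path
+        target = z1 - z0                          # velocity d z_t / d t of that path
+        pred = velocity_net(zt, t.flatten(), cond)
+        return torch.mean((pred - target) ** 2)
+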
+
+ comment: Project Page: https://vinairesearch.github.io/LFM/ +
+
+
+
+
+ + ☆ Neural Video Depth Stabilizer ICCV2023 + + +
+ Video depth estimation aims to infer temporally consistent depth. Some +methods achieve temporal consistency by finetuning a single-image depth model +during test time using geometry and re-projection constraints, which is +inefficient and not robust. An alternative approach is to learn how to enforce +temporal consistency from data, but this requires well-designed models and +sufficient video depth data. To address these challenges, we propose a +plug-and-play framework called Neural Video Depth Stabilizer (NVDS) that +stabilizes inconsistent depth estimations and can be applied to different +single-image depth models without extra effort. We also introduce a large-scale +dataset, Video Depth in the Wild (VDW), which consists of 14,203 videos with +over two million frames, making it the largest natural-scene video depth +dataset to our knowledge. We evaluate our method on the VDW dataset as well as +two public benchmarks and demonstrate significant improvements in consistency, +accuracy, and efficiency compared to previous approaches. Our work serves as a +solid baseline and provides a data foundation for learning-based video depth +models. We will release our dataset and code for future research. + +
+
+ comment: Our paper is accepted by ICCV2023 +
+
+
+
+
+ + ☆ SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor + Defect Classification and Segmentation + + +
+ With the continuous progression of Moore's Law, integrated circuit (IC)
+device complexity is also increasing. Scanning Electron Microscope (SEM) image
+based extensive defect inspection and accurate metrology extraction are two
+main challenges in advanced node (2 nm and beyond) technology. Deep learning
+(DL) algorithm based computer vision approaches have gained popularity in
+semiconductor defect inspection over the last few years. In this research work,
+a new semiconductor defect inspection framework "SEMI-DiffusionInst" is
+investigated and compared to previous frameworks. To the best of the authors'
+knowledge, this work is the first demonstration to accurately detect and
+precisely segment semiconductor defect patterns by using a diffusion model.
+Different feature extractor networks as backbones and data sampling strategies
+are investigated towards achieving a balanced trade-off between precision and
+computing efficiency. Our proposed approach outperforms previous work on
+overall mAP and performs comparably or better for almost all defect classes
+(per-class APs). The bounding box and segmentation mAPs achieved by the
+proposed SEMI-DiffusionInst model are improved by 3.83% and 2.10%,
+respectively. Among individual defect types, precision on line collapse and
+thin bridge defects is improved by approximately 15% on the detection task. It
+has also been shown that by tuning inference hyperparameters, inference time
+can be improved significantly without compromising model precision. Finally,
+certain limitations and a future work strategy to overcome them are discussed.
+
+
+
+ comment: 6 pages, 5 figures, To be published by IEEE in the proceedings of the + 2023 ELMAR conference +
+
+
+
+
+ + ☆ Implementation of a perception system for autonomous vehicles using a + detection-segmentation network in SoC FPGA + + +
+ Perception and control systems for autonomous vehicles are an active area of
+scientific and industrial research. These solutions should be characterised by
+high efficiency in recognising obstacles and other environmental elements in
+different road conditions, real-time capability, and energy efficiency.
+Achieving such functionality requires an appropriate algorithm and a suitable
+computing platform. In this paper, we have used the MultiTaskV3
+detection-segmentation network as the basis for a perception system that can
+perform both functionalities within a single architecture. It was appropriately
+trained, quantised, and implemented on the AMD Xilinx Kria KV260 Vision AI
+embedded platform. By using this device, it was possible to parallelise and
+accelerate the computations. Furthermore, the whole system consumes relatively
+little power compared to a CPU-based implementation (an average of 5 watts,
+compared to a minimum of 55 watts for weaker CPUs), and the small size (119mm x
+140mm x 36mm) of the platform allows it to be used in devices where the amount
+of space available is limited. It also achieves an accuracy higher than 97%
+mAP (mean average precision) for object detection and above 90% mIoU (mean
+intersection over union) for image segmentation. The article also details the
+design of the Mecanum wheel vehicle, which was used to test the proposed
+solution in a mock-up city.
+
+
+
+ comment: The paper was accepted for the 19th International Symposium on + Applied Reconfigurable Computing - ARC 2023, Cottbus - Germany +
+
+
+
+
+ + ☆ CohortFinder: an open-source tool for data-driven partitioning of + biomedical image cohorts to yield robust machine learning models + + +
+ Batch effects (BEs) refer to systematic technical differences in data +collection unrelated to biological variations whose noise is shown to +negatively impact machine learning (ML) model generalizability. Here we release +CohortFinder, an open-source tool aimed at mitigating BEs via data-driven +cohort partitioning. We demonstrate CohortFinder improves ML model performance +in downstream medical image processing tasks. CohortFinder is freely available +for download at cohortfinder.com. + +
+
+ comment: 26 pages, 9 figures, 4 tables. Abstract was accepted by European + Society of Digital and Integrative Pathology (ESDIP), Germany, 2022 +
+
+
+
+
+ + ☆ Quaternion Convolutional Neural Networks: Current Advances and Future + Directions + + +
+ Since their first applications, Convolutional Neural Networks (CNNs) have +solved problems that have advanced the state-of-the-art in several domains. +CNNs represent information using real numbers. Despite encouraging results, +theoretical analysis shows that representations such as hyper-complex numbers +can achieve richer representational capacities than real numbers, and that +Hamilton products can capture intrinsic interchannel relationships. Moreover, +in the last few years, experimental research has shown that Quaternion-Valued +CNNs (QCNNs) can achieve similar performance with fewer parameters than their +real-valued counterparts. This paper condenses research in the development of +QCNNs from its very beginnings. We propose a conceptual organization of current +trends and analyze the main building blocks used in the design of QCNN models. +Based on this conceptual organization, we propose future directions of +research. + +
+
+
+
+
+ + ☆ PolyGNN: Polyhedron-based Graph Neural Network for 3D Building + Reconstruction from Point Clouds + + +
+ We present PolyGNN, a polyhedron-based graph neural network for 3D building +reconstruction from point clouds. PolyGNN learns to assemble primitives +obtained by polyhedral decomposition via graph node classification, achieving a +watertight, compact, and weakly semantic reconstruction. To effectively +represent arbitrary-shaped polyhedra in the neural network, we propose three +different sampling strategies to select representative points as +polyhedron-wise queries, enabling efficient occupancy inference. Furthermore, +we incorporate the inter-polyhedron adjacency to enhance the classification of +the graph nodes. We also observe that existing city-building models are +abstractions of the underlying instances. To address this abstraction gap and +provide a fair evaluation of the proposed method, we develop our method on a +large-scale synthetic dataset covering 500k+ buildings with well-defined ground +truths of polyhedral class labels. We further conduct a transferability +analysis across cities and on real-world point clouds. Both qualitative and +quantitative results demonstrate the effectiveness of our method, particularly +its efficiency for large-scale reconstructions. The source code and data of our +work are available at https://github.com/chenzhaiyu/polygnn. + +
+
+
+
+
+ + ☆ Deficiency-Aware Masked Transformer for Video Inpainting + + +
+ Recent video inpainting methods have made remarkable progress by utilizing
+explicit guidance, such as optical flow, to propagate cross-frame pixels.
+However, there are cases where cross-frame recurrence of the masked video is
+not available, resulting in a deficiency. In such situations, instead of
+borrowing pixels from other frames, the focus of the model shifts towards
+addressing the inverse problem. In this paper, we introduce a
+dual-modality-compatible inpainting framework called Deficiency-aware Masked
+Transformer (DMT), which offers three key advantages. Firstly, we pretrain an
+image inpainting model, DMT_img, to serve as a prior for distilling the video
+model DMT_vid, thereby benefiting the hallucination of deficiency cases.
+Secondly, the self-attention module selectively incorporates spatiotemporal
+tokens to accelerate inference and remove noise signals. Thirdly, a simple yet
+effective Receptive Field Contextualizer is integrated into DMT, further
+improving performance. Extensive experiments conducted on YouTube-VOS and DAVIS
+datasets demonstrate that DMT_vid significantly outperforms previous solutions.
+The code and video demonstrations can be found at github.com/yeates/DMT.
+
+
+
+
+
+
+ + ☆ Benchmarking fixed-length Fingerprint Representations across different + Embedding Sizes and Sensor Types + + +
+ Traditional minutiae-based fingerprint representations consist of a +variable-length set of minutiae. This necessitates a more complex comparison +causing the drawback of high computational cost in one-to-many comparison. +Recently, deep neural networks have been proposed to extract fixed-length +embeddings from fingerprints. In this paper, we explore to what extent +fingerprint texture information contained in such embeddings can be reduced in +terms of dimension while preserving high biometric performance. This is of +particular interest since it would allow to reduce the number of operations +incurred at comparisons. We also study the impact in terms of recognition +performance of the fingerprint textural information for two sensor types, i.e. +optical and capacitive. Furthermore, the impact of rotation and translation of +fingerprint images on the extraction of fingerprint embeddings is analysed. +Experimental results conducted on a publicly available database reveal an +optimal embedding size of 512 feature elements for the texture-based embedding +part of fixed-length fingerprint representations. In addition, differences in +performance between sensor types can be perceived. + +
+
+
+
+
+ + ☆ Multimodal Diffusion Segmentation Model for Object Segmentation from + Manipulation Instructions IROS2023 + + +
+ In this study, we aim to develop a model that comprehends a natural language
+instruction (e.g., "Go to the living room and get the nearest pillow to the
+radio art on the wall") and generates a segmentation mask for the target
+everyday object. The task is challenging because it requires (1) the
+understanding of the referring expressions for multiple objects in the
+instruction, (2) the prediction of the target phrase of the sentence among the
+multiple phrases, and (3) the generation of pixel-wise segmentation masks
+rather than bounding boxes. Studies have been conducted on language-based
+segmentation methods; however, they sometimes mask irrelevant regions for
+complex sentences. In this paper, we propose the Multimodal Diffusion
+Segmentation Model (MDSM), which generates a mask in the first stage and
+refines it in the second stage. We introduce a cross-modal parallel feature
+extraction mechanism and extend diffusion probabilistic models to handle
+cross-modal features. To validate our model, we built a new dataset based on
+the well-known Matterport3D and REVERIE datasets. This dataset consists of
+instructions with complex referring expressions accompanied by real indoor
+environmental images that feature various target objects, in addition to
+pixel-wise segmentation masks. The performance of MDSM surpassed that of the
+baseline method by a large margin of +10.13 mean IoU.
+
+
+
+ comment: Accepted for presentation at IROS2023 +
+
+
+
+
+ + ☆ Identity-Preserving Aging of Face Images via Latent Diffusion Models + + +
+ The performance of automated face recognition systems is inevitably impacted
+by the facial aging process. However, high-quality datasets of individuals
+collected over several years are typically small in scale. In this work, we
+propose, train, and validate the use of latent text-to-image diffusion models
+for synthetically aging and de-aging face images. Our models succeed with
+few-shot training, and have the added benefit of being controllable via
+intuitive textual prompting. We observe high degrees of visual realism in the
+generated images while maintaining biometric fidelity measured by commonly used
+metrics. We evaluate our method on two benchmark datasets (CelebA and AgeDB)
+and observe a significant reduction (~44%) in the False Non-Match Rate compared
+to existing state-of-the-art baselines.
+
+
+
+ comment: Accepted to appear in International Joint Conference in Biometrics + (IJCB) 2023 +
+
+
+
+
+ + ☆ BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs + + +
+ LLMs have demonstrated remarkable abilities at interacting with humans
+through language, especially with the usage of instruction-following data.
+Recent advancements in LLMs, such as MiniGPT-4, LLaVA, and X-LLM, further
+enlarge their abilities by incorporating multi-modal inputs, including image,
+video, and speech. Despite their effectiveness at generating precise and
+detailed language understanding of the given modality signal, these LLMs give
+up the ability to ground specific parts of inputs, thus only constructing a
+coarse-grained mapping. However, explicit and informative correspondence
+between text and other modalities will not only improve the user experience but
+also help to expand the application scenarios of multi-modal LLMs. Therefore,
+we propose BuboGPT, a multi-modal LLM with visual grounding that can perform
+cross-modal interaction between vision, audio and language, providing
+fine-grained understanding of visual objects and other given modalities. As a
+result, BuboGPT is able to point out the specific location of an object in the
+image when it is generating a response or description for that object. Our
+contributions are two-fold: 1) An off-the-shelf visual grounding module based
+on SAM that extracts entities in a sentence and finds the corresponding masks
+in the image. 2) A two-stage training scheme and an instruction dataset to
+endow joint text-image-audio understanding. Our experiments show that BuboGPT
+achieves impressive multi-modality understanding and visual grounding abilities
+during interactions with humans. It performs consistently well when provided
+with arbitrary modality combinations (either aligned or unaligned). Our code,
+model and dataset are available at https://bubo-gpt.github.io .
+
+
+
+
+
+
+ + ☆ Scale-Aware Modulation Meet Transformer ICCV 2023 + + +
+ This paper presents a new vision Transformer, Scale-Aware Modulation +Transformer (SMT), that can handle various downstream tasks efficiently by +combining the convolutional network and vision Transformer. The proposed +Scale-Aware Modulation (SAM) in the SMT includes two primary novel designs. +Firstly, we introduce the Multi-Head Mixed Convolution (MHMC) module, which can +capture multi-scale features and expand the receptive field. Secondly, we +propose the Scale-Aware Aggregation (SAA) module, which is lightweight but +effective, enabling information fusion across different heads. By leveraging +these two modules, convolutional modulation is further enhanced. Furthermore, +in contrast to prior works that utilized modulations throughout all stages to +build an attention-free network, we propose an Evolutionary Hybrid Network +(EHN), which can effectively simulate the shift from capturing local to global +dependencies as the network becomes deeper, resulting in superior performance. +Extensive experiments demonstrate that SMT significantly outperforms existing +state-of-the-art models across a wide range of visual tasks. Specifically, SMT +with 11.5M / 2.4GFLOPs and 32M / 7.7GFLOPs can achieve 82.2% and 84.3% top-1 +accuracy on ImageNet-1K, respectively. After pretrained on ImageNet-22K in +224^2 resolution, it attains 87.1% and 88.1% top-1 accuracy when finetuned with +resolution 224^2 and 384^2, respectively. For object detection with Mask R-CNN, +the SMT base trained with 1x and 3x schedule outperforms the Swin Transformer +counterpart by 4.2 and 1.3 mAP on COCO, respectively. For semantic segmentation +with UPerNet, the SMT base test at single- and multi-scale surpasses Swin by +2.0 and 1.1 mIoU respectively on the ADE20K. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ On the Fly Neural Style Smoothing for Risk-Averse Domain Generalization + + +
+ Achieving high accuracy on data from domains unseen during training is a +fundamental challenge in domain generalization (DG). While state-of-the-art DG +classifiers have demonstrated impressive performance across various tasks, they +have shown a bias towards domain-dependent information, such as image styles, +rather than domain-invariant information, such as image content. This bias +renders them unreliable for deployment in risk-sensitive scenarios such as +autonomous driving where a misclassification could lead to catastrophic +consequences. To enable risk-averse predictions from a DG classifier, we +propose a novel inference procedure, Test-Time Neural Style Smoothing (TT-NSS), +that uses a "style-smoothed" version of the DG classifier for prediction at +test time. Specifically, the style-smoothed classifier classifies a test image +as the most probable class predicted by the DG classifier on random +re-stylizations of the test image. TT-NSS uses a neural style transfer module +to stylize a test image on the fly, requires only black-box access to the DG +classifier, and crucially, abstains when predictions of the DG classifier on +the stylized test images lack consensus. Additionally, we propose a neural +style smoothing (NSS) based training procedure that can be seamlessly +integrated with existing DG methods. This procedure enhances prediction +consistency, improving the performance of TT-NSS on non-abstained samples. Our +empirical results demonstrate the effectiveness of TT-NSS and NSS at producing +and improving risk-averse predictions on unseen domains from DG classifiers +trained with SOTA training methods on various benchmark datasets and their +variations. + +
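+ A rough sketch of the test-time procedure described above, assuming a
+black-box `classifier`, a `style_transfer(image, style)` module, and a simple
+majority-fraction threshold as the consensus rule (the paper's exact abstention
+criterion may differ).
+
+    import torch
+
+    def tt_nss_predict(classifier, style_transfer, image, styles, tau=0.75):
+        votes = []
+        for style in styles:
+            stylized = style_transfer(image, style)           # re-stylize on the fly
+            votes.append(classifier(stylized).argmax(dim=-1))
+        votes = torch.stack(votes)                            # (num_styles, batch)
+        majority = torch.mode(votes, dim=0).values            # most frequent class
+        consensus = (votes == majority).float().mean(dim=0)   # agreement fraction
+        abstain = consensus < tau                             # abstain without consensus
+        return majority, abstain
+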
+
+
+
+
+ + ☆ Improving Data Efficiency for Plant Cover Prediction with Label + Interpolation and Monte-Carlo Cropping + + +
+ The plant community composition is an essential indicator of environmental
+changes and is, for this reason, usually analyzed in ecological field studies
+in terms of the so-called plant cover. The manual acquisition of this kind of
+data is time-consuming, laborious, and prone to human error. Automated camera
+systems can collect high-resolution images of the surveyed vegetation plots at
+a high frequency. In combination with subsequent algorithmic analysis, it is
+possible to objectively extract information on plant community composition
+quickly and with little human effort. An automated camera system can easily
+collect the large amounts of image data necessary to train a Deep Learning
+system for automatic analysis. However, due to the amount of work required to
+annotate vegetation images with plant cover data, only a few labeled samples
+are available. As automated camera systems can collect many pictures without
+labels, we introduce an approach to interpolate the sparse labels in the
+collected vegetation plot time series down to the intermediate dense and
+unlabeled images to artificially increase our training dataset to seven times
+its original size. Moreover, we introduce a new method we call Monte-Carlo
+Cropping. This approach trains on a collection of cropped parts of the training
+images to deal with high-resolution images efficiently, implicitly augment the
+training images, and speed up training. We evaluate both approaches on a plant
+cover dataset containing images of herbaceous plant communities and find that
+our methods lead to improvements in the species, community, and segmentation
+metrics investigated.
+
+
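+ An illustrative sketch of the Monte-Carlo Cropping idea (not the authors'
+code): instead of feeding full high-resolution images, train on several random
+crops per image and average their losses. Crop size, crop count, and the reuse
+of the image-level target for every crop are assumptions made for the example.
+
+    import torch
+
+    def monte_carlo_crop_loss(model, loss_fn, images, targets, crop=256, n_crops=4):
+        _, _, h, w = images.shape
+        losses = []
+        for _ in range(n_crops):
+            top = torch.randint(0, h - crop + 1, (1,)).item()
+            left = torch.randint(0, w - crop + 1, (1,)).item()
+            patch = images[:, :, top:top + crop, left:left + crop]
+            losses.append(loss_fn(model(patch), targets))
+        return torch.stack(losses).mean()
+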
+
+ comment: Accepted for publication at DAGM-GCPR 2023 +
+
+
+
+
+ + ☆ Reconstructed Convolution Module Based Look-Up Tables for Efficient + Image Super-Resolution + + +
+ Look-up table (LUT)-based methods have shown great efficacy in the single
+image super-resolution (SR) task. However, previous methods ignore the
+essential reason for the restricted receptive field (RF) size in LUTs, which is
+caused by the interaction of space and channel features in vanilla convolution.
+They can only increase the RF at the cost of linearly increasing LUT size. To
+enlarge the RF while keeping LUT sizes contained, we propose a novel
+Reconstructed Convolution (RC) module, which decouples channel-wise and spatial
+calculation. It can be formulated as $n^2$ 1D LUTs to maintain an $n\times n$
+receptive field, which is significantly smaller than the $n\times n$D LUT
+formulated previously. The LUT generated by our RC module requires less than
+1/10000 of the storage of the SR-LUT baseline. The proposed Reconstructed
+Convolution module based LUT method, termed RCLUT, can enlarge the RF size by 9
+times over the state-of-the-art LUT-based SR method and achieves superior
+performance on five popular benchmark datasets. Moreover, the efficient and
+robust RC module can be used as a plugin to improve other LUT-based SR methods.
+The code is available at https://github.com/liuguandu/RC-LUT.
+
+
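+ A toy illustration of the decoupling idea (quantization and the exact network
+design are omitted; this is not the released RCLUT code): rather than one
+$n\times n$-dimensional LUT indexed jointly by all pixels in the receptive
+field, keep $n^2$ independent 1D LUTs, apply each to its own position, and
+aggregate the responses.
+
+    import numpy as np
+
+    def rc_lut_apply(patch, luts):
+        # patch: (n, n) uint8 pixels; luts: (n*n, 256) per-position 1D lookup tables.
+        responses = [luts[i][int(p)] for i, p in enumerate(patch.flatten())]
+        return float(np.mean(responses))    # aggregate the per-position responses
+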
+
+
+
+
+ + ☆ Variational Probabilistic Fusion Network for RGB-T Semantic Segmentation + + +
+ RGB-T semantic segmentation has been widely adopted to handle hard scenes +with poor lighting conditions by fusing different modality features of RGB and +thermal images. Existing methods try to find an optimal fusion feature for +segmentation, resulting in sensitivity to modality noise, class-imbalance, and +modality bias. To overcome the problems, this paper proposes a novel +Variational Probabilistic Fusion Network (VPFNet), which regards fusion +features as random variables and obtains robust segmentation by averaging +segmentation results under multiple samples of fusion features. The random +samples generation of fusion features in VPFNet is realized by a novel +Variational Feature Fusion Module (VFFM) designed based on variation attention. +To further avoid class-imbalance and modality bias, we employ the weighted +cross-entropy loss and introduce prior information of illumination and category +to control the proposed VFFM. Experimental results on MFNet and PST900 datasets +demonstrate that the proposed VPFNet can achieve state-of-the-art segmentation +performance. + +
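+ A sketch of the "average over sampled fusion features" inference described
+above, assuming the fusion feature is modeled as a Gaussian via the
+reparameterization trick; the `mu`/`logvar`/`seg_head` names and the Gaussian
+form are assumptions, not necessarily the paper's exact formulation.
+
+    import torch
+
+    def vpf_inference(mu, logvar, seg_head, n_samples=8):
+        logits = 0.0
+        for _ in range(n_samples):
+            eps = torch.randn_like(mu)
+            fusion = mu + torch.exp(0.5 * logvar) * eps   # sample a fusion feature
+            logits = logits + seg_head(fusion)            # segment under this sample
+        return logits / n_samples                         # average the predictions
+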
+
+
+
+
+ + ☆ Multi-class point cloud completion networks for 3D cardiac anatomy + reconstruction from cine magnetic resonance images + + +
+ Cine magnetic resonance imaging (MRI) is the current gold standard for the +assessment of cardiac anatomy and function. However, it typically only acquires +a set of two-dimensional (2D) slices of the underlying three-dimensional (3D) +anatomy of the heart, thus limiting the understanding and analysis of both +healthy and pathological cardiac morphology and physiology. In this paper, we +propose a novel fully automatic surface reconstruction pipeline capable of +reconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI +acquisitions. Its key component is a multi-class point cloud completion network +(PCCN) capable of correcting both the sparsity and misalignment issues of the +3D reconstruction task in a unified model. We first evaluate the PCCN on a +large synthetic dataset of biventricular anatomies and observe Chamfer +distances between reconstructed and gold standard anatomies below or similar to +the underlying image resolution for multiple levels of slice misalignment. +Furthermore, we find a reduction in reconstruction error compared to a +benchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean +surface distance, respectively. We then apply the PCCN as part of our automated +reconstruction pipeline to 1000 subjects from the UK Biobank study in a +cross-domain transfer setting and demonstrate its ability to reconstruct +accurate and topologically plausible biventricular heart meshes with clinical +metrics comparable to the previous literature. Finally, we investigate the +robustness of our proposed approach and observe its capacity to successfully +handle multiple common outlier conditions. + +
+
+
+
+
+ + ☆ Multi-Domain Learning with Modulation Adapters + + +
+ Deep convolutional networks are ubiquitous in computer vision, due to their +excellent performance across different tasks for various domains. Models are, +however, often trained in isolation for each task, failing to exploit +relatedness between tasks and domains to learn more compact models that +generalise better in low-data regimes. Multi-domain learning aims to handle +related tasks, such as image classification across multiple domains, +simultaneously. Previous work on this problem explored the use of a pre-trained +and fixed domain-agnostic base network, in combination with smaller learnable +domain-specific adaptation modules. In this paper, we introduce Modulation +Adapters, which update the convolutional filter weights of the model in a +multiplicative manner for each task. Parameterising these adaptation weights in +a factored manner allows us to scale the number of per-task parameters in a +flexible manner, and to strike different parameter-accuracy trade-offs. We +evaluate our approach on the Visual Decathlon challenge, composed of ten image +classification tasks across different domains, and on the ImageNet-to-Sketch +benchmark, which consists of six image classification tasks. Our approach +yields excellent results, with accuracies that are comparable to or better than +those of existing state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Image Captions are Natural Prompts for Text-to-Image Models + + +
+ With the rapid development of Artificial Intelligence Generated Content +(AIGC), it has become common practice in many learning tasks to train or +fine-tune large models on synthetic data due to the data-scarcity and privacy +leakage problems. Albeit promising with unlimited data generation, owing to +massive and diverse information conveyed in real images, it is challenging for +text-to-image generative models to synthesize informative training data with +hand-crafted prompts, which usually leads to inferior generalization +performance when training downstream models. In this paper, we theoretically +analyze the relationship between the training effect of synthetic data and the +synthetic data distribution induced by prompts. Then we correspondingly propose +a simple yet effective method that prompts text-to-image generative models to +synthesize more informative and diverse training data. Specifically, we caption +each real image with the advanced captioning model to obtain informative and +faithful prompts that extract class-relevant information and clarify the +polysemy of class names. The image captions and class names are concatenated to +prompt generative models for training image synthesis. Extensive experiments on +ImageNette, ImageNet-100, and ImageNet-1K verify that our method significantly +improves the performance of models trained on synthetic training data, i.e., +10% classification accuracy improvements on average. + +
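+ A sketch of the prompt-construction recipe in the abstract, with assumed
+stand-in interfaces for the captioning and text-to-image models (the concrete
+models and the exact prompt template used by the paper may differ).
+
+    def build_prompt(class_name, caption):
+        # Concatenate the class name with the caption of a real image of that class.
+        return f"{class_name}, {caption}"
+
+    def synthesize_training_set(captioner, generator, real_images, class_name, n=4):
+        synthetic = []
+        for image in real_images:
+            caption = captioner(image)         # informative, class-relevant caption
+            prompt = build_prompt(class_name, caption)
+            synthetic.extend(generator(prompt, num_images=n))
+        return synthetic
+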
+
+ comment: 20 pages, 1 figure, 10 tables +
+
+
+
+
+ + ☆ Does Visual Pretraining Help End-to-End Reasoning? + + +
+ We aim to investigate whether end-to-end learning of visual reasoning can be +achieved with general-purpose neural networks, with the help of visual +pretraining. A positive result would refute the common belief that explicit +visual abstraction (e.g. object detection) is essential for compositional +generalization on visual reasoning, and confirm the feasibility of a neural +network "generalist" to solve visual recognition and reasoning tasks. We +propose a simple and general self-supervised framework which "compresses" each +video frame into a small set of tokens with a transformer network, and +reconstructs the remaining frames based on the compressed temporal context. To +minimize the reconstruction loss, the network must learn a compact +representation for each image, as well as capture temporal dynamics and object +permanence from temporal context. We perform evaluation on two visual reasoning +benchmarks, CATER and ACRE. We observe that pretraining is essential to achieve +compositional generalization for end-to-end visual reasoning. Our proposed +framework outperforms traditional supervised pretraining, including image +classification and explicit object detection, by large margins. + +
+
+
+
+
+ + ☆ BUS:Efficient and Effective Vision-language Pre-training with Bottom-Up + Patch Summarization ICCV2023 + + +
+ Vision Transformer (ViT) based Vision-Language Pre-training (VLP) models have
+demonstrated impressive performance in various tasks. However, the lengthy
+visual token sequences fed into ViT can lead to training inefficiency and
+ineffectiveness. Existing efforts address the challenge by either bottom-level
+patch extraction in the ViT backbone or top-level patch abstraction outside,
+not balancing training efficiency and effectiveness well. Inspired by text
+summarization in natural language processing, we propose a Bottom-Up Patch
+Summarization approach named BUS, coordinating bottom-level extraction and
+top-level abstraction to learn a concise summary of lengthy visual token
+sequences efficiently. Specifically, we incorporate a Text-Semantics-Aware
+Patch Selector (TSPS) into the ViT backbone to perform a coarse-grained visual
+token extraction and then attach a flexible Transformer-based Patch Abstraction
+Decoder (PAD) upon the backbone for top-level visual abstraction. This
+bottom-up collaboration enables our BUS to yield high training efficiency while
+maintaining or even improving effectiveness. We evaluate our approach on
+various visual-language understanding and generation tasks and show competitive
+downstream task performance while boosting the training efficiency by 50\%.
+Additionally, our model achieves state-of-the-art performance on many
+downstream tasks by increasing input image resolution without increasing
+computational costs over baselines.
+
+
+
+ comment: Accepted on ICCV2023 +
+
+
+
+
+ + ☆ Cumulative Spatial Knowledge Distillation for Vision Transformers ICCV 2023 + + +
+ Distilling knowledge from convolutional neural networks (CNNs) is a
+double-edged sword for vision transformers (ViTs). It boosts performance, since
+the image-friendly local inductive bias of the CNN helps the ViT learn faster
+and better, but it leads to two problems: (1) The network designs of CNNs and
+ViTs are completely different, which leads to different semantic levels of
+intermediate features, making spatial-wise knowledge transfer methods (e.g.,
+feature mimicking) inefficient. (2) Distilling knowledge from the CNN limits
+network convergence in the later training period, since the ViT's capability of
+integrating global information is suppressed by the CNN's local-inductive-bias
+supervision. To this end, we present Cumulative Spatial Knowledge Distillation
+(CSKD). CSKD distills spatial-wise knowledge to all patch tokens of the ViT
+from the corresponding spatial responses of the CNN, without introducing
+intermediate features. Furthermore, CSKD exploits a Cumulative Knowledge Fusion
+(CKF) module, which introduces the global response of the CNN and increasingly
+emphasizes its importance during training. Applying CKF leverages the CNN's
+local inductive bias in the early training period and gives full play to the
+ViT's global capability in the later one. Extensive experiments and analysis on
+ImageNet-1k and downstream datasets demonstrate the superiority of our CSKD.
+Code will be publicly available.
+
+
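+ A hedged sketch of the distillation idea described above: ViT patch tokens are
+matched to the spatially corresponding CNN responses, with a global CNN
+response blended in at a weight that grows over training. Shapes, the KL form,
+and the schedule of `alpha` are simplifications, not the paper's exact loss.
+
+    import torch.nn.functional as F
+
+    def cskd_loss(vit_patch_logits, cnn_spatial_logits, cnn_global_logits, alpha):
+        # vit_patch_logits, cnn_spatial_logits: (B, N, C); cnn_global_logits: (B, C)
+        # alpha in [0, 1] increases during training (cumulative fusion).
+        target = (1 - alpha) * cnn_spatial_logits \
+                 + alpha * cnn_global_logits.unsqueeze(1)
+        return F.kl_div(F.log_softmax(vit_patch_logits, dim=-1),
+                        F.softmax(target, dim=-1),
+                        reduction="batchmean")
+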
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SVDFormer: Complementing Point Cloud via Self-view Augmentation and + Self-structure Dual-generator ICCV2023 + + +
+ In this paper, we propose a novel network, SVDFormer, to tackle two specific +challenges in point cloud completion: understanding faithful global shapes from +incomplete point clouds and generating high-accuracy local structures. Current +methods either perceive shape patterns using only 3D coordinates or import +extra images with well-calibrated intrinsic parameters to guide the geometry +estimation of the missing parts. However, these approaches do not always fully +leverage the cross-modal self-structures available for accurate and +high-quality point cloud completion. To this end, we first design a Self-view +Fusion Network that leverages multiple-view depth image information to observe +incomplete self-shape and generate a compact global shape. To reveal highly +detailed structures, we then introduce a refinement module, called +Self-structure Dual-generator, in which we incorporate learned shape priors and +geometric self-similarities for producing new points. By perceiving the +incompleteness of each point, the dual-path design disentangles refinement +strategies conditioned on the structural type of each point. SVDFormer absorbs +the wisdom of self-structures, avoiding any additional paired information such +as color images with precisely calibrated camera intrinsic parameters. +Comprehensive experiments indicate that our method achieves state-of-the-art +performance on widely-used benchmarks. Code will be available at +https://github.com/czvvd/SVDFormer. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Differentiable Transportation Pruning ICCV 2023 + + +
+ Deep learning algorithms are increasingly employed at the edge. However, edge +devices are resource constrained and thus require efficient deployment of deep +neural networks. Pruning methods are a key tool for edge deployment as they can +improve storage, compute, memory bandwidth, and energy usage. In this paper we +propose a novel accurate pruning technique that allows precise control over the +output network size. Our method uses an efficient optimal transportation scheme +which we make end-to-end differentiable and which automatically tunes the +exploration-exploitation behavior of the algorithm to find accurate sparse +sub-networks. We show that our method achieves state-of-the-art performance +compared to previous pruning methods on 3 different datasets, using 5 different +models, across a wide range of pruning ratios, and with two types of sparsity +budgets and pruning granularities. + +
+
+ comment: ICCV 2023. arXiv admin note: text overlap with arXiv:2002.10179 by + other authors +
+
+
+
+
+ + ☆ SkeletonMAE: Graph-based Masked Autoencoder for Skeleton Sequence + Pre-training ICCV 2023 + + +
+ Skeleton sequence representation learning has shown great advantages for
+action recognition due to its promising ability to model human joints and
+topology. However, the current methods usually require sufficient labeled data
+for training computationally expensive models, which is labor-intensive and
+time-consuming. Moreover, these methods ignore how to utilize the fine-grained
+dependencies among different skeleton joints to pre-train an efficient skeleton
+sequence learning model that can generalize well across different datasets. In
+this paper, we propose an efficient skeleton sequence learning framework, named
+Skeleton Sequence Learning (SSL). To comprehensively capture the human pose and
+obtain a discriminative skeleton sequence representation, we build an
+asymmetric graph-based encoder-decoder pre-training architecture named
+SkeletonMAE, which embeds the skeleton joint sequence into a Graph
+Convolutional Network (GCN) and reconstructs the masked skeleton joints and
+edges based on prior human topology knowledge. Then, the pre-trained
+SkeletonMAE encoder is integrated with the Spatial-Temporal Representation
+Learning (STRL) module to build the SSL framework. Extensive experimental
+results show that our SSL generalizes well across different datasets and
+outperforms the state-of-the-art self-supervised skeleton-based action
+recognition methods on the FineGym, Diving48, NTU 60 and NTU 120 datasets.
+Additionally, we obtain comparable performance to some fully supervised
+methods. The code is available at https://github.com/HongYan1123/SkeletonMAE.
+
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ EGE-UNet: an Efficient Group Enhanced UNet for skin lesion segmentation MICCAI 2023 + + +
+ Transformer and its variants have been widely used for medical image +segmentation. However, the large number of parameter and computational load of +these models make them unsuitable for mobile health applications. To address +this issue, we propose a more efficient approach, the Efficient Group Enhanced +UNet (EGE-UNet). We incorporate a Group multi-axis Hadamard Product Attention +module (GHPA) and a Group Aggregation Bridge module (GAB) in a lightweight +manner. The GHPA groups input features and performs Hadamard Product Attention +mechanism (HPA) on different axes to extract pathological information from +diverse perspectives. The GAB effectively fuses multi-scale information by +grouping low-level features, high-level features, and a mask generated by the +decoder at each stage. Comprehensive experiments on the ISIC2017 and ISIC2018 +datasets demonstrate that EGE-UNet outperforms existing state-of-the-art +methods. In short, compared to the TransFuse, our model achieves superior +segmentation performance while reducing parameter and computation costs by 494x +and 160x, respectively. Moreover, to our best knowledge, this is the first +model with a parameter count limited to just 50KB. Our code is available at +https://github.com/JCruan519/EGE-UNet. + +
+
+ comment: 10 pages, 4 figures, 2 tables. This paper has been early accepted by + MICCAI 2023 and has received the MICCAI Student-Author Registration (STAR) + Award +
+
+
+
+
+ + ☆ Riesz feature representation: scale equivariant scattering network for + classification tasks + + +
+ Scattering networks yield powerful and robust hierarchical image descriptors
+which do not require lengthy training and which work well with very little
+training data. However, they rely on sampling the scale dimension. Hence, they
+become sensitive to scale variations and are unable to generalize to unseen
+scales. In this work, we define an alternative feature representation based on
+the Riesz transform. We detail and analyze the mathematical foundations behind
+this representation. In particular, it inherits scale equivariance from the
+Riesz transform and completely avoids sampling of the scale dimension.
+Additionally, the number of features in the representation is reduced by a
+factor of four compared to scattering networks. Nevertheless, our representation
+performs comparably well for texture classification with an interesting
+addition: scale equivariance. Our method yields superior performance when
+dealing with scales outside of those covered by the training dataset. The
+usefulness of the equivariance property is demonstrated on the digit
+classification task, where accuracy remains stable even for scales four times
+larger than the one chosen for training. As a second example, we consider
+classification of textures.
+
+&#13;
+
+
+
+
+ + ☆ Classification of UHF Partial Discharge Signals in Gas-Insulated HVDC + Systems Using Neural Networks + + +
+ Undetected partial discharges (PDs) are a safety critical issue in high +voltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC +voltage is well-established, the analysis of PDs under DC voltage remains an +active research field. A key focus of these investigations is the +classification of different PD sources to enable subsequent sophisticated +analysis. + In this paper, we propose and analyze a neural network-based approach for +classifying PD signals caused by metallic protrusions and conductive particles +on the insulator of HVDC GIS, without relying on pulse sequence analysis +features. In contrast to previous approaches, our proposed model can +discriminate the studied PD signals obtained at negative and positive +potentials, while also generalizing to unseen operating voltage multiples. +Additionally, we compare the performance of time- and frequency-domain input +signals and explore the impact of different normalization schemes to mitigate +the influence of free-space path loss between the sensor and defect location. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Domain Adaptation using Silver Standard Masks for Lateral Ventricle + Segmentation in FLAIR MRI + + +
+ Lateral ventricular volume (LVV) is an important biomarker for clinical
+investigation. We present the first transfer learning-based LVV segmentation
+method for fluid-attenuated inversion recovery (FLAIR) MRI. To mitigate
+covariate shifts between source and target domains, this work proposes a
+domain adaptation method that optimizes performance on three target datasets.
+Silver standard (SS) masks were generated from the target domain using a novel
+conventional image processing ventricular segmentation algorithm and used to
+supplement the gold standard (GS) data from the source domain, the Canadian
+Atherosclerosis Imaging Network (CAIN). Four models were tested on held-out
+test sets from four datasets: 1) SS+GS: trained on target SS masks and
+fine-tuned on source GS masks, 2) GS+SS: trained on source GS masks and
+fine-tuned on target SS masks, 3) trained on source GS (GS CAIN Only) and 4)
+trained on target SS masks (SS Only). The SS+GS model had the best and most
+consistent performance (mean DSC = 0.89, CoV = 0.05) and showed significantly
+(p < 0.05) higher DSC compared to the GS-only model on three target domains.
+Results suggest that pre-training with noisy labels from the target domain
+allows the model to adapt to the dataset-specific characteristics and provides
+robust parameter initialization, while fine-tuning with GS masks allows the
+model to learn detailed features. This method has wide application to other
+medical imaging problems where labeled data is scarce, and can be used as a
+per-dataset calibration method to accelerate wide-scale adoption.
+
+&#13;
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ Not All Steps are Created Equal: Selective Diffusion Distillation for + Image Manipulation + + +
+ Conditional diffusion models have demonstrated impressive performance in +image manipulation tasks. The general pipeline involves adding noise to the +image and then denoising it. However, this method faces a trade-off problem: +adding too much noise affects the fidelity of the image while adding too little +affects its editability. This largely limits their practical applicability. In +this paper, we propose a novel framework, Selective Diffusion Distillation +(SDD), that ensures both the fidelity and editability of images. Instead of +directly editing images with a diffusion model, we train a feedforward image +manipulation network under the guidance of the diffusion model. Besides, we +propose an effective indicator to select the semantic-related timestep to +obtain the correct semantic guidance from the diffusion model. This approach +successfully avoids the dilemma caused by the diffusion process. Our extensive +experiments demonstrate the advantages of our framework. Code is released at +https://github.com/AndysonYs/Selective-Diffusion-Distillation. + +
+
+
+
+
+ + ☆ DOT: A Distillation-Oriented Trainer ICCV 2023 + + +
+ Knowledge distillation transfers knowledge from a large model to a small one
+via task and distillation losses. In this paper, we observe a trade-off between
+task and distillation losses, i.e., introducing distillation loss limits the
+convergence of task loss. We believe that the trade-off results from the
+insufficient optimization of distillation loss. The reason is that the teacher
+has a lower task loss than the student; since a lower distillation loss drives
+the student closer to the teacher, a better-converged task loss could be
+obtained. To break the trade-off, we propose the Distillation-Oriented Trainer
+(DOT). DOT separately considers gradients of task and distillation losses, then
+applies a larger momentum to the distillation loss to accelerate its
+optimization. We empirically prove that DOT breaks the trade-off, i.e., both
+losses are sufficiently optimized. Extensive experiments validate the
+superiority of DOT. Notably, DOT achieves a +2.59% accuracy improvement on
+ImageNet-1k for the ResNet50-MobileNetV1 pair. In conclusion, DOT greatly
+benefits the student's optimization properties in terms of loss convergence and
+model generalization. Code will be made publicly available.
+
+&#13;
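To make the "larger momentum for the distillation gradients" idea concrete, here is a minimal sketch of such an update step in PyTorch. It illustrates the mechanism as described in the abstract rather than the authors' released implementation; the function name, hyperparameter values, and gradient-passing convention are assumptions.

```python
import torch

def dot_style_update(params, task_grads, kd_grads, state, lr=0.1,
                     mu_task=0.9, mu_kd=0.99):
    """Keep separate momentum buffers for task-loss and distillation-loss
    gradients; the distillation buffer gets a larger momentum coefficient so
    that loss is optimized more aggressively (a sketch, not the official DOT)."""
    for i, (p, g_t, g_k) in enumerate(zip(params, task_grads, kd_grads)):
        buf_t = state.setdefault(("task", i), torch.zeros_like(p))
        buf_k = state.setdefault(("kd", i), torch.zeros_like(p))
        buf_t.mul_(mu_task).add_(g_t)   # ordinary momentum for the task loss
        buf_k.mul_(mu_kd).add_(g_k)     # larger momentum for the distillation loss
        p.data.add_(buf_t + buf_k, alpha=-lr)

# Example usage (state must persist across iterations):
# state = {}
# task_grads = torch.autograd.grad(task_loss, params, retain_graph=True)
# kd_grads   = torch.autograd.grad(kd_loss, params)
# dot_style_update(params, task_grads, kd_grads, state)
```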
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Dense Affinity Matching for Few-Shot Segmentation + + +
+ Few-Shot Segmentation (FSS) aims to segment the novel class images with a few +annotated samples. In this paper, we propose a dense affinity matching (DAM) +framework to exploit the support-query interaction by densely capturing both +the pixel-to-pixel and pixel-to-patch relations in each support-query pair with +the bidirectional 3D convolutions. Different from the existing methods that +remove the support background, we design a hysteretic spatial filtering module +(HSFM) to filter the background-related query features and retain the +foreground-related query features with the assistance of the support +background, which is beneficial for eliminating interference objects in the +query background. We comprehensively evaluate our DAM on ten benchmarks under +cross-category, cross-dataset, and cross-domain FSS tasks. Experimental results +demonstrate that DAM performs very competitively under different settings with +only 0.68M parameters, especially under cross-domain FSS tasks, showing its +effectiveness and efficiency. + +
+
+
+
+
+ + ☆ Divide&Classify: Fine-Grained Classification for City-Wide Visual Place + Recognition ICCV23 + + +
+ Visual place recognition is commonly addressed as an image retrieval problem.
+However, retrieval methods are impractical to scale to large datasets densely
+sampled from city-wide maps, since their size negatively impacts inference
+time. Using approximate nearest neighbour search for retrieval helps to
+mitigate this issue, at the cost of a performance drop. In this paper, we
+investigate whether we can effectively approach this task as a classification
+problem, thus bypassing the need for a similarity search. We find that existing
+classification methods for coarse, planet-wide localization are not suitable
+for the fine-grained and city-wide setting. This is largely due to how the
+dataset is split into classes, because these methods are designed to handle a
+sparse distribution of photos and as such do not consider the visual aliasing
+problem across neighbouring classes that naturally arises in dense scenarios.
+Thus, we propose a partitioning scheme that enables fast and accurate
+inference while preserving a simple learning procedure, and a novel inference
+pipeline based on an ensemble of classifiers that use the prototypes
+learned via an angular margin loss. Our method, Divide&Classify (D&C), enjoys
+the fast inference of classification solutions and an accuracy competitive with
+retrieval methods on the fine-grained, city-wide setting. Moreover, we show
+that D&C can be paired with existing retrieval pipelines to speed up
+computations by over 20 times while increasing their recall, leading to new
+state-of-the-art results.
+
+&#13;
+
+ comment: Accepted to ICCV23 +
+
+
+
+
+ + ☆ Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active + Learning + + +
+ We propose a novel semi-supervised active learning (SSAL) framework for
+monocular 3D object detection with LiDAR guidance (MonoLiG), which leverages
+all modalities of collected data during model development. We utilize LiDAR to
+guide the data selection and training of monocular 3D detectors without
+introducing any overhead in the inference phase. During training, we leverage a
+LiDAR-teacher, monocular-student cross-modal framework from semi-supervised
+learning to distill information from unlabeled data as pseudo-labels. To handle
+the differences in sensor characteristics, we propose a data noise-based
+weighting mechanism to reduce the effect of propagating noise from the LiDAR
+modality to the monocular one. For selecting which samples to label to improve
+the model performance, we propose a sensor consistency-based selection score
+that is also coherent with the training objective. Extensive experimental
+results on the KITTI and Waymo datasets verify the effectiveness of our
+proposed framework. In particular, our selection strategy consistently
+outperforms state-of-the-art active learning baselines, yielding up to 17%
+higher savings in labeling costs. Our training strategy attains the top place
+on the official KITTI 3D and bird's-eye-view (BEV) monocular object detection
+benchmarks by improving the BEV Average Precision (AP) by 2.02.
+
+&#13;
+
+
+
+
+ + ☆ Active Learning for Object Detection with Non-Redundant Informative + Sampling + + +
+ Curating an informative and representative dataset is essential for enhancing +the performance of 2D object detectors. We present a novel active learning +sampling strategy that addresses both the informativeness and diversity of the +selections. Our strategy integrates uncertainty and diversity-based selection +principles into a joint selection objective by measuring the collective +information score of the selected samples. Specifically, our proposed NORIS +algorithm quantifies the impact of training with a sample on the +informativeness of other similar samples. By exclusively selecting samples that +are simultaneously informative and distant from other highly informative +samples, we effectively avoid redundancy while maintaining a high level of +informativeness. Moreover, instead of utilizing whole image features to +calculate distances between samples, we leverage features extracted from +detected object regions within images to define object features. This allows us +to construct a dataset encompassing diverse object types, shapes, and angles. +Extensive experiments on object detection and image classification tasks +demonstrate the effectiveness of our strategy over the state-of-the-art +baselines. Specifically, our selection strategy achieves a 20% and 30% +reduction in labeling costs compared to random selection for PASCAL-VOC and +KITTI, respectively. + +
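As a rough illustration of the "informative but non-redundant" selection principle described above, the sketch below greedily picks high-scoring samples that are sufficiently far from already-selected ones in (object-level) feature space. It is a simplified stand-in under assumed inputs (per-sample informativeness scores and feature vectors), not the paper's NORIS algorithm.

```python
import numpy as np

def greedy_non_redundant_selection(scores, features, budget, min_dist):
    """Pick up to `budget` samples: most informative first, skipping any sample
    closer than `min_dist` to something already selected (illustrative only)."""
    order = np.argsort(-np.asarray(scores))
    picked = []
    for i in order:
        if len(picked) >= budget:
            break
        if all(np.linalg.norm(features[i] - features[j]) >= min_dist for j in picked):
            picked.append(int(i))
    return picked

# Example usage with random stand-in data:
# idx = greedy_non_redundant_selection(np.random.rand(1000),
#                                      np.random.rand(1000, 128),
#                                      budget=50, min_dist=2.0)
```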
+
+
+
+
+ + ☆ CLIP-Guided StyleGAN Inversion for Text-Driven Real Image Editing + + +
+ Researchers have recently begun exploring the use of StyleGAN-based models +for real image editing. One particularly interesting application is using +natural language descriptions to guide the editing process. Existing approaches +for editing images using language either resort to instance-level latent code +optimization or map predefined text prompts to some editing directions in the +latent space. However, these approaches have inherent limitations. The former +is not very efficient, while the latter often struggles to effectively handle +multi-attribute changes. To address these weaknesses, we present CLIPInverter, +a new text-driven image editing approach that is able to efficiently and +reliably perform multi-attribute changes. The core of our method is the use of +novel, lightweight text-conditioned adapter layers integrated into pretrained +GAN-inversion networks. We demonstrate that by conditioning the initial +inversion step on the CLIP embedding of the target description, we are able to +obtain more successful edit directions. Additionally, we use a CLIP-guided +refinement step to make corrections in the resulting residual latent codes, +which further improves the alignment with the text prompt. Our method +outperforms competing approaches in terms of manipulation accuracy and +photo-realism on various domains including human faces, cats, and birds, as +shown by our qualitative and quantitative results. + +
+
+ comment: Accepted for publication in ACM Transactions on Graphics +
+
+
+
+
+ + ☆ Dynamic Snake Convolution based on Topological Geometric Constraints for + Tubular Structure Segmentation ICCV 2023 + + +
+ Accurate segmentation of topological tubular structures, such as blood +vessels and roads, is crucial in various fields, ensuring accuracy and +efficiency in downstream tasks. However, many factors complicate the task, +including thin local structures and variable global morphologies. In this work, +we note the specificity of tubular structures and use this knowledge to guide +our DSCNet to simultaneously enhance perception in three stages: feature +extraction, feature fusion, and loss constraint. First, we propose a dynamic +snake convolution to accurately capture the features of tubular structures by +adaptively focusing on slender and tortuous local structures. Subsequently, we +propose a multi-view feature fusion strategy to complement the attention to +features from multiple perspectives during feature fusion, ensuring the +retention of important information from different global morphologies. Finally, +a continuity constraint loss function, based on persistent homology, is +proposed to constrain the topological continuity of the segmentation better. +Experiments on 2D and 3D datasets show that our DSCNet provides better accuracy +and continuity on the tubular structure segmentation task compared with several +methods. Our codes will be publicly available. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Distributed bundle adjustment with block-based sparse matrix compression + for super large scale datasets ICCV2023 + + +
+ We propose a distributed bundle adjustment (DBA) method using the exact
+Levenberg-Marquardt (LM) algorithm for super large-scale datasets. Most of the
+existing methods partition the global map into small ones and conduct bundle
+adjustment in the submaps. In order to fit the parallel framework, they use
+approximate solutions instead of the LM algorithm. However, those methods often
+give sub-optimal results. Different from them, we utilize the exact LM
+algorithm to conduct global bundle adjustment where the formation of the
+reduced camera system (RCS) is actually parallelized and executed in a
+distributed way. To store the large RCS, we compress it with a block-based
+sparse matrix compression format (BSMC), which fully exploits its block
+feature. The BSMC format also enables the distributed storage and updating of
+the global RCS. The proposed method is extensively evaluated and compared with
+the state-of-the-art pipelines using both synthetic and real datasets.
+Preliminary results demonstrate the efficient memory usage and vast scalability
+of the proposed method compared with the baselines. For the first time, we
+conducted parallel bundle adjustment using the LM algorithm on a real dataset
+with 1.18 million images and a synthetic dataset with 10 million images (about
+500 times that of the state-of-the-art LM-based BA) on a distributed computing
+system.
+
+&#13;
+
+ comment: accepted by ICCV2023 +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation: Let's Talk About The Weather ICCV'23 + + +
+ Current self-supervised depth estimation architectures rely on clear and
+sunny weather scenes to train deep neural networks. However, in many locations,
+this assumption is too strong. For example, in the UK in 2021, it rained on 149
+days. For these architectures to be effective in real-world applications, we
+must create models that can generalise to all weather conditions, times of day
+and image qualities. Using a combination of computer graphics and
+generative models, one can augment existing sunny-weather data in a variety of
+ways that simulate adverse weather effects. While it is tempting to use such
+data augmentations for self-supervised depth, in the past this was shown to
+degrade performance instead of improving it. In this paper, we put forward a
+method that uses augmentations to remedy this problem. By exploiting the
+correspondence between unaugmented and augmented data, we introduce a
+pseudo-supervised loss for both depth and pose estimation. This brings back
+some of the benefits of supervised learning while still not requiring any
+labels. We also make a series of practical recommendations which collectively
+offer a reliable, efficient framework for weather-related augmentation of
+self-supervised depth from monocular video. We present extensive testing to
+show that our method, Robust-Depth, achieves SotA performance on the KITTI
+dataset while significantly surpassing SotA on challenging, adverse condition
+data such as DrivingStereo, Foggy CityScape and NuScenes-Night. The project
+website can be found at https://kieran514.github.io/Robust-Depth-Project/.
+
+&#13;
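A pseudo-supervised loss of the kind mentioned above can be sketched very simply: since an augmented (e.g. simulated rain or fog) frame depicts the same geometry as its clean counterpart, the prediction on the clean image can act as a pseudo label for the prediction on the augmented one. The function below is a minimal, generic illustration (L1 in log-depth space), not the paper's exact formulation.

```python
import torch

def pseudo_supervised_depth_loss(depth_clean, depth_aug, eps=1e-6):
    """depth_clean: prediction on the unaugmented frame (treated as pseudo label);
    depth_aug: prediction on the weather-augmented frame. Both (B, 1, H, W)."""
    log_clean = torch.log(depth_clean.detach().clamp(min=eps))  # stop-gradient on the pseudo label
    log_aug = torch.log(depth_aug.clamp(min=eps))
    return (log_aug - log_clean).abs().mean()
```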
+
+ comment: ICCV'23 +
+
+
+
+
+ + ☆ Box-DETR: Understanding and Boxing Conditional Spatial Queries + + +
+ Conditional spatial queries are recently introduced into DEtection +TRansformer (DETR) to accelerate convergence. In DAB-DETR, such queries are +modulated by the so-called conditional linear projection at each decoder stage, +aiming to search for positions of interest such as the four extremities of the +box. Each decoder stage progressively updates the box by predicting the anchor +box offsets, while in cross-attention only the box center is informed as the +reference point. The use of only box center, however, leaves the width and +height of the previous box unknown to the current stage, which hinders accurate +prediction of offsets. We argue that the explicit use of the entire box +information in cross-attention matters. In this work, we propose Box Agent to +condense the box into head-specific agent points. By replacing the box center +with the agent point as the reference point in each head, the conditional +cross-attention can search for positions from a more reasonable starting point +by considering the full scope of the previous box, rather than always from the +previous box center. This significantly reduces the burden of the conditional +linear projection. Experimental results show that the box agent leads to not +only faster convergence but also improved detection performance, e.g., our +single-scale model achieves $44.2$ AP with ResNet-50 based on DAB-DETR. Our Box +Agent requires minor modifications to the code and has negligible computational +workload. Code is available at https://github.com/tiny-smart/box-detr. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Neural Modulation Fields for Conditional Cone Beam Neural Tomography + + +
+ Conventional Computed Tomography (CT) methods require large numbers of +noise-free projections for accurate density reconstructions, limiting their +applicability to the more complex class of Cone Beam Geometry CT (CBCT) +reconstruction. Recently, deep learning methods have been proposed to overcome +these limitations, with methods based on neural fields (NF) showing strong +performance, by approximating the reconstructed density through a +continuous-in-space coordinate based neural network. Our focus is on improving +such methods, however, unlike previous work, which requires training an NF from +scratch for each new set of projections, we instead propose to leverage +anatomical consistencies over different scans by training a single conditional +NF on a dataset of projections. We propose a novel conditioning method where +local modulations are modeled per patient as a field over the input domain +through a Neural Modulation Field (NMF). The resulting Conditional Cone Beam +Neural Tomography (CondCBNT) shows improved performance for both high and low +numbers of available projections on noise-free and noisy data. + +
+
+
+
+
+ + ☆ Adaptive Local Basis Functions for Shape Completion SIGGRAPH 2023 + + +
+ In this paper, we focus on the task of 3D shape completion from partial point +clouds using deep implicit functions. Existing methods seek to use voxelized +basis functions or the ones from a certain family of functions (e.g., +Gaussians), which leads to high computational costs or limited shape +expressivity. On the contrary, our method employs adaptive local basis +functions, which are learned end-to-end and not restricted in certain forms. +Based on those basis functions, a local-to-local shape completion framework is +presented. Our algorithm learns sparse parameterization with a small number of +basis functions while preserving local geometric details during completion. +Quantitative and qualitative experiments demonstrate that our method +outperforms the state-of-the-art methods in shape completion, detail +preservation, generalization to unseen geometries, and computational cost. Code +and data are at https://github.com/yinghdb/Adaptive-Local-Basis-Functions. + +
+
+ comment: In SIGGRAPH 2023 +
+
+
+
+
+ + ☆ M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models + and Latent Space Geometry Optimization + + +
+ Medical vision-language models enable co-learning and integrating features +from medical imaging and clinical text. However, these models are not easy to +train and the latent representation space can be complex. Here we propose a +novel way for pre-training and regularising medical vision-language models. The +proposed method, named Medical vision-language pre-training with Frozen +language models and Latent spAce Geometry optimization (M-FLAG), leverages a +frozen language model for training stability and efficiency and introduces a +novel orthogonality loss to harmonize the latent space geometry. We demonstrate +the potential of the pre-trained model on three downstream tasks: medical image +classification, segmentation, and object detection. Extensive experiments +across five public datasets demonstrate that M-FLAG significantly outperforms +existing medical vision-language pre-training approaches and reduces the number +of parameters by 78\%. Notably, M-FLAG achieves outstanding performance on the +segmentation task while using only 1\% of the RSNA dataset, even outperforming +ImageNet pre-trained models that have been fine-tuned using 100\% of the data. + +
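The sketch below shows one common way to write an orthogonality-style regularizer on a batch of latent vectors, pushing the normalized feature Gram matrix towards the identity so that latent dimensions do not collapse onto each other. It is a generic illustration of the idea named in the abstract; the paper's exact loss may differ.

```python
import torch
import torch.nn.functional as F

def orthogonality_loss(z):
    """z: (B, D) batch of latent vectors. Penalize deviation of the (D, D)
    feature Gram matrix from the identity (illustrative regularizer)."""
    z = F.normalize(z, dim=1)
    gram = z.t() @ z / z.shape[0]
    eye = torch.eye(z.shape[1], device=z.device, dtype=z.dtype)
    return ((gram - eye) ** 2).sum()
```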
+
+
+
+
+ + ☆ Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection SC 2023 + + +
+ Accurate and robust object detection is critical for autonomous driving. +Image-based detectors face difficulties caused by low visibility in adverse +weather conditions. Thus, radar-camera fusion is of particular interest but +presents challenges in optimally fusing heterogeneous data sources. To approach +this issue, we propose two new radar preprocessing techniques to better align +radar and camera data. In addition, we introduce a Multi-Task Cross-Modality +Attention-Fusion Network (MCAF-Net) for object detection, which includes two +new fusion blocks. These allow for exploiting information from the feature maps +more comprehensively. The proposed algorithm jointly detects objects and +segments free space, which guides the model to focus on the more relevant part +of the scene, namely, the occupied space. Our approach outperforms current +state-of-the-art radar-camera fusion-based object detectors in the nuScenes +dataset and achieves more robust results in adverse weather conditions and +nighttime scenarios. + +
+
+ comment: Accepted by ITSC 2023 +
+
+
+
+
+ + ☆ Soft Curriculum for Learning Conditional GANs with Noisy-Labeled and + Uncurated Unlabeled Data + + +
+ Label-noise or curated unlabeled data is used to compensate for the +assumption of clean labeled data in training the conditional generative +adversarial network; however, satisfying such an extended assumption is +occasionally laborious or impractical. As a step towards generative modeling +accessible to everyone, we introduce a novel conditional image generation +framework that accepts noisy-labeled and uncurated unlabeled data during +training: (i) closed-set and open-set label noise in labeled data and (ii) +closed-set and open-set unlabeled data. To combat it, we propose soft +curriculum learning, which assigns instance-wise weights for adversarial +training while assigning new labels for unlabeled data and correcting wrong +labels for labeled data. Unlike popular curriculum learning, which uses a +threshold to pick the training samples, our soft curriculum controls the effect +of each training instance by using the weights predicted by the auxiliary +classifier, resulting in the preservation of useful samples while ignoring +harmful ones. Our experiments show that our approach outperforms existing +semi-supervised and label-noise robust methods in terms of both quantitative +and qualitative performance. In particular, the proposed approach is able to +match the performance of (semi-) supervised GANs even with less than half the +labeled data. + +
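For intuition on the instance-wise weighting described here, the snippet below sketches a weighted discriminator loss in which each (possibly noisy or unlabeled) real sample contributes according to a soft weight in [0, 1], e.g. a confidence produced by an auxiliary classifier. This is a minimal illustration of the mechanism, with assumed shapes, not the paper's full soft-curriculum procedure.

```python
import torch
import torch.nn.functional as F

def weighted_discriminator_loss(d_real_logits, d_fake_logits, real_weights):
    """d_real_logits, d_fake_logits: (B,) discriminator outputs;
    real_weights: (B,) per-instance soft weights for the real/unlabeled samples."""
    loss_real = F.binary_cross_entropy_with_logits(
        d_real_logits, torch.ones_like(d_real_logits), reduction="none")
    loss_fake = F.binary_cross_entropy_with_logits(
        d_fake_logits, torch.zeros_like(d_fake_logits))
    # Harmful (low-weight) instances contribute little; useful ones are preserved.
    return (real_weights * loss_real).mean() + loss_fake
```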
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ Airway Label Prediction in Video Bronchoscopy: Capturing Temporal + Dependencies Utilizing Anatomical Knowledge + + +
+ Purpose: Navigation guidance is a key requirement for a multitude of lung
+interventions using video bronchoscopy. State-of-the-art solutions focus on
+lung biopsies using electromagnetic tracking and intraoperative image
+registration w.r.t. preoperative CT scans for guidance. The requirement of
+patient-specific CT scans hampers the utilisation of navigation guidance for
+other applications such as intensive care units.
+ Methods: This paper addresses navigation guidance solely incorporating
+bronchoscopy video data. In contrast to state-of-the-art approaches, we
+entirely omit the use of electromagnetic tracking and patient-specific CT
+scans. Guidance is enabled by means of topological bronchoscope localization
+w.r.t. an interpatient airway model. Particularly, we take maximum advantage of
+the anatomical constraint that airway trees are traversed sequentially. This is
+realized by incorporating sequences of CNN-based airway likelihoods into a
+Hidden Markov Model.
+ Results: Our approach is evaluated based on multiple experiments inside a
+lung phantom model. With the consideration of temporal context and use of
+anatomical knowledge for regularization, we are able to improve the accuracy up
+to 0.98 compared to 0.81 (weighted F1: 0.98 compared to 0.81) for a
+classification based on individual frames.
+ Conclusion: We combine CNN-based single image classification of airway
+segments with anatomical constraints and temporal HMM-based inference for the
+first time. Our approach makes vision-only guidance for bronchoscopy
+interventions possible in the absence of electromagnetic tracking and
+patient-specific CT scans.
+
+&#13;
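The temporal inference step described here (per-frame CNN likelihoods regularized by an airway-topology-aware HMM) maps naturally onto standard Viterbi decoding. Below is a minimal, generic sketch assuming per-frame log-likelihoods and a log transition matrix built from airway-segment adjacency; the paper's actual model details may differ.

```python
import numpy as np

def viterbi(log_lik, log_trans, log_prior):
    """log_lik: (T, S) per-frame log-likelihoods over S airway segments;
    log_trans: (S, S) log transition probabilities (encoding which segments are
    anatomically adjacent); log_prior: (S,) initial log probabilities.
    Returns the most likely segment sequence of length T."""
    T, S = log_lik.shape
    delta = log_prior + log_lik[0]
    back = np.zeros((T, S), dtype=int)
    for t in range(1, T):
        scores = delta[:, None] + log_trans      # score of every (prev, cur) pair
        back[t] = scores.argmax(axis=0)          # best predecessor per current state
        delta = scores.max(axis=0) + log_lik[t]
    path = [int(delta.argmax())]
    for t in range(T - 1, 0, -1):                # backtrack
        path.append(int(back[t, path[-1]]))
    return path[::-1]
```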
+
+ comment: Submitted to International Journal of Computer Assisted Radiology and + Surgery +
+
+
+
+
+ + ☆ AltFreezing for More General Video Face Forgery Detection CVPR 2023 + + +
+ Existing face forgery detection models try to discriminate fake images by
+detecting only spatial artifacts (e.g., generative artifacts, blending) or
+mainly temporal artifacts (e.g., flickering, discontinuity). They may
+experience significant performance degradation when facing out-of-domain
+artifacts. In this paper, we propose to capture both spatial and temporal
+artifacts in one model for face forgery detection. A simple idea is to leverage
+a spatiotemporal model (3D ConvNet). However, we find that it may easily rely
+on one type of artifact and ignore the other. To address this issue, we present
+a novel training strategy called AltFreezing for more general face forgery
+detection. AltFreezing aims to encourage the model to detect both spatial
+and temporal artifacts. It divides the weights of a spatiotemporal network into
+two groups: spatial-related and temporal-related. Then the two groups of
+weights are alternately frozen during the training process so that the model
+can learn spatial and temporal features to distinguish real from fake videos.
+Furthermore, we introduce various video-level data augmentation methods to
+improve the generalization capability of the forgery detection model. Extensive
+experiments show that our framework outperforms existing methods in terms of
+generalization to unseen manipulations and datasets. Code is available at
+https://github.com/ZhendongWang6/AltFreezing.
+
+&#13;
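A rough sketch of the alternating-freezing mechanism is shown below: Conv3d weights are heuristically split into spatial-related and temporal-related groups, and the two groups are alternately frozen across iterations. The grouping heuristic (kernel shape), function names, and the freezing period are assumptions for illustration, not the authors' released code.

```python
import torch.nn as nn

def split_spatiotemporal_params(model: nn.Module):
    """Heuristic split (assumption): Conv3d kernels of shape (t, 1, 1) with t > 1
    are treated as temporal-related, all other Conv3d kernels as spatial-related."""
    spatial, temporal = [], []
    for m in model.modules():
        if isinstance(m, nn.Conv3d):
            t, h, w = m.kernel_size
            target = temporal if (t > 1 and h == 1 and w == 1) else spatial
            target.extend(m.parameters(recurse=False))
    return spatial, temporal

def alt_freeze(spatial, temporal, iteration, period=1):
    """Alternately freeze one weight group every `period` iterations so that the
    other group is forced to pick up the corresponding artifact type."""
    train_spatial = (iteration // period) % 2 == 0
    for p in spatial:
        p.requires_grad_(train_spatial)
    for p in temporal:
        p.requires_grad_(not train_spatial)
```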
+
+ comment: Accepted by CVPR 2023 Highlight, code and models are available at
+ https://github.com/ZhendongWang6/AltFreezing
+&#13;
+
+
+
+
+ + ☆ Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for + Visible-Infrared Person Re-Identification + + +
+ Visible-Infrared person Re-IDentification (VI-ReID) is a challenging
+cross-modality image retrieval task that aims to match pedestrians' images
+across visible and infrared cameras. To solve the modality gap, existing
+mainstream methods adopt a learning paradigm converting the image retrieval
+task into an image classification task with cross-entropy loss and auxiliary
+metric learning losses. These losses follow the strategy of adjusting the
+distribution of extracted embeddings to reduce the intra-class distance and
+increase the inter-class distance. However, such objectives do not precisely
+correspond to the final test setting of the retrieval task, resulting in a new
+gap at the optimization level. By rethinking these key aspects of VI-ReID, we
+propose a simple and effective method, the Multi-level Cross-modality Joint
+Alignment (MCJA), bridging both the modality-level and objective-level gaps.
+For the former, we design the Modality Alignment Augmentation, which consists
+of three novel strategies, the weighted grayscale, cross-channel cutmix, and
+spectrum jitter augmentation, effectively reducing modality discrepancy in the
+image space. For the latter, we introduce a new Cross-Modality Retrieval loss.
+It is the first work to impose constraints from the perspective of the ranking
+list, aligning with the goal of the testing stage. Moreover, based on the
+global feature only, our method exhibits good performance and can serve as a
+strong baseline method for the VI-ReID community.
+
+&#13;
+
+ comment: 10 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ A Novel Multi-Task Model Imitating Dermatologists for Accurate + Differential Diagnosis of Skin Diseases in Clinical Images MICCAI 2023 + + +
+ Skin diseases are among the most prevalent health issues, and accurate +computer-aided diagnosis methods are of importance for both dermatologists and +patients. However, most of the existing methods overlook the essential domain +knowledge required for skin disease diagnosis. A novel multi-task model, namely +DermImitFormer, is proposed to fill this gap by imitating dermatologists' +diagnostic procedures and strategies. Through multi-task learning, the model +simultaneously predicts body parts and lesion attributes in addition to the +disease itself, enhancing diagnosis accuracy and improving diagnosis +interpretability. The designed lesion selection module mimics dermatologists' +zoom-in action, effectively highlighting the local lesion features from noisy +backgrounds. Additionally, the presented cross-interaction module explicitly +models the complicated diagnostic reasoning between body parts, lesion +attributes, and diseases. To provide a more robust evaluation of the proposed +method, a large-scale clinical image dataset of skin diseases with +significantly more cases than existing datasets has been established. Extensive +experiments on three different datasets consistently demonstrate the +state-of-the-art recognition performance of the proposed approach. + +
+
+ comment: MICCAI 2023 early accept +
+
+
+
+
+ + ☆ ShiftNAS: Improving One-shot NAS via Probability Shift + + +
+ One-shot neural architecture search (One-shot NAS) has been proposed as a
+time-efficient approach to obtain optimal subnet architectures and weights
+under different complexity cases by training only once. However, the subnet
+performance obtained by weight sharing is often inferior to the performance
+achieved by retraining. In this paper, we investigate the performance gap and
+attribute it to the use of uniform sampling, which is a common approach in
+supernet training. Uniform sampling concentrates training resources on subnets
+with intermediate computational resources, which are sampled with high
+probability. However, subnets in different complexity regions require
+different training strategies for optimal performance. To address the
+problem of uniform sampling, we propose ShiftNAS, a method that can adjust the
+sampling probability based on the complexity of subnets. We achieve this by
+evaluating the performance variation of subnets with different complexity and
+designing an architecture generator that can accurately and efficiently provide
+subnets with the desired complexity. Both the sampling probability and the
+architecture generator can be trained end-to-end in a gradient-based manner.
+With ShiftNAS, we can directly obtain the optimal model architecture and
+parameters for a given computational complexity. We evaluate our approach on
+multiple visual network models, including convolutional neural networks (CNNs)
+and vision transformers (ViTs), and demonstrate that ShiftNAS is
+model-agnostic. Experimental results on ImageNet show that ShiftNAS can improve
+the performance of one-shot NAS without additional resource consumption. Source
+codes are available at https://github.com/bestfleer/ShiftNAS.
+
+&#13;
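To illustrate what "shifting the sampling probability based on subnet complexity" could look like in the simplest terms, the snippet below biases sampling towards complexity bins where the supernet-vs-retraining performance gap is observed to be large. This is an illustrative rule under assumed inputs, not the paper's exact (gradient-trained) sampling mechanism.

```python
import numpy as np

def complexity_aware_sampling_probs(perf_gap, temperature=1.0):
    """perf_gap: per-complexity-bin estimate of how under-optimized subnets are
    (e.g. retrained accuracy minus weight-sharing accuracy). Returns a sampling
    distribution that favors under-optimized complexity regions (illustrative)."""
    logits = np.asarray(perf_gap, dtype=float) / temperature
    p = np.exp(logits - logits.max())   # numerically stable softmax
    return p / p.sum()

# Example: bins with a larger gap are sampled more often.
# probs = complexity_aware_sampling_probs([0.5, 2.0, 1.0, 0.2])
# chosen_bin = np.random.choice(len(probs), p=probs)
```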
+
+ comment: Accepted by ICCV 2023
+&#13;
+
+
+
+
+ + ☆ Going Beyond Linear Mode Connectivity: The Layerwise Linear Feature + Connectivity + + +
+ Recent work has revealed many intriguing empirical phenomena in neural +network training, despite the poorly understood and highly complex loss +landscapes and training dynamics. One of these phenomena, Linear Mode +Connectivity (LMC), has gained considerable attention due to the intriguing +observation that different solutions can be connected by a linear path in the +parameter space while maintaining near-constant training and test losses. In +this work, we introduce a stronger notion of linear connectivity, Layerwise +Linear Feature Connectivity (LLFC), which says that the feature maps of every +layer in different trained networks are also linearly connected. We provide +comprehensive empirical evidence for LLFC across a wide range of settings, +demonstrating that whenever two trained networks satisfy LMC (via either +spawning or permutation methods), they also satisfy LLFC in nearly all the +layers. Furthermore, we delve deeper into the underlying factors contributing +to LLFC, which reveal new insights into the spawning and permutation +approaches. The study of LLFC transcends and advances our understanding of LMC +by adopting a feature-learning perspective. + +
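A rough way to probe the LLFC property empirically is sketched below: build a weight-interpolated network from two trained endpoints and compare its features against the linear interpolation of the endpoints' features (LLFC is stated per layer and up to a positive scaling, hence cosine similarity). This is a minimal illustration at the output layer only, under the assumption that both models share the same architecture; it is not the authors' evaluation code.

```python
import copy
import torch
import torch.nn.functional as F

@torch.no_grad()
def llfc_cosine(model_a, model_b, x, alpha=0.5):
    """Cosine similarity between features of the alpha-interpolated network and
    the alpha-interpolation of the two endpoint networks' features."""
    sd_a, sd_b = model_a.state_dict(), model_b.state_dict()
    mixed = copy.deepcopy(model_a)
    mixed.load_state_dict({
        k: ((1 - alpha) * sd_a[k] + alpha * sd_b[k]) if sd_a[k].is_floating_point()
        else sd_a[k]                      # leave integer buffers (e.g. BN counters) alone
        for k in sd_a
    })
    for m in (model_a, model_b, mixed):
        m.eval()
    f_mix = mixed(x)
    f_lin = (1 - alpha) * model_a(x) + alpha * model_b(x)
    return F.cosine_similarity(f_mix.flatten(1), f_lin.flatten(1), dim=1).mean()
```

Values close to 1 over a range of alpha would be consistent with LLFC holding for that pair of networks.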
+
+ comment: 25 pages, 23 figures +
+
+
+
+
+ + ☆ Combiner and HyperCombiner Networks: Rules to Combine Multimodality MR + Images for Prostate Cancer Localisation + + +
+ One of the distinct characteristics in radiologists' reading of +multiparametric prostate MR scans, using reporting systems such as PI-RADS +v2.1, is to score individual types of MR modalities, T2-weighted, +diffusion-weighted, and dynamic contrast-enhanced, and then combine these +image-modality-specific scores using standardised decision rules to predict the +likelihood of clinically significant cancer. This work aims to demonstrate that +it is feasible for low-dimensional parametric models to model such decision +rules in the proposed Combiner networks, without compromising the accuracy of +predicting radiologic labels: First, it is shown that either a linear mixture +model or a nonlinear stacking model is sufficient to model PI-RADS decision +rules for localising prostate cancer. Second, parameters of these (generalised) +linear models are proposed as hyperparameters, to weigh multiple networks that +independently represent individual image modalities in the Combiner network +training, as opposed to end-to-end modality ensemble. A HyperCombiner network +is developed to train a single image segmentation network that can be +conditioned on these hyperparameters during inference, for much improved +efficiency. Experimental results based on data from 850 patients, for the +application of automating radiologist labelling multi-parametric MR, compare +the proposed combiner networks with other commonly-adopted end-to-end networks. +Using the added advantages of obtaining and interpreting the modality combining +rules, in terms of the linear weights or odds-ratios on individual image +modalities, three clinical applications are presented for prostate cancer +segmentation, including modality availability assessment, importance +quantification and rule discovery. + +
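As a concrete picture of a low-dimensional combining rule of the kind discussed above, the snippet below linearly mixes per-modality likelihood maps with scalar weights; in the described setting such weights would play the role of the hyperparameters on which the conditioned network depends at inference. Shapes, names, and the clamping are illustrative assumptions, not the paper's Combiner or HyperCombiner implementation.

```python
import torch

def linear_combiner(modality_probs, weights, bias=0.0):
    """modality_probs: list of per-modality probability maps (e.g. T2w, DWI, DCE),
    each of shape (B, 1, H, W); weights: one scalar per modality.
    Returns a combined probability map (illustrative linear mixture rule)."""
    stacked = torch.stack(modality_probs, dim=0)                     # (M, B, 1, H, W)
    w = torch.as_tensor(weights, dtype=stacked.dtype).view(-1, 1, 1, 1, 1)
    return ((w * stacked).sum(dim=0) + bias).clamp(0.0, 1.0)
```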
+
+ comment: 48 pages, 6 figures +
+
+
+
+
+ + ☆ Adversarial Attacks on Traffic Sign Recognition: A Survey CEC + + +
+ Traffic sign recognition is an essential component of perception in +autonomous vehicles, which is currently performed almost exclusively with deep +neural networks (DNNs). However, DNNs are known to be vulnerable to adversarial +attacks. Several previous works have demonstrated the feasibility of +adversarial attacks on traffic sign recognition models. Traffic signs are +particularly promising for adversarial attack research due to the ease of +performing real-world attacks using printed signs or stickers. In this work, we +survey existing works performing either digital or real-world attacks on +traffic sign detection and classification models. We provide an overview of the +latest advancements and highlight the existing research areas that require +further investigation. + +
+
+ comment: Accepted for publication at ICECCME2023 +
+
+
+
+
+ + ☆ Liver Tumor Screening and Diagnosis in CT with Pixel-Lesion-Patient + Network MICCAI 2023 + + +
+ Liver tumor segmentation and classification are important tasks in computer +aided diagnosis. We aim to address three problems: liver tumor screening and +preliminary diagnosis in non-contrast computed tomography (CT), and +differential diagnosis in dynamic contrast-enhanced CT. A novel framework named +Pixel-Lesion-pAtient Network (PLAN) is proposed. It uses a mask transformer to +jointly segment and classify each lesion with improved anchor queries and a +foreground-enhanced sampling loss. It also has an image-wise classifier to +effectively aggregate global information and predict patient-level diagnosis. A +large-scale multi-phase dataset is collected containing 939 tumor patients and +810 normal subjects. 4010 tumor instances of eight types are extensively +annotated. On the non-contrast tumor screening task, PLAN achieves 95% and 96% +in patient-level sensitivity and specificity. On contrast-enhanced CT, our +lesion-level detection precision, recall, and classification accuracy are 92%, +89%, and 86%, outperforming widely used CNN and transformers for lesion +segmentation. We also conduct a reader study on a holdout set of 250 cases. +PLAN is on par with a senior human radiologist, showing the clinical +significance of our results. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ☆ Extreme Image Compression using Fine-tuned VQGAN Models + + +
+ Recent advances in generative compression methods have demonstrated +remarkable progress in enhancing the perceptual quality of compressed data, +especially in scenarios with low bitrates. Nevertheless, their efficacy and +applicability in achieving extreme compression ratios ($<0.1$ bpp) still remain +constrained. In this work, we propose a simple yet effective coding framework +by introducing vector quantization (VQ)-based generative models into the image +compression domain. The main insight is that the codebook learned by the VQGAN +model yields strong expressive capacity, facilitating efficient compression of +continuous information in the latent space while maintaining reconstruction +quality. Specifically, an image can be represented as VQ-indices by finding the +nearest codeword, which can be encoded using lossless compression methods into +bitstreams. We then propose clustering a pre-trained large-scale codebook into +smaller codebooks using the K-means algorithm. This enables images to be +represented as diverse ranges of VQ-indices maps, resulting in variable +bitrates and different levels of reconstruction quality. Extensive qualitative +and quantitative experiments on various datasets demonstrate that the proposed +framework outperforms the state-of-the-art codecs in terms of perceptual +quality-oriented metrics and human perception under extremely low bitrates. + +
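The two core operations described here, mapping latents to nearest-codeword indices and shrinking a pretrained codebook with k-means to trade quality for bitrate, can be sketched as follows. This is a minimal stand-alone illustration with assumed shapes; the actual pipeline additionally entropy-codes the index maps and uses the VQGAN decoder for reconstruction.

```python
import torch

def quantize_to_indices(z, codebook):
    """z: (N, D) continuous latents; codebook: (K, D) codewords.
    Returns the (N,) indices of the nearest codewords, i.e. what gets
    losslessly compressed into the bitstream."""
    return torch.cdist(z, codebook).argmin(dim=1)

def shrink_codebook(codebook, k_small, iters=25):
    """Cluster a large pretrained codebook into k_small centers with plain
    k-means, yielding a coarser codebook and hence a lower bitrate (sketch)."""
    centers = codebook[torch.randperm(len(codebook))[:k_small]].clone()
    for _ in range(iters):
        assign = torch.cdist(codebook, centers).argmin(dim=1)
        for j in range(k_small):
            members = codebook[assign == j]
            if len(members) > 0:          # keep the old center for empty clusters
                centers[j] = members.mean(dim=0)
    return centers
```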
+
+ comment: Generative Compression, Extreme Compression, VQGANs, Low Bitrate +
+
+
+
+
+ + ☆ Hierarchical Spatiotemporal Transformers for Video Object Segmentation + + +
+ This paper presents a novel framework called HST for semi-supervised video +object segmentation (VOS). HST extracts image and video features using the +latest Swin Transformer and Video Swin Transformer to inherit their inductive +bias for the spatiotemporal locality, which is essential for temporally +coherent VOS. To take full advantage of the image and video features, HST casts +image and video features as a query and memory, respectively. By applying +efficient memory read operations at multiple scales, HST produces hierarchical +features for the precise reconstruction of object masks. HST shows +effectiveness and robustness in handling challenging scenarios with occluded +and fast-moving objects under cluttered backgrounds. In particular, HST-B +outperforms the state-of-the-art competitors on multiple popular benchmarks, +i.e., YouTube-VOS (85.0%), DAVIS 2017 (85.9%), and DAVIS 2016 (94.0%). + +
+
+
+
+
+ + ☆ Large-Scale Person Detection and Localization using Overhead Fisheye + Cameras ICCV 2023 + + +
+ Location determination finds wide applications in daily life. Unlike existing
+efforts devoted to localizing tourist photos captured by perspective cameras,
+in this article, we focus on devising person positioning solutions using
+overhead fisheye cameras. Such solutions are advantageous in their large field
+of view (FOV), low cost, robustness to occlusion, and unobtrusive working mode
+(persons do not need to carry cameras). However, related studies are quite
+scarce, due to the paucity of data. To stimulate research in this exciting
+area, we present LOAF, the first large-scale overhead fisheye dataset for
+person detection and localization. LOAF is built with many essential features,
+e.g., i) the data cover abundant diversities in scenes, human pose, density,
+and location; ii) it currently contains the largest number of annotated
+pedestrians, i.e., 457K bounding boxes with groundtruth location information;
+iii) the body-boxes are labeled as radius-aligned so as to fully address the
+positioning challenge. To approach localization, we build a fisheye person
+detection network, which exploits the fisheye distortions by a
+rotation-equivariant training strategy and predicts radius-aligned human boxes
+end-to-end. Then, the actual locations of the detected persons are calculated
+by a numerical solution on the fisheye model and camera altitude data.
+Extensive experiments on LOAF validate the superiority of our fisheye detector
+w.r.t. previous methods, and show that our whole fisheye positioning solution
+is able to locate all persons in the FOV with an accuracy of 0.5 m, within 0.1 s.
+
+&#13;
+
+ comment: ICCV 2023. Project page: https://LOAFisheye.github.io +
+
+
+
+
+ + ☆ Random Boxes Are Open-world Object Detectors ICCV 2023 + + +
+ We show that classifiers trained with random region proposals achieve +state-of-the-art Open-world Object Detection (OWOD): they can not only maintain +the accuracy of the known objects (w/ training labels), but also considerably +improve the recall of unknown ones (w/o training labels). Specifically, we +propose RandBox, a Fast R-CNN based architecture trained on random proposals at +each training iteration, surpassing existing Faster R-CNN and Transformer based +OWOD. Its effectiveness stems from the following two benefits introduced by +randomness. First, as the randomization is independent of the distribution of +the limited known objects, the random proposals become the instrumental +variable that prevents the training from being confounded by the known objects. +Second, the unbiased training encourages more proposal explorations by using +our proposed matching score that does not penalize the random proposals whose +prediction scores do not match the known objects. On two benchmarks: +Pascal-VOC/MS-COCO and LVIS, RandBox significantly outperforms the previous +state-of-the-art in all metrics. We also detail the ablations on randomization +and loss designs. Codes are available at https://github.com/scuwyh2000/RandBox. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Uncertainty-aware State Space Transformer for Egocentric 3D Hand + Trajectory Forecasting ICCV 2023 + + +
+ Hand trajectory forecasting from egocentric views is crucial for enabling a +prompt understanding of human intentions when interacting with AR/VR systems. +However, existing methods handle this problem in a 2D image space which is +inadequate for 3D real-world applications. In this paper, we set up an +egocentric 3D hand trajectory forecasting task that aims to predict hand +trajectories in a 3D space from early observed RGB videos in a first-person +view. To fulfill this goal, we propose an uncertainty-aware state space +Transformer (USST) that takes the merits of the attention mechanism and +aleatoric uncertainty within the framework of the classical state-space model. +The model can be further enhanced by the velocity constraint and visual prompt +tuning (VPT) on large vision transformers. Moreover, we develop an annotation +workflow to collect 3D hand trajectories with high quality. Experimental +results on H2O and EgoPAT3D datasets demonstrate the superiority of USST for +both 2D and 3D trajectory forecasting. The code and datasets are publicly +released: https://github.com/Cogito2012/USST. + +
+
+ comment: ICCV 2023 Accepted +
+
+
+
+
+ + ☆ Unified Open-Vocabulary Dense Visual Prediction + + +
+ In recent years, open-vocabulary (OV) dense visual prediction (such as OV
+object detection, semantic, instance and panoptic segmentation) has attracted
+increasing research attention. However, most existing approaches are
+task-specific and individually tackle each task. In this paper, we propose a
+Unified Open-Vocabulary Network (UOVN) to jointly address four common dense
+prediction tasks. Compared with separate models, a unified network is more
+desirable for diverse industrial applications. Moreover, OV dense prediction
+training data is relatively scarce. Separate networks can only leverage
+task-relevant training data, while a unified approach can integrate diverse
+training data to boost individual tasks. We address two major challenges in
+unified OV prediction. Firstly, unlike unified methods for fixed-set
+predictions, OV networks are usually trained with multi-modal data. Therefore,
+we propose a multi-modal, multi-scale and multi-task (MMM) decoding mechanism
+to better leverage multi-modal data. Secondly, because UOVN uses data from
+different tasks for training, there are significant domain and task gaps. We
+present a UOVN training mechanism to reduce such gaps. Experiments on four
+datasets demonstrate the effectiveness of our UOVN.
+
+&#13;
+
+
+
+
+ + ☆ ROFusion: Efficient Object Detection using Hybrid Point-wise + Radar-Optical Fusion + + +
+ Radars, due to their robustness to adverse weather conditions and ability to
+measure object motions, have served in autonomous driving and intelligent
+agents for years. However, Radar-based perception suffers from its unintuitive
+sensing data, which lacks the semantic and structural information of scenes. To
+tackle this problem, camera and Radar sensor fusion has been investigated as a
+trending strategy with low cost, high reliability and ease of maintenance.
+While most recent works explore how to fuse Radar point clouds and images, the
+rich contextual information within Radar observations is discarded. In this
+paper, we propose a hybrid point-wise Radar-Optical fusion approach for object
+detection in autonomous driving scenarios. The framework benefits from dense
+contextual information from both the range-doppler spectrum and images, which
+are integrated to learn a multi-modal feature representation. Furthermore, we
+propose a novel local coordinate formulation, tackling the object detection
+task in an object-centric coordinate frame. Extensive results show that with
+the information gained from optical images, we could achieve leading
+performance in object detection (97.69\% recall) compared to the recent
+state-of-the-art method FFT-RadNet (82.86\% recall). Ablation studies verify
+the key design choices and practicability of our approach given imperfect,
+machine-generated detections. The code will be available at
+https://github.com/LiuLiu-55/ROFusion.
+
+&#13;
+
+
+
+
+ + ☆ Video Frame Interpolation with Stereo Event and Intensity Camera + + +
+ The stereo event-intensity camera setup is widely applied to leverage the
+advantages of both event cameras with low latency and intensity cameras that
+capture accurate brightness and texture information. However, such a setup
+commonly encounters cross-modality parallax that is difficult to eliminate
+solely with stereo rectification, especially for real-world scenes with complex
+motions and varying depths, causing artifacts and distortion for existing
+Event-based Video Frame Interpolation (E-VFI) approaches. To tackle this
+problem, we propose a novel Stereo Event-based VFI (SE-VFI) network (SEVFI-Net)
+to generate high-quality intermediate frames and corresponding disparities from
+misaligned inputs consisting of two consecutive keyframes and event streams
+emitted between them. Specifically, we propose a Feature Aggregation Module
+(FAM) to alleviate the parallax and achieve spatial alignment in the feature
+domain. We then exploit the fused features to accomplish accurate optical flow
+and disparity estimation, achieving better interpolation results through both
+flow-based and synthesis-based approaches. We also build a stereo visual
+acquisition system composed of an event camera and an RGB-D camera to collect a
+new Stereo Event-Intensity Dataset (SEID) containing diverse scenes with
+complex motions and varying depths. Experiments on public real-world stereo
+datasets, i.e., DSEC and MVSEC, and our SEID dataset demonstrate that our
+proposed SEVFI-Net outperforms state-of-the-art methods by a large margin.
+
+&#13;
+
+
+
+
+ + ☆ Ada3D : Exploiting the Spatial Redundancy with Adaptive Inference for + Efficient 3D Object Detection ICCV2023 + + +
+ Voxel-based methods have achieved state-of-the-art performance for 3D object +detection in autonomous driving. However, their significant computational and +memory costs pose a challenge for their application to resource-constrained +vehicles. One reason for this high resource consumption is the presence of a +large number of redundant background points in Lidar point clouds, resulting in +spatial redundancy in both 3D voxel and dense BEV map representations. To +address this issue, we propose an adaptive inference framework called Ada3D, +which focuses on exploiting the input-level spatial redundancy. Ada3D +adaptively filters the redundant input, guided by a lightweight importance +predictor and the unique properties of the Lidar point cloud. Additionally, we +utilize the BEV features' intrinsic sparsity by introducing the Sparsity +Preserving Batch Normalization. With Ada3D, we achieve 40% reduction for 3D +voxels and decrease the density of 2D BEV feature maps from 100% to 20% without +sacrificing accuracy. Ada3D reduces the model computational and memory cost by +5x, and achieves 1.52x/1.45x end-to-end GPU latency and 1.5x/4.5x GPU peak +memory optimization for the 3D and 2D backbone respectively. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ☆ Manifold-Guided Sampling in Diffusion Models for Unbiased Image + Generation + + +
+ Diffusion models are a powerful class of generative models that can produce +high-quality images, but they may suffer from data bias. Data bias occurs when +the training data does not reflect the true distribution of the data domain, +but rather exhibits some skewed or imbalanced patterns. For example, the CelebA +dataset contains more female images than male images, which can lead to biased +generation results and affect downstream applications. In this paper, we +propose a novel method to mitigate data bias in diffusion models by applying +manifold guidance. Our key idea is to estimate the manifold of the training +data using a learnable information-theoretic approach, and then use it to guide +the sampling process of diffusion models. In this way, we can encourage the +generated images to be uniformly distributed on the data manifold, without +changing the model architecture or requiring labels or retraining. We provide +theoretical analysis and empirical evidence to show that our method can improve +the quality and unbiasedness of image generation compared to standard diffusion +models. + +
+
+
+
+
+ + ☆ On Point Affiliation in Feature Upsampling NeurIPS 2022 + + +
+ We introduce the notion of point affiliation into feature upsampling. By +abstracting a feature map into non-overlapped semantic clusters formed by +points of identical semantic meaning, feature upsampling can be viewed as point +affiliation -- designating a semantic cluster for each upsampled point. In the +framework of kernel-based dynamic upsampling, we show that an upsampled point +can resort to its low-res decoder neighbors and high-res encoder point to +reason the affiliation, conditioned on the mutual similarity between them. We +therefore present a generic formulation for generating similarity-aware +upsampling kernels and prove that such kernels encourage not only semantic +smoothness but also boundary sharpness. This formulation constitutes a novel, +lightweight, and universal upsampling solution, Similarity-Aware Point +Affiliation (SAPA). We show its working mechanism via our preliminary designs +with window-shape kernel. After probing the limitations of the designs on +object detection, we reveal additional insights for upsampling, leading to SAPA +with the dynamic kernel shape. Extensive experiments demonstrate that SAPA +outperforms prior upsamplers and invites consistent performance improvements on +a number of dense prediction tasks, including semantic segmentation, object +detection, instance segmentation, panoptic segmentation, image matting, and +depth estimation. Code is made available at: https://github.com/tiny-smart/sapa + +
+
+ comment: 17 pages. Extended version of NeurIPS 2022 paper "SAPA: + Similarity-Aware Point Affiliation for Feature Upsampling" at + arXiv:2209.12866v1. arXiv admin note: text overlap with arXiv:2209.12866 +
+
+
+
+
+ + ☆ Zero-Shot Image Harmonization with Generative Model Prior + + +
+ Recent image harmonization methods have demonstrated promising results. +However, due to their heavy reliance on a large number of composite images, +these works are expensive in the training phase and often fail to generalize to +unseen images. In this paper, we draw lessons from human behavior and come up +with a zero-shot image harmonization method. Specifically, in the harmonization +process, a human mainly utilizes his long-term prior on harmonious images and +makes a composite image close to that prior. To imitate that, we resort to +pretrained generative models for the prior of natural images. For the guidance +of the harmonization direction, we propose an Attention-Constraint Text which +is optimized to well illustrate the image environments. Some further designs +are introduced for preserving the foreground content structure. The resulting +framework, highly consistent with human behavior, can achieve harmonious +results without burdensome training. Extensive experiments have demonstrated +the effectiveness of our approach, and we have also explored some interesting +applications. + +
+
+ comment: Code Page: https://github.com/WindVChen/Diff-Harmonization +
+
+
+
+
+ + ☆ Modular Neural Network Approaches for Surgical Image Recognition + + +
+ Deep learning-based applications have seen a lot of success in recent years.
+Text, audio, image, and video have all been explored with great success using
+deep learning approaches. The use of convolutional neural networks (CNN) in
+computer vision, in particular, has yielded reliable results. In order to
+achieve these results, a large amount of data is required. However, such
+datasets are not always accessible. Moreover, annotating data can be difficult
+and time-consuming. Self-training is a semi-supervised approach that alleviates
+this problem and achieves state-of-the-art performance. Theoretical analysis
+even suggests that it can generalize better than a standard classifier. Another
+problem neural networks face is the increasing complexity of modern tasks,
+which incurs high computational and storage costs. One way to mitigate this
+issue is modular learning, a strategy inspired by human cognition. The
+principle of the approach is to decompose a complex problem into simpler
+sub-tasks. This approach has several advantages, including faster learning,
+better generalization, and improved interpretability.
+ In the first part of this paper, we introduce and evaluate different
+architectures of modular learning for Dorsal Capsulo-Scapholunate Septum (DCSS)
+instability classification. Our experiments have shown that modular learning
+improves performance compared to non-modular systems. Moreover, we found that
+the weighted modular variant, which weights the outputs using the probabilities
+from the gating module, achieved almost perfect classification. In the second
+part, we present our approach for data labeling and segmentation with
+self-training applied to shoulder arthroscopy images.
+
+
+
+
+
+ + ☆ LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception + Network for Autonomous Driving + + +
+ LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR +perception has the largest body of literature after camera perception. However, +multi-task learning across tasks like detection, segmentation, and motion +estimation using LiDAR remains relatively unexplored, especially on +automotive-grade embedded platforms. We present a real-time multi-task +convolutional neural network for LiDAR-based object detection, semantics, and +motion segmentation. The unified architecture comprises a shared encoder and +task-specific decoders, enabling joint representation learning. We propose a +novel Semantic Weighting and Guidance (SWAG) module to transfer semantic +features for improved object detection selectively. Our heterogeneous training +scheme combines diverse datasets and exploits complementary cues between tasks. +The work provides the first embedded implementation unifying these key +perception tasks from LiDAR point clouds achieving 3ms latency on the embedded +NVIDIA Xavier platform. We achieve state-of-the-art results for two tasks, +semantic and motion segmentation, and close to state-of-the-art performance for +3D object detection. By maximizing hardware efficiency and leveraging +multi-task synergies, our method delivers an accurate and efficient solution +tailored for real-world automated driving deployment. Qualitative results can +be seen at https://youtu.be/H-hWRzv2lIY. + +
+
+
+
+
+ + ☆ DARTS: Double Attention Reference-based Transformer for Super-resolution + + +
+ We present DARTS, a transformer model for reference-based image
+super-resolution. DARTS learns joint representations of two image distributions
+to enhance the content of low-resolution input images through matching
+correspondences learned from high-resolution reference images. Current
+state-of-the-art techniques in reference-based image super-resolution are based
+on a multi-network, multi-stage architecture. In this work, we adapt the double
+attention block from the GAN literature, processing the two visual streams
+separately and combining self-attention and cross-attention blocks through a
+gating attention strategy. Our work demonstrates how the attention mechanism
+can be adapted for the particular requirements of reference-based image
+super-resolution, significantly simplifying the architecture and training
+pipeline. We show that our transformer-based model performs competitively with
+state-of-the-art models, while maintaining a simpler overall architecture and
+training process. In particular, we obtain state-of-the-art results on the
+SUN80 dataset, with a PSNR/SSIM of 29.83/0.809. These results show that
+attention alone is sufficient for the reference-based super-resolution task,
+without multiple purpose-built subnetworks, knowledge distillation, or
+multi-stage training.
+
+
+
+
+
+ + ☆ Local or Global: Selective Knowledge Assimilation for Federated Learning + with Limited Labels ICCV 2023 + + +
+ Many existing FL methods assume clients with fully-labeled data, while in +realistic settings, clients have limited labels due to the expensive and +laborious process of labeling. Limited labeled local data of the clients often +leads to their local model having poor generalization abilities to their larger +unlabeled local data, such as having class-distribution mismatch with the +unlabeled data. As a result, clients may instead look to benefit from the +global model trained across clients to leverage their unlabeled data, but this +also becomes difficult due to data heterogeneity across clients. In our work, +we propose FedLabel where clients selectively choose the local or global model +to pseudo-label their unlabeled data depending on which is more of an expert of +the data. We further utilize both the local and global models' knowledge via +global-local consistency regularization which minimizes the divergence between +the two models' outputs when they have identical pseudo-labels for the +unlabeled data. Unlike other semi-supervised FL baselines, our method does not +require additional experts other than the local or global model, nor require +additional parameters to be communicated. We also do not assume any +server-labeled data or fully labeled clients. For both cross-device and +cross-silo settings, we show that FedLabel outperforms other semi-supervised FL +baselines by $8$-$24\%$, and even outperforms standard fully supervised FL +baselines ($100\%$ labeled data) with only $5$-$20\%$ of labeled data. + +
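+ A hedged sketch of the selective pseudo-labeling idea described above: pick
+whichever of the local or global model is more confident to pseudo-label each
+unlabeled sample, and add a consistency term when the two models agree. The
+confidence criterion, threshold, and loss weighting below are illustrative
+assumptions, not the paper's exact expert-selection rule.
+
+     import torch
+     import torch.nn.functional as F
+
+     def fedlabel_like_step(local_model, global_model, x_unlabeled, tau=0.8, lam=1.0):
+         with torch.no_grad():
+             p_local = F.softmax(local_model(x_unlabeled), dim=1)
+             p_global = F.softmax(global_model(x_unlabeled), dim=1)
+         conf_l, y_l = p_local.max(dim=1)
+         conf_g, y_g = p_global.max(dim=1)
+         use_local = conf_l >= conf_g                     # choose the more confident "expert"
+         pseudo = torch.where(use_local, y_l, y_g)
+         conf = torch.where(use_local, conf_l, conf_g)
+         keep = conf >= tau                               # confidence threshold
+         logits = local_model(x_unlabeled)
+         loss = F.cross_entropy(logits[keep], pseudo[keep]) if keep.any() else logits.sum() * 0.0
+         agree = y_l == y_g                               # identical pseudo-labels
+         if agree.any():                                  # global-local consistency regularization
+             loss = loss + lam * F.kl_div(F.log_softmax(logits[agree], dim=1),
+                                          p_global[agree], reduction="batchmean")
+         return loss
+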
+
+ comment: To appear in the proceedings of ICCV 2023 +
+
+
+
+
+ + ☆ Harnessing the Power of AI based Image Generation Model DALLE 2 in + Agricultural Settings + + +
+ This study investigates the potential impact of artificial intelligence (AI)
+on the enhancement of visualization processes in the agricultural sector, using
+the advanced AI image generator, DALLE 2, developed by OpenAI. By
+synergistically utilizing the natural language processing proficiency of
+chatGPT and the generative prowess of the DALLE 2 model, which employs a
+Generative Adversarial Network (GAN) framework, our research offers an
+innovative method to transform textual descriptors into realistic visual
+content. Our rigorously assembled datasets include a broad spectrum of
+agricultural elements such as fruits, plants, and scenarios differentiating
+crops from weeds, maintained separately for AI-generated and original images.
+The quality and accuracy of the AI-generated images were evaluated via
+established metrics including mean squared error (MSE), peak signal-to-noise
+ratio (PSNR), and feature similarity index (FSIM). The results underline the
+significant role of the DALLE 2 model in enhancing visualization processes in
+agriculture, aiding in more informed decision-making, and improving resource
+distribution. The outcomes of this research highlight the imminent rise of an
+AI-led transformation in the realm of precision agriculture.
+
+
+ comment: 22 pages, 13 figures, 2 tables +
+
+
+
+
+ + ☆ The FathomNet2023 Competition Dataset + + +
+ Ocean scientists have been collecting visual data to study marine organisms +for decades. These images and videos are extremely valuable both for basic +science and environmental monitoring tasks. There are tools for automatically +processing these data, but none that are capable of handling the extreme +variability in sample populations, image quality, and habitat characteristics +that are common in visual sampling of the ocean. Such distribution shifts can +occur over very short physical distances and in narrow time windows. Creating +models that are able to recognize when an image or video sequence contains a +new organism, an unusual collection of animals, or is otherwise out-of-sample +is critical to fully leverage visual data in the ocean. The FathomNet2023 +competition dataset presents a realistic scenario where the set of animals in +the target data differs from the training data. The challenge is both to +identify the organisms in a target image and assess whether it is +out-of-sample. + +
+
+ comment: Competition was presented as part of the 10th Fine Grained Visual + Categorization workshop at the 2023 Computer Vision and Pattern Recognition + conference. 4 pages, 4 figures +
+
+
+
+
+ + ☆ Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation ICCV 2023 + + +
+ Low-light conditions not only hamper human visual experience but also degrade
+the model's performance on downstream vision tasks. While existing works make
+remarkable progress on day-night domain adaptation, they rely heavily on domain
+knowledge derived from the task-specific nighttime dataset. This paper tackles
+a more complicated scenario with broader applicability, i.e., zero-shot
+day-night domain adaptation, which eliminates reliance on any nighttime data.
+Unlike prior zero-shot adaptation approaches emphasizing either image-level
+translation or model-level adaptation, we propose a similarity min-max paradigm
+that considers them under a unified framework. On the image level, we darken
+images towards minimum feature similarity to enlarge the domain gap. Then on
+the model level, we maximize the feature similarity between the darkened images
+and their normal-light counterparts for better model adaptation. To the best of
+our knowledge, this work represents the pioneering effort in jointly optimizing
+both aspects, resulting in a significant improvement of model generalizability.
+Extensive experiments demonstrate our method's effectiveness and broad
+applicability on various nighttime vision tasks, including classification,
+semantic segmentation, visual place recognition, and video action recognition.
+Code and pre-trained models are available at
+https://red-fairy.github.io/ZeroShotDayNightDA-Webpage/.
+
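+ The min-max structure sketched in the abstract can be written down compactly;
+the toy losses below (cosine similarity is an assumption, as is the sign
+convention) only illustrate that the darkening module is trained to minimize
+feature similarity while the task model is trained to maximize it.
+
+     import torch.nn.functional as F
+
+     def darkening_objective(feat_dark, feat_normal):
+         # image level: train the darkening module so darkened images move AWAY
+         # from their normal-light features (minimize similarity)
+         return F.cosine_similarity(feat_dark, feat_normal, dim=1).mean()
+
+     def adaptation_objective(feat_dark, feat_normal):
+         # model level: train the feature extractor to pull the two views together
+         # (maximize similarity, i.e. minimize the negative similarity)
+         return -F.cosine_similarity(feat_dark, feat_normal, dim=1).mean()
+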
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ UPSCALE: Unconstrained Channel Pruning ICML 2023 + + +
+ As neural networks grow in size and complexity, inference speeds decline. To +combat this, one of the most effective compression techniques -- channel +pruning -- removes channels from weights. However, for multi-branch segments of +a model, channel removal can introduce inference-time memory copies. In turn, +these copies increase inference latency -- so much so that the pruned model can +be slower than the unpruned model. As a workaround, pruners conventionally +constrain certain channels to be pruned together. This fully eliminates memory +copies but, as we show, significantly impairs accuracy. We now have a dilemma: +Remove constraints but increase latency, or add constraints and impair +accuracy. In response, our insight is to reorder channels at export time, (1) +reducing latency by reducing memory copies and (2) improving accuracy by +removing constraints. Using this insight, we design a generic algorithm UPSCALE +to prune models with any pruning pattern. By removing constraints from existing +pruners, we improve ImageNet accuracy for post-training pruned models by 2.1 +points on average -- benefiting DenseNet (+16.9), EfficientNetV2 (+7.9), and +ResNet (+6.2). Furthermore, by reordering channels, UPSCALE improves inference +speeds by up to 2x over a baseline export. + +
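+ A toy, single-branch illustration of the export-time reordering idea: if kept
+channels are permuted to be contiguous, the pruned model can read them with a
+plain slice instead of an index_select (a memory copy). This is only a sketch
+of the intuition, not the UPSCALE algorithm for arbitrary multi-branch
+patterns.
+
+     import torch
+
+     def reorder_for_contiguous_slice(keep_mask, producer_w, consumer_w):
+         # keep_mask: (C,) bool; producer_w: (C, C_in, k, k); consumer_w: (C_out, C, k, k)
+         perm = torch.argsort(keep_mask.to(torch.int), descending=True)  # kept channels first
+         producer_w = producer_w[perm]          # permute output channels of the producing conv
+         consumer_w = consumer_w[:, perm]       # permute input channels of the consuming conv
+         n_keep = int(keep_mask.sum())
+         # after reordering, pruning is a contiguous slice rather than a gather
+         return producer_w[:n_keep], consumer_w[:, :n_keep], perm
+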
+
+ comment: 29 pages, 26 figures, accepted to ICML 2023 +
+
+
+
+
+ + ☆ Video-Mined Task Graphs for Keystep Recognition in Instructional Videos + + +
+ Procedural activity understanding requires perceiving human actions in terms +of a broader task, where multiple keysteps are performed in sequence across a +long video to reach a final goal state -- such as the steps of a recipe or a +DIY fix-it task. Prior work largely treats keystep recognition in isolation of +this broader structure, or else rigidly confines keysteps to align with a +predefined sequential script. We propose discovering a task graph automatically +from how-to videos to represent probabilistically how people tend to execute +keysteps, and then leverage this graph to regularize keystep recognition in +novel videos. On multiple datasets of real-world instructional videos, we show +the impact: more reliable zero-shot keystep localization and improved video +representation learning, exceeding the state of the art. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Semantic Counting from Self-Collages + + +
+ While recent supervised methods for reference-based object counting continue +to improve the performance on benchmark datasets, they have to rely on small +datasets due to the cost associated with manually annotating dozens of objects +in images. We propose Unsupervised Counter (UnCo), a model that can learn this +task without requiring any manual annotations. To this end, we construct +"SelfCollages", images with various pasted objects as training samples, that +provide a rich learning signal covering arbitrary object types and counts. Our +method builds on existing unsupervised representations and segmentation +techniques to successfully demonstrate the ability to count objects without +manual supervision. Our experiments show that our method not only outperforms +simple baselines and generic models such as FasterRCNN, but also matches the +performance of supervised counting models in some domains. + +
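+ A minimal sketch of how a "SelfCollage" training sample could be assembled,
+under the simple assumption that pasting n object crops onto a background and
+using n as the count label is the essence of the construction; placement,
+blending, and crop sourcing in the actual method are richer than this.
+
+     import random
+
+     def make_self_collage(background, object_crops, max_objects=20):
+         # background: (3, H, W) tensor; object_crops: list of (3, h, w) tensors with h<=H, w<=W
+         canvas = background.clone()
+         _, H, W = canvas.shape
+         n = random.randint(1, max_objects)
+         for _ in range(n):
+             crop = random.choice(object_crops)
+             _, h, w = crop.shape
+             top, left = random.randint(0, H - h), random.randint(0, W - w)
+             canvas[:, top:top + h, left:left + w] = crop   # naive paste, no blending
+         return canvas, n                                   # image and its count label
+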
+
+ comment: 24 pages. Code available at + https://github.com/lukasknobel/SelfCollages +
+
+
+
+
+ + ♻ ☆ HumanMAC: Masked Motion Completion for Human Motion Prediction + + +
+ Human motion prediction is a classical problem in computer vision and
+computer graphics, which has a wide range of practical applications. Previous
+efforts achieve great empirical performance based on an encoding-decoding
+style. The methods of this style work by first encoding previous motions to
+latent representations and then decoding the latent representations into
+predicted motions. However, in practice, they are still unsatisfactory due to
+several issues, including complicated loss constraints, cumbersome training
+processes, and the difficulty of switching between different categories of
+motions in prediction. In this paper, to address the above issues, we jump out
+of the foregoing style and propose a novel framework from a new perspective.
+Specifically, our framework works in a masked completion fashion. In the
+training stage, we learn a motion diffusion model that generates motions from
+random noise. In the inference stage, with a denoising procedure, we make
+motion prediction conditioning on observed motions to output more continuous
+and controllable predictions. The proposed framework enjoys promising
+algorithmic properties: it needs only one loss in optimization and is trained
+in an end-to-end manner. Additionally, it effectively accomplishes the switch
+between different categories of motions, which is significant in realistic
+tasks, e.g., the animation task. Comprehensive experiments on benchmarks
+confirm the superiority of the proposed framework. The project page is
+available at https://lhchen.top/Human-MAC.
+
+
+
+
+
+ + ♻ ☆ LDMVFI: Video Frame Interpolation with Latent Diffusion Models + + +
+ Existing works on video frame interpolation (VFI) mostly employ deep neural
+networks trained to minimize the L1 or L2 distance between their outputs and
+ground-truth frames. Despite recent advances, existing VFI methods tend to
+produce perceptually inferior results, particularly for challenging scenarios
+including large motions and dynamic textures. Towards developing
+perceptually-oriented VFI methods, we propose latent diffusion model-based VFI,
+LDMVFI. This approach tackles the VFI problem from a generative perspective by
+formulating it as a conditional generation problem. As the first effort to
+address VFI using latent diffusion models, we rigorously benchmark our method
+following the common evaluation protocol adopted in the existing VFI
+literature. Our quantitative experiments and user study indicate that LDMVFI is
+able to interpolate video content with superior perceptual quality compared to
+the state of the art, even in the high-resolution regime. Our source code will
+be made available here.
+
+
+
+
+
+ + ♻ ☆ Establishing a stronger baseline for lightweight contrastive models ICME 2023 + + +
+ Recent research has reported a performance degradation in self-supervised
+contrastive learning for specially designed efficient networks, such as
+MobileNet and EfficientNet. A common practice to address this problem is to
+introduce a pretrained contrastive teacher model and train the lightweight
+networks with distillation signals generated by the teacher. However, it is
+time and resource consuming to pretrain a teacher model when it is not
+available. In this work, we aim to establish a stronger baseline for
+lightweight contrastive models without using a pretrained teacher model.
+Specifically, we show that the optimal recipe for efficient models is different
+from that of larger models, and using the same training settings as ResNet50,
+as previous research does, is inappropriate. Additionally, we observe a common
+issue in contrastive learning where either the positive or negative views can
+be noisy, and propose a smoothed version of InfoNCE loss to alleviate this
+problem. As a result, we successfully improve the linear evaluation results
+from 36.3\% to 62.3\% for MobileNet-V3-Large and from 42.2\% to 65.8\% for
+EfficientNet-B0 on ImageNet, closing the accuracy gap to ResNet50 with
+$5\times$ fewer parameters. We hope our research will facilitate the usage of
+lightweight contrastive models.
+
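+ One plausible form of a "smoothed" InfoNCE loss, shown purely as an
+illustration (the paper's exact smoothing scheme may differ): the positive pair
+receives a softened target of 1 - smoothing and the remaining mass is spread
+over the negatives, so that a noisy positive or negative view is penalized less
+harshly.
+
+     import torch
+     import torch.nn.functional as F
+
+     def smoothed_infonce(z1, z2, temperature=0.2, smoothing=0.1):
+         # z1, z2: (N, D) embeddings of two augmented views of the same N images
+         z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+         logits = z1 @ z2.t() / temperature            # (N, N), diagonal entries are positives
+         n = logits.size(0)
+         targets = torch.full_like(logits, smoothing / max(n - 1, 1))
+         targets.fill_diagonal_(1.0 - smoothing)       # softened one-hot targets
+         return -(targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()
+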
+
+ comment: ICME 2023 oral +
+
+
+
+
+ + ♻ ☆ Knowledge Boosting: Rethinking Medical Contrastive Vision-Language + Pre-Training MICCAI 2023 + + +
+ The foundation models based on pre-training technology have significantly +advanced artificial intelligence from theoretical to practical applications. +These models have facilitated the feasibility of computer-aided diagnosis for +widespread use. Medical contrastive vision-language pre-training, which does +not require human annotations, is an effective approach for guiding +representation learning using description information in diagnostic reports. +However, the effectiveness of pre-training is limited by the large-scale +semantic overlap and shifting problems in medical field. To address these +issues, we propose the Knowledge-Boosting Contrastive Vision-Language +Pre-training framework (KoBo), which integrates clinical knowledge into the +learning of vision-language semantic consistency. The framework uses an +unbiased, open-set sample-wise knowledge representation to measure negative +sample noise and supplement the correspondence between vision-language mutual +information and clinical knowledge. Extensive experiments validate the effect +of our framework on eight tasks including classification, segmentation, +retrieval, and semantic relatedness, achieving comparable or better performance +with the zero-shot or few-shot settings. Our code is open on +https://github.com/ChenXiaoFei-CS/KoBo. + +
+
+ comment: accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ ENInst: Enhancing Weakly-supervised Low-shot Instance Segmentation + + +
+ We address a weakly-supervised low-shot instance segmentation, an +annotation-efficient training method to deal with novel classes effectively. +Since it is an under-explored problem, we first investigate the difficulty of +the problem and identify the performance bottleneck by conducting systematic +analyses of model components and individual sub-tasks with a simple baseline +model. Based on the analyses, we propose ENInst with sub-task enhancement +methods: instance-wise mask refinement for enhancing pixel localization quality +and novel classifier composition for improving classification accuracy. Our +proposed method lifts the overall performance by enhancing the performance of +each sub-task. We demonstrate that our ENInst is 7.5 times more efficient in +achieving comparable performance to the existing fully-supervised few-shot +models and even outperforms them at times. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ ContrasInver: Ultra-Sparse Label Semi-supervised Regression for + Multi-dimensional Seismic Inversion + + +
+ The automated interpretation and inversion of seismic data have advanced +significantly with the development of Deep Learning (DL) methods. However, +these methods often require numerous costly well logs, limiting their +application only to mature or synthetic data. This paper presents ContrasInver, +a method that achieves seismic inversion using as few as two or three well +logs, significantly reducing current requirements. In ContrasInver, we propose +three key innovations to address the challenges of applying semi-supervised +learning to regression tasks with ultra-sparse labels. The Multi-dimensional +Sample Generation (MSG) technique pioneers a paradigm for sample generation in +multi-dimensional inversion. It produces a large number of diverse samples from +a single well, while establishing lateral continuity in seismic data. MSG +yields substantial improvements over current techniques, even without the use +of semi-supervised learning. The Region-Growing Training (RGT) strategy +leverages the inherent continuity of seismic data, effectively propagating +accuracy from closer to more distant regions based on the proximity of well +logs. The Impedance Vectorization Projection (IVP) vectorizes impedance values +and performs semi-supervised learning in a compressed space. We demonstrated +that the Jacobian matrix derived from this space can filter out some outlier +components in pseudo-label vectors, thereby solving the value confusion issue +in semi-supervised regression learning. In the experiments, ContrasInver +achieved state-of-the-art performance in the synthetic data SEAM I. In the +field data with two or three well logs, only the methods based on the +components proposed in this paper were able to achieve reasonable results. It's +the first data-driven approach yielding reliable results on the Netherlands F3 +and Delft, using only three and two well logs respectively. + +
+
+ comment: This work has been submitted to journal for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ From Knowledge Distillation to Self-Knowledge Distillation: A Unified + Approach with Normalized Loss and Customized Soft Labels ICCV 2023 + + +
+ Knowledge Distillation (KD) uses the teacher's prediction logits as soft +labels to guide the student, while self-KD does not need a real teacher to +require the soft labels. This work unifies the formulations of the two tasks by +decomposing and reorganizing the generic KD loss into a Normalized KD (NKD) +loss and customized soft labels for both target class (image's category) and +non-target classes named Universal Self-Knowledge Distillation (USKD). We +decompose the KD loss and find the non-target loss from it forces the student's +non-target logits to match the teacher's, but the sum of the two non-target +logits is different, preventing them from being identical. NKD normalizes the +non-target logits to equalize their sum. It can be generally used for KD and +self-KD to better use the soft labels for distillation loss. USKD generates +customized soft labels for both target and non-target classes without a +teacher. It smooths the target logit of the student as the soft target label +and uses the rank of the intermediate feature to generate the soft non-target +labels with Zipf's law. For KD with teachers, our NKD achieves state-of-the-art +performance on CIFAR-100 and ImageNet datasets, boosting the ImageNet Top-1 +accuracy of ResNet18 from 69.90% to 71.96% with a ResNet-34 teacher. For +self-KD without teachers, USKD is the first self-KD method that can be +effectively applied to both CNN and ViT models with negligible additional time +and memory cost, resulting in new state-of-the-art results, such as 1.17% and +0.55% accuracy gains on ImageNet for MobileNet and DeiT-Tiny, respectively. Our +codes are available at https://github.com/yzd-v/cls_KD. + +
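+ As a worked illustration of the "normalize the non-target logits so their
+sums match" idea, the sketch below splits the distillation loss into a
+target-class term and a KL term between re-normalized non-target distributions.
+It is an assumption-laden reading of the abstract, not the authors' NKD/USKD
+code.
+
+     import torch
+     import torch.nn.functional as F
+
+     def nkd_like_loss(student_logits, teacher_logits, target, gamma=1.5, temp=1.0):
+         tgt = target.unsqueeze(1)                                  # (N, 1) class indices
+         s_prob = F.softmax(student_logits / temp, dim=1)
+         t_prob = F.softmax(teacher_logits / temp, dim=1)
+         # target-class term (cross-entropy on the ground-truth class)
+         target_term = -torch.log(torch.gather(s_prob, 1, tgt).clamp_min(1e-8)).mean()
+         # zero out the target class and renormalize the remaining probabilities
+         mask = torch.ones_like(s_prob).scatter_(1, tgt, 0.0)
+         s_non = (s_prob * mask) / (s_prob * mask).sum(1, keepdim=True).clamp_min(1e-8)
+         t_non = (t_prob * mask) / (t_prob * mask).sum(1, keepdim=True).clamp_min(1e-8)
+         non_target = (t_non * (t_non.clamp_min(1e-8).log()
+                                - s_non.clamp_min(1e-8).log())).sum(1).mean()
+         return target_term + gamma * (temp ** 2) * non_target
+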
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness + + +
+ Generative AI models have recently achieved astonishing results in quality +and are consequently employed in a fast-growing number of applications. +However, since they are highly data-driven, relying on billion-sized datasets +randomly scraped from the internet, they also suffer from degenerated and +biased human behavior, as we demonstrate. In fact, they may even reinforce such +biases. To not only uncover but also combat these undesired effects, we present +a novel strategy, called Fair Diffusion, to attenuate biases after the +deployment of generative text-to-image models. Specifically, we demonstrate +shifting a bias, based on human instructions, in any direction yielding +arbitrarily new proportions for, e.g., identity groups. As our empirical +evaluation demonstrates, this introduced control enables instructing generative +image models on fairness, with no data filtering and additional training +required. + +
+
+
+
+
+ + ♻ ☆ Action-based Early Autism Diagnosis Using Contrastive Feature Learning + + +
+ Autism, also known as Autism Spectrum Disorder (or ASD), is a neurological
+disorder. Its main symptoms include difficulty in (verbal and/or non-verbal)
+communication, and rigid/repetitive behavior. These symptoms are often
+indistinguishable from those of a normal (control) individual, due to which the
+disorder remains undiagnosed in early childhood, leading to delayed treatment.
+Since the learning curve is steep during the initial age, an early diagnosis of
+autism could allow adequate interventions to be taken at the right time, which
+might positively affect the growth of an autistic child. Further, the
+traditional methods of autism diagnosis require multiple visits to a
+specialized psychiatrist; however, this process can be time-consuming. In this
+paper, we present a learning based approach to automate autism diagnosis using
+simple and small action video clips of subjects. This task is particularly
+challenging because the amount of annotated data available is small, and the
+variations among samples from the two categories (ASD and control) are
+generally indistinguishable. This is also evident from the poor performance of
+a binary classifier learned using the cross-entropy loss on top of a baseline
+encoder. To address this, we adopt contrastive feature learning in both
+self-supervised and supervised learning frameworks, and show that these can
+lead to a significant increase in the prediction accuracy of a binary
+classifier on this task. We further validate this by conducting thorough
+experimental analyses under different set-ups on two publicly available
+datasets.
+
+
+ comment: This preprint has not undergone peer review (when applicable) or any + postsubmission improvements or corrections. The Version of Record of this + article is published in Multimedia Systems (2023), and is available online at + https://doi.org/10.1007/s00530-023-01132-8 +
+
+
+
+
+ + ♻ ☆ Target-driven One-Shot Unsupervised Domain Adaptation + + +
+ In this paper, we introduce a novel framework for the challenging problem of +One-Shot Unsupervised Domain Adaptation (OSUDA), which aims to adapt to a +target domain with only a single unlabeled target sample. Unlike existing +approaches that rely on large labeled source and unlabeled target data, our +Target-driven One-Shot UDA (TOS-UDA) approach employs a learnable augmentation +strategy guided by the target sample's style to align the source distribution +with the target distribution. Our method consists of three modules: an +augmentation module, a style alignment module, and a classifier. Unlike +existing methods, our augmentation module allows for strong transformations of +the source samples, and the style of the single target sample available is +exploited to guide the augmentation by ensuring perceptual similarity. +Furthermore, our approach integrates augmentation with style alignment, +eliminating the need for separate pre-training on additional datasets. Our +method outperforms or performs comparably to existing OS-UDA methods on the +Digits and DomainNet benchmarks. + +
+
+ comment: Accepted to 22nd International Conference on IMAGE ANALYSIS AND + PROCESSING (ICIAP) 2023 +
+
+
+
+
+ + ♻ ☆ Aberration-Aware Depth-from-Focus + + +
+ Computer vision methods for depth estimation usually use simple camera models +with idealized optics. For modern machine learning approaches, this creates an +issue when attempting to train deep networks with simulated data, especially +for focus-sensitive tasks like Depth-from-Focus. In this work, we investigate +the domain gap caused by off-axis aberrations that will affect the decision of +the best-focused frame in a focal stack. We then explore bridging this domain +gap through aberration-aware training (AAT). Our approach involves a +lightweight network that models lens aberrations at different positions and +focus distances, which is then integrated into the conventional network +training pipeline. We evaluate the generality of pretrained models on both +synthetic and real-world data. Our experimental results demonstrate that the +proposed AAT scheme can improve depth estimation accuracy without fine-tuning +the model or modifying the network architecture. + +
+
+ comment: [ICCP & TPAMI 2023] Considering optical aberrations during network + training can improve the generalizability +
+
+
+
+
+ + ♻ ☆ CeRF: Convolutional Neural Radiance Fields for New View Synthesis with + Derivatives of Ray Modeling + + +
+ In recent years, novel view synthesis has gained popularity in generating +high-fidelity images. While demonstrating superior performance in the task of +synthesizing novel views, the majority of these methods are still based on the +conventional multi-layer perceptron for scene embedding. Furthermore, light +field models suffer from geometric blurring during pixel rendering, while +radiance field-based volume rendering methods have multiple solutions for a +certain target of density distribution integration. To address these issues, we +introduce the Convolutional Neural Radiance Fields to model the derivatives of +radiance along rays. Based on 1D convolutional operations, our proposed method +effectively extracts potential ray representations through a structured neural +network architecture. Besides, with the proposed ray modeling, a proposed +recurrent module is employed to solve geometric ambiguity in the fully neural +rendering process. Extensive experiments demonstrate the promising results of +our proposed model compared with existing state-of-the-art methods. + +
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Crowd Density Estimation using Imperfect Labels + + +
+ Density estimation is one of the most widely used methods for crowd counting
+in which a deep learning model learns from head-annotated crowd images to
+estimate crowd density in unseen images. Typically, the learning performance of
+the model is highly impacted by the accuracy of the annotations, and inaccurate
+annotations may lead to localization and counting errors during prediction. A
+significant number of works exist on crowd counting using perfectly labelled
+datasets, but none of them explores the impact of annotation errors on model
+accuracy. In this paper, we investigate the impact of imperfect labels (both
+noisy and missing labels) on crowd counting accuracy. We propose a system that
+automatically generates imperfect labels using a deep learning model (called
+the annotator), which are then used to train a new crowd counting model (the
+target model). Our analysis on two crowd counting models and two benchmark
+datasets shows that the proposed scheme achieves accuracy close to that of a
+model trained with perfect labels, demonstrating the robustness of crowd models
+to annotation errors.
+
+
+ comment: This paper has been accepted for presentation in 41st IEEE + International Conference on Consumer Electronics (ICCE 2023), 6-8 January, + 2023, Las Vegas, USA +
+
+
+
+
+ + ♻ ☆ Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for + Visible-Infrared Video Person Re-Identification + + +
+ In visible-infrared video person re-identification (re-ID), extracting
+features that are not affected by complex scene changes (such as modality,
+camera views, pedestrian pose, background, etc.), and mining and utilizing
+motion information, are the keys to solving cross-modal pedestrian identity
+matching. To this end, the paper proposes a new visible-infrared video person
+re-ID method from a novel perspective, i.e., adversarial self-attack defense
+and spatial-temporal relation mining. In this work, the changes of views,
+posture, background and modal discrepancy are considered as the main factors
+that cause the perturbations of person identity features. Such interference
+information contained in the training samples is used as an adversarial
+perturbation. It performs adversarial attacks on the re-ID model during the
+training to make the model more robust to these unfavorable factors. The attack
+from the adversarial perturbation is introduced by activating the interference
+information contained in the input samples without generating adversarial
+samples, and it can thus be called adversarial self-attack. This design allows
+adversarial attack and defense to be integrated into one framework. This paper
+further proposes a spatial-temporal information-guided feature representation
+network to use the information in video sequences. The network can not only
+extract the information contained in the video-frame sequences but also use the
+relation of the local information in space to guide the network to extract more
+robust features. The proposed method exhibits compelling performance on
+large-scale cross-modality video datasets. The source code of the proposed
+method will be released at https://github.com/lhf12278/xxx.
+
+
+ comment: 11 pages,8 figures +
+
+
+
+
+ + ♻ ☆ CLIP: Train Faster with Less Data + + +
+ Deep learning models require an enormous amount of data for training. +However, recently there is a shift in machine learning from model-centric to +data-centric approaches. In data-centric approaches, the focus is to refine and +improve the quality of the data to improve the learning performance of the +models rather than redesigning model architectures. In this paper, we propose +CLIP i.e., Curriculum Learning with Iterative data Pruning. CLIP combines two +data-centric approaches i.e., curriculum learning and dataset pruning to +improve the model learning accuracy and convergence speed. The proposed scheme +applies loss-aware dataset pruning to iteratively remove the least significant +samples and progressively reduces the size of the effective dataset in the +curriculum learning training. Extensive experiments performed on crowd density +estimation models validate the notion behind combining the two approaches by +reducing the convergence time and improving generalization. To our knowledge, +the idea of data pruning as an embedded process in curriculum learning is +novel. + +
+
+ comment: This paper has been accepted for presentation in 2023 International + Conference on Big Data and Smart Computing, February 13-16, 2023, Jeju, Korea +
+
+
+
+
+ + ♻ ☆ StitchNet: Composing Neural Networks from Pre-Trained Fragments + + +
+ We propose StitchNet, a novel neural network creation paradigm that stitches +together fragments (one or more consecutive network layers) from multiple +pre-trained neural networks. StitchNet allows the creation of high-performing +neural networks without the large compute and data requirements needed under +traditional model creation processes via backpropagation training. We leverage +Centered Kernel Alignment (CKA) as a compatibility measure to efficiently guide +the selection of these fragments in composing a network for a given task +tailored to specific accuracy needs and computing resource constraints. We then +show that these fragments can be stitched together to create neural networks +with comparable accuracy to traditionally trained networks at a fraction of +computing resource and data requirements. Finally, we explore a novel +on-the-fly personalized model creation and inference application enabled by +this new paradigm. + +
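+ Centered Kernel Alignment, used above as the compatibility measure between
+fragments, has a standard linear form; the helper below shows that formulation
+(how StitchNet wires it into fragment selection is not reproduced here).
+
+     import torch
+
+     def linear_cka(x, y):
+         # x, y: (n_samples, n_features) activations of two candidate fragments on the same inputs
+         x = x - x.mean(dim=0, keepdim=True)
+         y = y - y.mean(dim=0, keepdim=True)
+         hsic = (x.t() @ y).pow(2).sum()
+         denom = torch.sqrt((x.t() @ x).pow(2).sum() * (y.t() @ y).pow(2).sum())
+         return hsic / denom.clamp_min(1e-12)
+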
+
+
+
+
+ + ♻ ☆ DroneNet: Crowd Density Estimation using Self-ONNs for Drones + + +
+ Video surveillance using drones is both convenient and efficient due to the
+ease of deployment and unobstructed movement of drones in many scenarios. An
+interesting application of drone-based video surveillance is to estimate crowd
+densities (both pedestrians and vehicles) in public places. Deep learning using
+convolutional neural networks (CNNs) is employed for automatic crowd counting
+and density estimation using images and videos. However, the performance and
+accuracy of such models typically depend upon the model architecture, i.e.,
+deeper CNN models improve accuracy at the cost of increased inference time. In
+this paper, we propose a novel crowd density estimation model for drones
+(DroneNet) using Self-organized Operational Neural Networks (Self-ONN).
+Self-ONN provides efficient learning capabilities with lower computational
+complexity as compared to CNN-based models. We tested our algorithm on two
+drone-view public datasets. Our evaluation shows that the proposed DroneNet
+achieves superior performance over an equivalent CNN-based model.
+
+
+ comment: The paper has been accepted for presentation in 2023 IEEE Consumer + Communications & Networking Conference (CCNC) +
+
+
+
+
+ + ♻ ☆ Rad-ReStruct: A Novel VQA Benchmark and Method for Structured Radiology + Reporting MICCAI 2023 + + +
+ Radiology reporting is a crucial part of the communication between
+radiologists and other medical professionals, but it can be time-consuming and
+error-prone. One approach to alleviate this is structured reporting, which
+saves time and enables a more accurate evaluation than free-text reports.
+However, there is limited research on automating structured reporting, and no
+public benchmark is available for evaluating and comparing different methods.
+To close this gap, we introduce Rad-ReStruct, a new benchmark dataset that
+provides fine-grained, hierarchically ordered annotations in the form of
+structured reports for X-Ray images. We model the structured reporting task as
+hierarchical visual question answering (VQA) and propose hi-VQA, a novel method
+that considers prior context in the form of previously asked questions and
+answers for populating a structured radiology report. Our experiments show that
+hi-VQA achieves performance competitive with the state of the art on the
+medical VQA benchmark VQARad while performing best among methods without
+domain-specific vision-language pretraining, and provides a strong baseline on
+Rad-ReStruct. Our work represents a significant step towards the automated
+population of structured radiology reports and provides a valuable first
+benchmark for future research in this area. We will make all annotations and
+our code for annotation generation, model evaluation, and training publicly
+available upon acceptance. Our dataset and code are available at
+https://github.com/ChantalMP/Rad-ReStruct.
+
+
+ comment: provisionally accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Cross-Modal Retrieval for Motion and Text via MildTriple Loss + + +
+ Cross-modal retrieval has become a prominent research topic in computer
+vision and natural language processing with advances made in image-text and
+video-text retrieval technologies. However, cross-modal retrieval between human
+motion sequences and text has not garnered sufficient attention despite the
+extensive application value it holds, such as aiding virtual reality
+applications in better understanding users' actions and language. This task
+presents several challenges, including joint modeling of the two modalities,
+demanding the understanding of person-centered information from text, and
+learning behavior features from 3D human motion sequences. Previous work on
+motion data modeling mainly relied on autoregressive feature extractors that
+may forget previous information, while we propose an innovative model that
+includes simple yet powerful transformer-based motion and text encoders, which
+can learn representations from the two different modalities and capture
+long-term dependencies. Furthermore, the overlap of the same atomic actions of
+different human motions can cause semantic conflicts, leading us to explore a
+new triplet loss function, MildTriple Loss. It leverages the similarity between
+samples in the intra-modal space to guide soft-hard negative sample mining in
+the joint embedding space, training the triplet loss while reducing the
+violations caused by false negative samples. We evaluated our model and method
+on the latest HumanML3D and KIT Motion-Language datasets, achieving a 62.9\%
+recall for motion retrieval and a 71.5\% recall for text retrieval (based on
+R@10) on the HumanML3D dataset. Our code is available at
+https://github.com/eanson023/rehamot.
+
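+ The MildTriple idea of using intra-modal similarity to down-weight probable
+false negatives can be sketched roughly as below; the threshold, mining rule,
+and hinge form are illustrative assumptions rather than the paper's definition.
+
+     import torch
+     import torch.nn.functional as F
+
+     def mild_triplet_like(motion_emb, text_emb, margin=0.2, false_neg_thresh=0.9):
+         # motion_emb, text_emb: (N, D) embeddings of matched motion/text pairs
+         m = F.normalize(motion_emb, dim=1)
+         t = F.normalize(text_emb, dim=1)
+         cross = m @ t.t()                                  # (N, N) cross-modal similarity
+         pos = cross.diag().unsqueeze(1)                    # matched (positive) pairs
+         intra = m @ m.t()                                  # intra-modal similarity of motions
+         n = cross.size(0)
+         not_self = ~torch.eye(n, dtype=torch.bool, device=cross.device)
+         valid_neg = not_self & (intra < false_neg_thresh)  # drop probable false negatives
+         viol = F.relu(margin + cross - pos) * valid_neg    # hinge only on valid negatives
+         return viol.sum() / valid_neg.sum().clamp_min(1)
+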
+
+ comment: This research was rejected by the journal it was submitted to and
+ needs to be revised before resubmission
+
+
+
+
+
+ + ♻ ☆ Human in the AI loop via xAI and Active Learning for Visual Inspection + + +
+ Industrial revolutions have historically disrupted manufacturing by
+introducing automation into production. Increasing automation reshapes the role
+of the human worker. Advances in robotics and artificial intelligence open new
+frontiers of human-machine collaboration. Such collaboration can be realized
+considering two sub-fields of artificial intelligence: active learning and
+explainable artificial intelligence. Active learning aims to devise strategies
+that help obtain data that allows machine learning algorithms to learn better.
+On the other hand, explainable artificial intelligence aims to make machine
+learning models intelligible to humans. The present work first describes
+Industry 5.0, human-machine collaboration, and the state of the art regarding
+quality inspection, emphasizing visual inspection. Then it outlines how
+human-machine collaboration could be realized and enhanced in visual
+inspection. Finally, some of the results obtained in the EU H2020 STAR project
+regarding visual inspection are shared, considering artificial intelligence,
+human digital twins, and cybersecurity.
+
+
+
+
+
+ + ♻ ☆ MKConv: Multidimensional Feature Representation for Point Cloud Analysis + + +
+ Despite the remarkable success of deep learning, an optimal convolution +operation on point clouds remains elusive owing to their irregular data +structure. Existing methods mainly focus on designing an effective continuous +kernel function that can handle an arbitrary point in continuous space. Various +approaches exhibiting high performance have been proposed, but we observe that +the standard pointwise feature is represented by 1D channels and can become +more informative when its representation involves additional spatial feature +dimensions. In this paper, we present Multidimensional Kernel Convolution +(MKConv), a novel convolution operator that learns to transform the point +feature representation from a vector to a multidimensional matrix. Unlike +standard point convolution, MKConv proceeds via two steps. (i) It first +activates the spatial dimensions of local feature representation by exploiting +multidimensional kernel weights. These spatially expanded features can +represent their embedded information through spatial correlation as well as +channel correlation in feature space, carrying more detailed local structure +information. (ii) Then, discrete convolutions are applied to the +multidimensional features which can be regarded as a grid-structured matrix. In +this way, we can utilize the discrete convolutions for point cloud data without +voxelization that suffers from information loss. Furthermore, we propose a +spatial attention module, Multidimensional Local Attention (MLA), to provide +comprehensive structure awareness within the local point set by reweighting the +spatial feature dimensions. We demonstrate that MKConv has excellent +applicability to point cloud processing tasks including object classification, +object part segmentation, and scene semantic segmentation with superior +results. + +
+
+ comment: Accepted by Pattern Recognition 2023 +
+
+
+
+
+ + ♻ ☆ Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for + Multi-View Reconstruction with Reflection ICCV 2023 + + +
+ Neural implicit surface learning has shown significant progress in multi-view
+3D reconstruction, where an object is represented by multilayer perceptrons
+that provide continuous implicit surface representation and view-dependent
+radiance. However, current methods often fail to accurately reconstruct
+reflective surfaces, leading to severe ambiguity. To overcome this issue, we
+propose Ref-NeuS, which aims to reduce ambiguity by attenuating the effect of
+reflective surfaces. Specifically, we utilize an anomaly detector to estimate
+an explicit reflection score with the guidance of multi-view context to
+localize reflective surfaces. Afterward, we design a reflection-aware
+photometric loss that adaptively reduces ambiguity by modeling rendered color
+as a Gaussian distribution, with the reflection score representing the
+variance. We show that together with a reflection direction-dependent radiance,
+our model achieves high-quality surface reconstruction on reflective surfaces
+and outperforms the state of the art by a large margin. Besides, our model is
+also comparable on general surfaces.
+
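+ The reflection-aware photometric loss described above amounts to a Gaussian
+negative log-likelihood in which the reflection score acts as a per-pixel
+variance; a bare-bones version (details such as scaling and the exact use of
+the score are assumptions) looks like this.
+
+     import torch
+
+     def reflection_aware_loss(rendered_rgb, gt_rgb, reflection_score, eps=1e-4):
+         # rendered_rgb, gt_rgb: (N, 3); reflection_score: (N, 1), larger on reflective pixels
+         var = reflection_score.clamp_min(eps)
+         sq_err = (rendered_rgb - gt_rgb).pow(2).sum(dim=-1, keepdim=True)
+         # reflective pixels (large variance) contribute less to the loss
+         return (sq_err / (2.0 * var) + 0.5 * torch.log(var)).mean()
+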
+
+ comment: ICCV 2023, Project webpage: https://g3956.github.io/ +
+
+
+
+
+ + ♻ ☆ CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive + Learning ICCV 2023 + + +
+ Multimodal contrastive pretraining has been used to train multimodal +representation models, such as CLIP, on large amounts of paired image-text +data. However, previous studies have revealed that such models are vulnerable +to backdoor attacks. Specifically, when trained on backdoored examples, CLIP +learns spurious correlations between the embedded backdoor trigger and the +target label, aligning their representations in the joint embedding space. +Injecting even a small number of poisoned examples, such as 75 examples in 3 +million pretraining data, can significantly manipulate the model's behavior, +making it difficult to detect or unlearn such correlations. To address this +issue, we propose CleanCLIP, a finetuning framework that weakens the learned +spurious associations introduced by backdoor attacks by independently +re-aligning the representations for individual modalities. We demonstrate that +unsupervised finetuning using a combination of multimodal contrastive and +unimodal self-supervised objectives for individual modalities can significantly +reduce the impact of the backdoor attack. Additionally, we show that supervised +finetuning on task-specific labeled image data removes the backdoor trigger +from the CLIP vision encoder. We show empirically that CleanCLIP maintains +model performance on benign examples while erasing a range of backdoor attacks +on multimodal contrastive learning. The code and checkpoints are available at +https://github.com/nishadsinghi/CleanCLIP. + +
+
+ comment: 22 pages. Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MAViL: Masked Audio-Video Learners + + +
+ We present Masked Audio-Video Learners (MAViL) to train audio-visual +representations. Our approach learns with three complementary forms of +self-supervision: (1) reconstruction of masked audio and video input data, (2) +intra- and inter-modal contrastive learning with masking, and (3) self-training +by reconstructing joint audio-video contextualized features learned from the +first two objectives. Pre-training with MAViL not only enables the model to +perform well in audio-visual classification and retrieval tasks but also +improves representations of each modality in isolation, without using +information from the other modality for fine-tuning or inference. Empirically, +MAViL sets a new state-of-the-art on AudioSet (53.1 mAP) and VGGSound (67.1% +accuracy). For the first time, a self-supervised audio-visual model outperforms +ones that use external supervision on these benchmarks. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Feature representations useful for predicting image memorability + + +
+ Prediction of image memorability has attracted interest in various fields. +Consequently, the prediction accuracy of convolutional neural network (CNN) +models has been approaching the empirical upper bound estimated based on human +consistency. However, identifying which feature representations embedded in CNN +models are responsible for the high memorability prediction accuracy remains an +open question. To tackle this problem, we sought to identify +memorability-related feature representations in CNN models using brain +similarity. Specifically, memorability prediction accuracy and brain similarity +were examined across 16,860 layers in 64 CNN models pretrained for object +recognition. A clear tendency was observed in this comprehensive analysis that +layers with high memorability prediction accuracy had higher brain similarity +with the inferior temporal (IT) cortex, which is the highest stage in the +ventral visual pathway. Furthermore, fine-tuning of the 64 CNN models for +memorability prediction revealed that brain similarity with the IT cortex at +the penultimate layer positively correlated with the memorability prediction +accuracy of the models. This analysis also showed that the best fine-tuned +model provided accuracy comparable to state-of-the-art CNN models developed for +memorability prediction. Overall, the results of this study indicated that the +CNN models' great success in predicting memorability relies on feature +representation acquisition, similar to the IT cortex. This study advances our +understanding of feature representations and their use in predicting image +memorability. + +
+
+
+
+
+ + ♻ ☆ PVT++: A Simple End-to-End Latency-Aware Visual Tracking Framework + + +
+ Visual object tracking is essential to intelligent robots. Most existing +approaches have ignored the online latency that can cause severe performance +degradation during real-world processing. Especially for unmanned aerial +vehicles (UAVs), where robust tracking is more challenging and onboard +computation is limited, the latency issue can be fatal. In this work, we +present a simple framework for end-to-end latency-aware tracking, i.e., +end-to-end predictive visual tracking (PVT++). Unlike existing solutions that +naively append Kalman Filters after trackers, PVT++ can be jointly optimized, +so that it takes not only motion information but can also leverage the rich +visual knowledge in most pre-trained tracker models for robust prediction. +Besides, to bridge the training-evaluation domain gap, we propose a relative +motion factor, empowering PVT++ to generalize to the challenging and complex +UAV tracking scenes. These careful designs have made the small-capacity +lightweight PVT++ a widely effective solution. Additionally, this work presents +an extended latency-aware evaluation benchmark for assessing an any-speed +tracker in the online setting. Empirical results on a robotic platform from the +aerial perspective show that PVT++ can achieve significant performance gain on +various trackers and exhibit higher accuracy than prior solutions, largely +mitigating the degradation brought by latency. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Diagnose Like a Pathologist: Transformer-Enabled Hierarchical + Attention-Guided Multiple Instance Learning for Whole Slide Image + Classification IJCAI2023 + + +
+ Multiple Instance Learning (MIL) and transformers are increasingly popular in +histopathology Whole Slide Image (WSI) classification. However, unlike human +pathologists who selectively observe specific regions of histopathology tissues +under different magnifications, most methods do not incorporate multiple +resolutions of the WSIs, hierarchically and attentively, thereby leading to a +loss of focus on the WSIs and information from other resolutions. To resolve +this issue, we propose a Hierarchical Attention-Guided Multiple Instance +Learning framework to fully exploit the WSIs. This framework can dynamically +and attentively discover the discriminative regions across multiple resolutions +of the WSIs. Within this framework, an Integrated Attention Transformer is +proposed to further enhance the performance of the transformer and obtain a +more holistic WSI (bag) representation. This transformer consists of multiple +Integrated Attention Modules, which is the combination of a transformer layer +and an aggregation module that produces a bag representation based on every +instance representation in that bag. The experimental results show that our +method achieved state-of-the-art performances on multiple datasets, including +Camelyon16, TCGA-RCC, TCGA-NSCLC, and an in-house IMGC dataset. The code is +available at https://github.com/BearCleverProud/HAG-MIL. + +
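The aggregation step described above, which turns per-instance (patch) representations into a single bag (WSI) representation, can be illustrated with a generic gated attention-MIL pooling layer. This is a stand-in in the spirit of attention-based MIL, not the authors' Integrated Attention Module.

```python
import torch
import torch.nn as nn

class GatedAttentionMILPooling(nn.Module):
    """Weights each instance embedding with a learned gated attention score
    and sums them into one bag-level representation."""
    def __init__(self, dim, hidden=128):
        super().__init__()
        self.V = nn.Linear(dim, hidden)
        self.U = nn.Linear(dim, hidden)
        self.w = nn.Linear(hidden, 1)

    def forward(self, instances):                     # (num_instances, dim)
        gate = torch.tanh(self.V(instances)) * torch.sigmoid(self.U(instances))
        attn = torch.softmax(self.w(gate), dim=0)     # (num_instances, 1)
        bag = (attn * instances).sum(dim=0)           # (dim,) bag representation
        return bag, attn

pool = GatedAttentionMILPooling(dim=512)
bag_repr, weights = pool(torch.randn(1000, 512))      # 1000 patch embeddings
```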
+
+ comment: Accepted to IJCAI2023 +
+
+
+
+
+ + ♻ ☆ FemtoDet: An Object Detection Baseline for Energy Versus Performance + Tradeoffs ICCV 2023 + + +
+ Efficient detectors for edge devices are often optimized for parameter count or +speed metrics, which correlate only weakly with the energy consumption of +detectors. + However, some vision applications of convolutional neural networks, such as +always-on surveillance cameras, are critically constrained by energy. + This paper aims to serve as a baseline by designing detectors to reach +tradeoffs between energy and performance from two perspectives: + 1) We extensively analyze various CNNs to identify low-energy architectures, +including selecting activation functions, convolution operators, and feature +fusion structures on necks. These underappreciated details in past work +seriously affect the energy consumption of detectors; + 2) To break through this energy-performance dilemma, we propose a +balanced detector driven by energy using the discovered low-energy components, named +\textit{FemtoDet}. + In addition to the novel construction, we improve FemtoDet through +convolution and training-strategy optimizations. + Specifically, we develop a new instance boundary enhancement (IBE) module for +convolution optimization to overcome the contradiction between the limited +capacity of CNNs and detection tasks in diverse spatial representations, and +propose a recursive warm-restart (RecWR) training strategy to +escape the sub-optimal solutions of lightweight detectors by accounting for the data +shift produced by popular augmentations. + As a result, FemtoDet with only 68.77k parameters achieves a competitive +score of 46.3 AP50 on PASCAL VOC and 1.11 W $\&$ 64.47 FPS on Qualcomm +Snapdragon 865 CPU platforms. + Extensive experiments on COCO and TJU-DHD datasets indicate that the proposed +method achieves competitive results in diverse scenes. +
+
+ comment: 15 pages, accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SeasonDepth: Cross-Season Monocular Depth Prediction Dataset and + Benchmark under Multiple Environments IROS 2023 + + +
+ Different environments pose a great challenge to robust outdoor visual +perception for long-term autonomous driving, and the generalization of +learning-based algorithms on different environments is still an open problem. +Although monocular depth prediction has been well studied recently, few works +focus on the robustness of learning-based depth prediction across different +environments, e.g., changing illumination and seasons, owing to the lack of such +a multi-environment real-world dataset and benchmark. To this end, the first +cross-season monocular depth prediction dataset and benchmark, SeasonDepth, is +introduced to benchmark the depth estimation performance under different +environments. We investigate several state-of-the-art representative +open-source supervised and self-supervised depth prediction methods using +newly-formulated metrics. Through extensive experimental evaluation on the +proposed dataset and cross-dataset evaluation with current autonomous driving +datasets, the performance and robustness against the influence of multiple +environments are analyzed qualitatively and quantitatively. We show that +long-term monocular depth prediction is still challenging and believe our work +can boost further research on the long-term robustness and generalization for +outdoor visual perception. The dataset is available on +https://seasondepth.github.io, and the benchmark toolkit is available on +https://github.com/SeasonDepth/SeasonDepth. +
+
+ comment: Accepted by IROS 2023, 23 pages, 13 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Efficient Large-Scale Visual Representation Learning And Evaluation + + +
+ In this article, we present our approach to single-modality visual +representation learning. Understanding visual representations of items is vital +for fashion recommendations in e-commerce. We detail and contrast techniques +used to finetune large-scale visual representation learning models in an +efficient manner under low-resource settings, including several pretrained +backbone architectures, both in the convolutional neural network as well as the +vision transformer family. We describe the challenges for e-commerce +applications at-scale and highlight the efforts to more efficiently train, +evaluate, and serve visual representations. We present ablation studies +evaluating the representation offline performance for several downstream tasks, +including visually similar ad recommendations on mobile devices. To this end, +we present a novel multilingual text-to-image generative offline evaluation +method for visually similar recommendation systems. Finally, we include online +results from deployed machine learning systems in production at Etsy. + +
+
+
+
+
+ + ♻ ☆ Segment Anything in Medical Images + + +
+ Medical image segmentation is a critical component in clinical practice, +facilitating accurate diagnosis, treatment planning, and disease monitoring. +However, current methods predominantly rely on customized models, which exhibit +limited generality across diverse tasks. In this study, we present MedSAM, the +inaugural foundation model designed for universal medical image segmentation. +Harnessing the power of a meticulously curated dataset comprising over one +million images, MedSAM not only outperforms existing state-of-the-art +segmentation foundation models, but also exhibits comparable or even superior +performance to specialist models. Moreover, MedSAM enables the precise +extraction of essential biomarkers for tumor burden quantification. By +delivering accurate and efficient segmentation across a wide spectrum of tasks, +MedSAM holds significant potential to expedite the evolution of diagnostic +tools and the personalization of treatment plans. + +
+
+
+
+
+ + ♻ ☆ Heat Demand Forecasting with Multi-Resolutional Representation of + Heterogeneous Temporal Ensemble + + +
+ One of the primal challenges faced by utility companies is ensuring efficient +supply with minimal greenhouse gas emissions. The advent of smart meters and +smart grids provide an unprecedented advantage in realizing an optimised supply +of thermal energies through proactive techniques such as load forecasting. In +this paper, we propose a forecasting framework for heat demand based on neural +networks where the time series are encoded as scalograms equipped with the +capacity of embedding exogenous variables such as weather, and +holiday/non-holiday. Subsequently, CNNs are utilized to predict the heat load +multi-step ahead. Finally, the proposed framework is compared with other +state-of-the-art methods, such as SARIMAX and LSTM. The quantitative results +from retrospective experiments show that the proposed framework consistently +outperforms the state-of-the-art baseline method with real-world data acquired +from Denmark. A minimal mean error of 7.54% for MAPE and 417kW for RMSE is +achieved with the proposed framework in comparison to all other methods. + +
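A minimal sketch of the encoding idea, turning a 1-D load series into a 2-D scalogram that a CNN can consume, is shown below. The Ricker wavelet, scales, and toy series are assumptions rather than the paper's configuration; exogenous variables such as weather or holiday indicators could be stacked as additional channels.

```python
import numpy as np

def ricker(points, a):
    """Ricker (Mexican hat) wavelet of width a, sampled at `points` positions."""
    t = np.arange(points) - (points - 1) / 2.0
    return (1 - (t / a) ** 2) * np.exp(-0.5 * (t / a) ** 2)

def scalogram(series, widths):
    """Stack of wavelet responses: rows index scales, columns index time steps."""
    return np.vstack([
        np.convolve(series, ricker(min(10 * w, len(series)), w), mode="same")
        for w in widths
    ])

# toy hourly heat-load series for one week, encoded as a (scales x time) image
load = np.sin(np.linspace(0, 14 * np.pi, 24 * 7)) + 0.1 * np.random.randn(24 * 7)
image = scalogram(load, widths=np.arange(1, 31))      # shape (30, 168), CNN-ready
```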
+
+ comment: https://www.climatechange.ai/papers/neurips2022/46 +
+
+
+
+
+ + ♻ ☆ Performance Gaps of Artificial Intelligence Models Screening Mammography + -- Towards Fair and Interpretable Models + + +
+ Even though deep learning models for abnormality classification can perform +well in screening mammography, the demographic and imaging characteristics +associated with increased risk of failure for abnormality classification in +screening mammograms remain unclear. This retrospective study used data from +the Emory BrEast Imaging Dataset (EMBED) including mammograms from 115,931 +patients imaged at Emory University Healthcare between 2013 and 2020. Clinical +and imaging data includes Breast Imaging Reporting and Data System (BI-RADS) +assessment, region of interest coordinates for abnormalities, imaging features, +pathologic outcomes, and patient demographics. Deep learning models including +InceptionV3, VGG16, ResNet50V2, and ResNet152V2 were developed to distinguish +between patches of abnormal tissue and randomly selected patches of normal +tissue from the screening mammograms. The distributions of the training, +validation and test sets are 29,144 (55.6%) patches of 10,678 (54.2%) patients, +9,910 (18.9%) patches of 3,609 (18.3%) patients, and 13,390 (25.5%) patches of +5,404 (27.5%) patients. We assessed model performance overall and within +subgroups defined by age, race, pathologic outcome, and imaging characteristics +to evaluate reasons for misclassifications. On the test set, a ResNet152V2 +model trained to classify normal versus abnormal tissue patches achieved an +accuracy of 92.6% (95%CI=92.0-93.2%) and an area under the receiver operating +characteristic curve of 0.975 (95%CI=0.972-0.978). Imaging characteristics +associated with higher misclassification rates include higher tissue +densities (risk ratio [RR]=1.649; p=.010, BI-RADS density C and RR=2.026; +p=.003, BI-RADS density D), and presence of architectural distortion (RR=1.026; +p<.001). Small but statistically significant differences in performance were +observed by age, race, pathologic outcome, and other imaging features (p<.001). +
+
+ comment: 21 pages, 4 tables, 5 figures, 2 supplemental table and 1 + supplemental figure +
+
+
+
+
+ + ♻ ☆ Implicit Anatomical Rendering for Medical Image Segmentation with + Stochastic Experts MICCAI 2023 + + +
+ Integrating high-level semantically correlated contents and low-level +anatomical features is of central importance in medical image segmentation. +Towards this end, recent deep learning-based medical segmentation methods have +shown great promise in better modeling such information. However, convolution +operators for medical segmentation typically operate on regular grids, which +inherently blur the high-frequency regions, i.e., boundary regions. In this +work, we propose MORSE, a generic implicit neural rendering framework designed +at an anatomical level to assist learning in medical image segmentation. Our +method is motivated by the fact that implicit neural representation has been +shown to be more effective in fitting complex signals and solving computer +graphics problems than discrete grid-based representation. The core of our +approach is to formulate medical image segmentation as a rendering problem in +an end-to-end manner. Specifically, we continuously align the coarse +segmentation prediction with the ambiguous coordinate-based point +representations and aggregate these features to adaptively refine the boundary +region. To parallelly optimize multi-scale pixel-level features, we leverage +the idea from Mixture-of-Expert (MoE) to design and train our MORSE with a +stochastic gating mechanism. Our experiments demonstrate that MORSE can work +well with different medical segmentation backbones, consistently achieving +competitive performance improvements in both 2D and 3D supervised medical +segmentation methods. We also theoretically analyze the superiority of MORSE. + +
+
+ comment: Accepted at International Conference on Medical Image Computing and + Computer-Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ♻ ☆ ACTION++: Improving Semi-supervised Medical Image Segmentation with + Adaptive Anatomical Contrast MICCAI 2023 + + +
+ Medical data often exhibits long-tail distributions with heavy class +imbalance, which naturally leads to difficulty in classifying the minority +classes (i.e., boundary regions or rare objects). Recent work has significantly +improved semi-supervised medical image segmentation in long-tailed scenarios by +equipping them with unsupervised contrastive criteria. However, it remains +unclear how well they will perform in the labeled portion of data where class +distribution is also highly imbalanced. In this work, we present ACTION++, an +improved contrastive learning framework with adaptive anatomical contrast for +semi-supervised medical segmentation. Specifically, we propose an adaptive +supervised contrastive loss, where we first compute the optimal locations of +class centers uniformly distributed on the embedding space (i.e., off-line), +and then perform online contrastive matching training by encouraging different +class features to adaptively match these distinct and uniformly distributed +class centers. Moreover, we argue that blindly adopting a constant temperature +$\tau$ in the contrastive loss on long-tailed medical data is not optimal, and +propose to use a dynamic $\tau$ via a simple cosine schedule to yield better +separation between majority and minority classes. Empirically, we evaluate +ACTION++ on ACDC and LA benchmarks and show that it achieves state-of-the-art +across two semi-supervised settings. Theoretically, we analyze the performance +of adaptive anatomical contrast and confirm its superiority in label +efficiency. + +
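The dynamic temperature mentioned above can be realized with a simple cosine schedule; the bounds and exact schedule used by ACTION++ may differ, so the values below are purely illustrative.

```python
import math

def cosine_tau(step, total_steps, tau_min=0.07, tau_max=1.0):
    """Contrastive temperature that oscillates between tau_min and tau_max
    following a cosine schedule, instead of using a fixed constant tau."""
    return tau_min + 0.5 * (tau_max - tau_min) * (1 + math.cos(2 * math.pi * step / total_steps))

# e.g. pass cosine_tau(step, total_steps) as the temperature of a supervised
# contrastive loss at each training step
taus = [cosine_tau(s, 1000) for s in range(1000)]
```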
+
+ comment: Accepted by International Conference on Medical Image Computing and + Computer-Assisted Intervention (MICCAI 2023) +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Leveraging Recommender Systems to Reduce Content Gaps on Peer Production + Platforms AAAI + + +
+ Peer production platforms like Wikipedia commonly suffer from content gaps. +Prior research suggests recommender systems can help solve this problem, by +guiding editors towards underrepresented topics. However, it remains unclear +whether this approach would result in less relevant recommendations, leading to +reduced overall engagement with recommended items. To answer this question, we +first conducted offline analyses (Study 1) on SuggestBot, a task-routing +recommender system for Wikipedia, then did a three-month controlled experiment +(Study 2). Our results show that presenting users with articles from +underrepresented topics increased the proportion of work done on those articles +without significantly reducing overall recommendation uptake. We discuss the +implications of our results, including how ignoring the article discovery +process can artificially narrow recommendations. We draw parallels between this +phenomenon and the common issue of ``filter bubbles'' to show how any platform +that employs recommender systems is susceptible to it. + +
+
+ comment: To appear at the 18th International AAAI Conference on Web and Social + Media (ICWSM 2024) +
+
+
+
+
+ + ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the good quality of generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +weak tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baselines BM25 and the recently +proposed LLMs-based augmentation method for DR. + +
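A minimal sketch of the soft prompt tuning mechanism that SPTAR builds on: a small set of learnable prompt vectors is prepended to the input embeddings of a frozen language model, and only those vectors are optimized on the limited ground-truth data. Names and dimensions are assumptions, not the SPTAR implementation.

```python
import torch
import torch.nn as nn

class SoftPrompt(nn.Module):
    """Learnable prompt embeddings prepended to token embeddings of a frozen LM."""
    def __init__(self, prompt_len, embed_dim):
        super().__init__()
        self.prompt = nn.Parameter(torch.randn(prompt_len, embed_dim) * 0.02)

    def forward(self, token_embeds):                  # (batch, seq_len, embed_dim)
        batch = token_embeds.size(0)
        prompt = self.prompt.unsqueeze(0).expand(batch, -1, -1)
        return torch.cat([prompt, token_embeds], dim=1)

# only soft_prompt.parameters() are trained; the LM weights stay frozen
soft_prompt = SoftPrompt(prompt_len=20, embed_dim=768)
token_embeds = torch.randn(2, 50, 768)                # embeddings from the frozen LM
extended = soft_prompt(token_embeds)                  # (2, 70, 768)
```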
+
+
+
+
+ + ☆ Measuring Item Global Residual Value for Fair Recommendation + + +
+ In the era of information explosion, numerous items emerge every day, +especially in feed scenarios. Due to the limited system display slots and user +browsing attention, various recommendation systems are designed not only to +satisfy users' personalized information needs but also to allocate items' +exposure. However, recent recommendation studies mainly focus on modeling user +preferences to present satisfying results and maximize user interactions, while +paying little attention to developing item-side fair exposure mechanisms for +rational information delivery. This may lead to serious resource allocation +problems on the item side, such as the Snowball Effect. Furthermore, unfair +exposure mechanisms may hurt recommendation performance. In this paper, we call +for a shift of attention from modeling user preferences to developing fair +exposure mechanisms for items. We first conduct empirical analyses of feed +scenarios to explore exposure problems between items with distinct uploaded +times. This points out that unfair exposure caused by the time factor may be +the major cause of the Snowball Effect. Then, we propose to explicitly model +item-level customized timeliness distribution, Global Residual Value (GRV), for +fair resource allocation. This GRV module is introduced into recommendations +with the designed Timeliness-aware Fair Recommendation Framework (TaFR). +Extensive experiments on two datasets demonstrate that TaFR achieves consistent +improvements with various backbone recommendation models. By modeling item-side +customized Global Residual Value, we achieve a fairer distribution of resources +and, at the same time, improve recommendation performance. + +
+
+
+
+
+ + ☆ An Admissible Shift-Consistent Method for Recommender Systems + + +
+ In this paper, we propose a new constraint, called shift-consistency, for +solving matrix/tensor completion problems in the context of recommender +systems. Our method provably guarantees several key mathematical properties: +(1) satisfies a recently established admissibility criterion for recommender +systems; (2) satisfies a definition of fairness that eliminates a specific +class of potential opportunities for users to maliciously influence system +recommendations; and (3) offers robustness by exploiting provable uniqueness of +missing-value imputation. We provide a rigorous mathematical description of the +method, including its generalization from matrix to tensor form to permit +representation and exploitation of complex structural relationships among sets +of user and product attributes. We argue that our analysis suggests a +structured means for defining latent-space projections that can permit provable +performance properties to be established for machine learning methods. + +
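One plausible reading of the shift-consistency constraint, by analogy with unit-consistency, is that adding a constant to the observed ratings shifts the imputed ratings by the same constant. This is our interpretation for illustration only, not the paper's formal definition.

```latex
% Shift-consistency (illustrative reading): shifting the observed entries of the
% rating matrix X by a constant c shifts the completed matrix by the same c.
\[
  \widehat{M}\!\left(X + c\,\mathbf{1}\mathbf{1}^{\top}\right)
  \;=\;
  \widehat{M}(X) + c\,\mathbf{1}\mathbf{1}^{\top},
  \qquad \forall\, c \in \mathbb{R},
\]
% where $\widehat{M}$ denotes the matrix-completion operator applied to the
% partially observed matrix $X$.
```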
+
+
+
+
+ + ☆ An Exploration Study of Mixed-initiative Query Reformulation in + Conversational Passage Retrieval + + +
+ In this paper, we report our methods and experiments for the TREC +Conversational Assistance Track (CAsT) 2022. In this work, we aim to reproduce +multi-stage retrieval pipelines and explore one of the potential benefits of +involving mixed-initiative interaction in conversational passage retrieval +scenarios: reformulating raw queries. Before the first ranking stage of a +multi-stage retrieval pipeline, we propose a mixed-initiative query +reformulation module, which achieves query reformulation based on the +mixed-initiative interaction between the users and the system, as the +replacement for the neural reformulation method. Specifically, we design an +algorithm to generate appropriate questions related to the ambiguities in raw +queries, and another algorithm to reformulate raw queries by parsing users' +feedback and incorporating it into the raw query. For the first ranking stage +of our multi-stage pipelines, we adopt a sparse ranking function: BM25, and a +dense retrieval method: TCT-ColBERT. For the second-ranking step, we adopt a +pointwise reranker: MonoT5, and a pairwise reranker: DuoT5. Experiments on both +TREC CAsT 2021 and TREC CAsT 2022 datasets show the effectiveness of our +mixed-initiative-based query reformulation method on improving retrieval +performance compared with two popular reformulators: a neural reformulator: +CANARD-T5 and a rule-based reformulator: historical query reformulator(HQE). + +
+
+ comment: The Thirty-First Text REtrieval Conference (TREC 2022) Proceedings +
+
+
+
+
+ + ☆ Imposing Consistency Properties on Blackbox Systems with Applications to + SVD-Based Recommender Systems + + +
+ In this paper we discuss pre- and post-processing methods to induce desired +consistency and/or invariance properties in blackbox systems, e.g., AI-based. +We demonstrate our approach in the context of blackbox SVD-based +matrix-completion methods commonly used in recommender system (RS) +applications. We provide empirical results showing that enforcement of +unit-consistency and shift-consistency, which have provable RS-relevant +properties relating to robustness and fairness, also lead to improved +performance according to generic RMSE and MAE performance metrics, irrespective +of the initial chosen hyperparameter. + +
+
+
+
+
+ + ☆ Automated Action Model Acquisition from Narrative Texts + + +
+ Action models, which take the form of precondition/effect axioms, facilitate +causal and motivational connections between actions for AI agents. Action model +acquisition has been identified as a bottleneck in the application of planning +technology, especially within narrative planning. Acquiring action models from +narrative texts in an automated way is essential, but challenging because of +the inherent complexities of such texts. We present NaRuto, a system that +extracts structured events from narrative text and subsequently generates +planning-language-style action models based on predictions of commonsense event +relations, as well as textual contradictions and similarities, in an +unsupervised manner. Experimental results in classical narrative planning +domains show that NaRuto can generate action models of significantly better +quality than existing fully automated methods, and even on par with those of +semi-automated methods. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Evaluating and Enhancing Robustness of Deep Recommendation Systems + Against Hardware Errors + + +
+ Deep recommendation systems (DRS) heavily depend on specialized HPC hardware +and accelerators to optimize energy, efficiency, and recommendation quality. +Despite the growing number of hardware errors observed in large-scale fleet +systems where DRS are deployed, the robustness of DRS has been largely +overlooked. This paper presents the first systematic study of DRS robustness +against hardware errors. We develop Terrorch, a user-friendly, efficient and +flexible error injection framework on top of the widely-used PyTorch. We +evaluate a wide range of models and datasets and observe that the DRS +robustness against hardware errors is influenced by various factors from model +parameters to input characteristics. We also explore 3 error mitigation methods +including algorithm based fault tolerance (ABFT), activation clipping and +selective bit protection (SBP). We find that applying activation clipping can +recover up to 30% of the degraded AUC-ROC score, making it a promising +mitigation method. + +
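Of the mitigation methods listed, activation clipping is the easiest to sketch: a forward hook clamps each layer's output so that a value corrupted by a hardware bit flip cannot propagate unbounded through the network. This is a generic PyTorch illustration, not the Terrorch API.

```python
import torch
import torch.nn as nn

def clip_activations(module, inputs, output, bound=10.0):
    """Forward hook: clamp activations to [-bound, bound]; returning a value
    from a forward hook replaces the module's output."""
    return torch.clamp(output, -bound, bound)

model = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 1))
for layer in model:                                   # register on every layer
    layer.register_forward_hook(clip_activations)

out = model(torch.randn(4, 16))                       # activations are now bounded
```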
+
+
+
+
+ + ♻ ☆ Visualising Personal Data Flows: Insights from a Case Study of + Booking.com + + +
+ Commercial organisations are holding and processing an ever-increasing amount +of personal data. Policies and laws are continually changing to require these +companies to be more transparent regarding the collection, storage, processing +and sharing of this data. This paper reports our work of taking Booking.com as +a case study to visualise personal data flows extracted from their privacy +policy. By showcasing how the company shares its consumers' personal data, we +raise questions and extend discussions on the challenges and limitations of +using privacy policies to inform online users about the true scale and the +landscape of personal data flows. This case study can inform us about future +research on more data flow-oriented privacy policy analysis and on the +construction of a more comprehensive ontology on personal data flows in +complicated business ecosystems. + +
+
+ comment: This is the full edition of a paper published in Intelligent + Information Systems: CAiSE Forum 2023, Zaragoza, Spain, June 12-16, 2023, + Proceedings, Lecture Notes in Business Information Processing (LNBIP), Volume + 477, pp. 52-60, 2023, Springer Nature, + https://link.springer.com/book/10.1007/978-3-031-34674-3 +
+
+
+
+
+ + ♻ ☆ Parmesan: mathematical concept extraction for education + + +
+ Mathematics is a highly specialized domain with its own unique set of +challenges that has seen limited study in natural language processing. However, +mathematics is used in a wide variety of fields and multidisciplinary research +in many different domains often relies on an understanding of mathematical +concepts. To aid researchers coming from other fields, we develop a prototype +system for searching for and defining mathematical concepts in context, +focusing on the field of category theory. This system, Parmesan, depends on +natural language processing components including concept extraction, relation +extraction, definition extraction, and entity linking. In developing this +system, we show that existing techniques cannot be applied directly to the +category theory domain, and suggest hybrid techniques that do perform well, +though we expect the system to evolve over time. We also provide two cleaned +mathematical corpora that power the prototype system, which are based on +journal articles and wiki pages, respectively. The corpora have been annotated +with dependency trees, lemmas, and part-of-speech tags. + +
+
+
+
+
+ + ♻ ☆ Efficiently Leveraging Multi-level User Intent for Session-based + Recommendation via Atten-Mixer Network + + +
+ Session-based recommendation (SBR) aims to predict the user's next action +based on short and dynamic sessions. Recently, there has been an increasing +interest in utilizing various elaborately designed graph neural networks (GNNs) +to capture the pair-wise relationships among items, seemingly suggesting the +design of more complicated models is the panacea for improving the empirical +performance. However, these models achieve relatively marginal improvements +with exponential growth in model complexity. In this paper, we dissect the +classical GNN-based SBR models and empirically find that some sophisticated GNN +propagations are redundant, given that the readout module plays a significant role +in GNN-based models. Based on this observation, we intuitively propose to +remove the GNN propagation part, while the readout module will take on more +responsibility in the model reasoning process. To this end, we propose the +Multi-Level Attention Mixture Network (Atten-Mixer), which leverages both +concept-view and instance-view readouts to achieve multi-level reasoning over +item transitions. As simply enumerating all possible high-level concepts is +infeasible for large real-world recommender systems, we further incorporate +SBR-related inductive biases, i.e., local invariance and inherent priority, to +prune the search space. Experiments on three benchmarks demonstrate the +effectiveness and efficiency of our proposal. We have also deployed the +proposed techniques in a large-scale e-commerce online service since April +2021, with significant improvements in top-tier business metrics demonstrated +in online experiments on live traffic. +
+
+
+
+
+
+
+
+ + Machine Learning 137 + +
+
+
+ + ☆ Flow Matching in Latent Space + + +
+ Flow matching is a recent framework to train generative models that exhibits +impressive empirical performance while being relatively easier to train +compared with diffusion-based models. Despite its advantageous properties, +prior methods still face the challenges of expensive computing and a large +number of function evaluations of off-the-shelf solvers in the pixel space. +Furthermore, although latent-based generative methods have shown great success +in recent years, this particular model type remains underexplored in this area. +In this work, we propose to apply flow matching in the latent spaces of +pretrained autoencoders, which offers improved computational efficiency and +scalability for high-resolution image synthesis. This enables flow-matching +training on constrained computational resources while maintaining their quality +and flexibility. Additionally, our work stands as a pioneering contribution in +the integration of various conditions into flow matching for conditional +generation tasks, including label-conditioned image generation, image +inpainting, and semantic-to-image generation. Through extensive experiments, +our approach demonstrates its effectiveness in both quantitative and +qualitative results on various datasets, such as CelebA-HQ, FFHQ, LSUN Church & +Bedroom, and ImageNet. We also provide a theoretical control of the +Wasserstein-2 distance between the reconstructed latent flow distribution and +true data distribution, showing it is upper-bounded by the latent flow matching +objective. Our code will be available at +https://github.com/VinAIResearch/LFM.git. + +
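A minimal sketch of a flow-matching objective applied to latent codes (rectified-flow style, with a straight-line path between noise and data); the paper's exact probability path, conditioning mechanism, and velocity network are not reproduced here.

```python
import torch
import torch.nn as nn

def latent_flow_matching_loss(vnet, z1):
    """Regress a velocity field toward the constant target z1 - z0 along the
    straight-line interpolation z_t = (1 - t) z0 + t z1 in latent space.

    z1: latent codes of real images from a pretrained autoencoder, shape (B, D).
    """
    z0 = torch.randn_like(z1)                         # noise endpoint
    t = torch.rand(z1.size(0), 1, device=z1.device)   # uniform time samples
    zt = (1 - t) * z0 + t * z1
    target_v = z1 - z0
    pred_v = vnet(zt, t)
    return ((pred_v - target_v) ** 2).mean()

# toy velocity network over 64-dim latents (time appended as an extra feature)
mlp = nn.Sequential(nn.Linear(65, 128), nn.SiLU(), nn.Linear(128, 64))
vnet = lambda z, t: mlp(torch.cat([z, t], dim=-1))
loss = latent_flow_matching_loss(vnet, torch.randn(16, 64))
```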
+
+ comment: Project Page: https://vinairesearch.github.io/LFM/ +
+
+
+
+
+ + ☆ A Multiobjective Reinforcement Learning Framework for Microgrid Energy + Management + + +
+ The emergence of microgrids (MGs) has provided a promising solution for +decarbonizing and decentralizing the power grid, mitigating the challenges +posed by climate change. However, MG operations often involve considering +multiple objectives that represent the interests of different stakeholders, +leading to potentially complex conflicts. To tackle this issue, we propose a +novel multi-objective reinforcement learning framework that explores the +high-dimensional objective space and uncovers the tradeoffs between conflicting +objectives. This framework leverages exogenous information and capitalizes on +the data-driven nature of reinforcement learning, enabling the training of a +parametric policy without the need for long-term forecasts or knowledge of the +underlying uncertainty distribution. The trained policies exhibit diverse, +adaptive, and coordinative behaviors with the added benefit of providing +interpretable insights on the dynamics of their information use. We employ this +framework on the Cornell University MG (CU-MG), which is a combined heat and +power MG, to evaluate its effectiveness. The results demonstrate performance +improvements in all objectives considered compared to the status quo operations +and offer more flexibility in navigating complex operational tradeoffs. + +
+
+ comment: This work will be submitted to the IEEE Transactions on Smart Grid + for possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ FlashAttention-2: Faster Attention with Better Parallelism and Work + Partitioning + + +
+ Scaling Transformers to longer sequence lengths has been a major problem in +the last several years, promising to improve performance in language modeling +and high-resolution image understanding, as well as to unlock new applications +in code, audio, and video generation. The attention layer is the main +bottleneck in scaling to longer sequences, as its runtime and memory increase +quadratically in the sequence length. FlashAttention exploits the asymmetric +GPU memory hierarchy to bring significant memory saving (linear instead of +quadratic) and runtime speedup (2-4$\times$ compared to optimized baselines), +with no approximation. However, FlashAttention is still not nearly as fast as +optimized matrix-multiply (GEMM) operations, reaching only 25-40\% of the +theoretical maximum FLOPs/s. We observe that the inefficiency is due to +suboptimal work partitioning between different thread blocks and warps on the +GPU, causing either low-occupancy or unnecessary shared memory reads/writes. We +propose FlashAttention-2, with better work partitioning to address these +issues. In particular, we (1) tweak the algorithm to reduce the number of +non-matmul FLOPs (2) parallelize the attention computation, even for a single +head, across different thread blocks to increase occupancy, and (3) within each +thread block, distribute the work between warps to reduce communication through +shared memory. These yield around 2$\times$ speedup compared to FlashAttention, +reaching 50-73\% of the theoretical maximum FLOPs/s on A100 and getting close +to the efficiency of GEMM operations. We empirically validate that when used +end-to-end to train GPT-style models, FlashAttention-2 reaches training speed +of up to 225 TFLOPs/s per A100 GPU (72\% model FLOPs utilization). + +
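The linear memory footprint comes from computing attention block-by-block over keys/values with an online (streaming) softmax, so the full sequence-by-sequence score matrix is never materialized. The sketch below shows that idea in plain PyTorch for readability; FlashAttention-2 itself is a fused CUDA kernel with the improved work partitioning described above.

```python
import torch

def tiled_attention(q, k, v, block=128):
    """Single-head attention with a streaming softmax over key/value blocks.
    q, k, v: (seq, dim)."""
    scale = q.size(-1) ** -0.5
    out = torch.zeros_like(q)
    row_max = torch.full((q.size(0), 1), float("-inf"))
    row_sum = torch.zeros(q.size(0), 1)
    for start in range(0, k.size(0), block):
        kb, vb = k[start:start + block], v[start:start + block]
        scores = (q @ kb.t()) * scale                           # (seq, block)
        new_max = torch.maximum(row_max, scores.max(-1, keepdim=True).values)
        correction = torch.exp(row_max - new_max)               # rescale old partials
        p = torch.exp(scores - new_max)
        out = out * correction + p @ vb
        row_sum = row_sum * correction + p.sum(-1, keepdim=True)
        row_max = new_max
    return out / row_sum

q, k, v = (torch.randn(512, 64) for _ in range(3))
reference = torch.softmax((q @ k.t()) * 64 ** -0.5, dim=-1) @ v
assert torch.allclose(tiled_attention(q, k, v), reference, atol=1e-4)
```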
+
+
+
+
+ + ☆ COLLIE: Systematic Construction of Constrained Text Generation Tasks + + +
+ Text generation under constraints have seen increasing interests in natural +language processing, especially with the rapidly improving capabilities of +large language models. However, existing benchmarks for constrained generation +usually focus on fixed constraint types (e.g.,generate a sentence containing +certain words) that have proved to be easy for state-of-the-art models like +GPT-4. We present COLLIE, a grammar-based framework that allows the +specification of rich, compositional constraints with diverse generation levels +(word, sentence, paragraph, passage) and modeling challenges (e.g.,language +understanding, logical reasoning, counting, semantic planning). We also develop +tools for automatic extraction of task instances given a constraint structure +and a raw text corpus. Using COLLIE, we compile the COLLIE-v1 dataset with 2080 +instances comprising 13 constraint structures. We perform systematic +experiments across five state-of-the-art instruction-tuned language models and +analyze their performances to reveal shortcomings. COLLIE is designed to be +extensible and lightweight, and we hope the community finds it useful to +develop more complex constraints and evaluations in the future. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ An R package for parametric estimation of causal effects + + +
+ This article explains the usage of the R package CausalModels, which is publicly +available on the Comprehensive R Archive Network. While packages are available +for estimating causal effects, none provides +a collection of structural models using the conventional statistical approach +developed by Hern\'an and Robins (2020). CausalModels addresses this deficiency +of software in R concerning causal inference by offering tools for methods that +account for biases in observational data without requiring extensive +statistical knowledge. These methods should not be ignored and may be more +appropriate or efficient in solving particular problems. While implementations +of these statistical models are distributed among a number of causal packages, +CausalModels introduces a simple and accessible framework for a consistent +modeling pipeline among a variety of statistical methods for estimating causal +effects in a single R package. It consists of common methods including +standardization, IP weighting, G-estimation, outcome regression, instrumental +variables, and propensity matching. +
+
+
+
+
+ + ☆ A Rubik's Cube inspired approach to Clifford synthesis + + +
+ The problem of decomposing an arbitrary Clifford element into a sequence of +Clifford gates is known as Clifford synthesis. Drawing inspiration from +similarities between this and the famous Rubik's Cube problem, we develop a +machine learning approach for Clifford synthesis based on learning an +approximation to the distance to the identity. This approach is probabilistic +and computationally intensive. However, when a decomposition is successfully +found, it often involves fewer gates than existing synthesis algorithms. +Additionally, our approach is much more flexible than existing algorithms in +that arbitrary gate sets, device topologies, and gate fidelities may be +incorporated, thus allowing the approach to be tailored to a specific +device. +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Do Models Explain Themselves? Counterfactual Simulatability of Natural + Language Explanations + + +
+ Large language models (LLMs) are trained to imitate humans to explain human +decisions. However, do LLMs explain themselves? Can they help humans build +mental models of how LLMs process different inputs? To answer these questions, +we propose to evaluate $\textbf{counterfactual simulatability}$ of natural +language explanations: whether an explanation can enable humans to precisely +infer the model's outputs on diverse counterfactuals of the explained input. +For example, if a model answers "yes" to the input question "Can eagles fly?" +with the explanation "all birds can fly", then humans would infer from the +explanation that it would also answer "yes" to the counterfactual input "Can +penguins fly?". If the explanation is precise, then the model's answer should +match humans' expectations. + We implemented two metrics based on counterfactual simulatability: precision +and generality. We generated diverse counterfactuals automatically using LLMs. +We then used these metrics to evaluate state-of-the-art LLMs (e.g., GPT-4) on +two tasks: multi-hop factual reasoning and reward modeling. We found that LLM's +explanations have low precision and that precision does not correlate with +plausibility. Therefore, naively optimizing human approvals (e.g., RLHF) may +not be a sufficient solution. + +
+
+
+
+
+ + ☆ TableGPT: Towards Unifying Tables, Nature Language and Commands into One + GPT + + +
+ Tables are prevalent in real-world databases, requiring significant time and +effort for humans to analyze and manipulate. The advancements in large language +models (LLMs) have made it possible to interact with tables using natural +language input, bringing this capability closer to reality. In this paper, we +present TableGPT, a unified fine-tuned framework that enables LLMs to +understand and operate on tables using external functional commands. It +introduces the capability to seamlessly interact with tables, enabling a wide +range of functionalities such as question answering, data manipulation (e.g., +insert, delete, query, and modify operations), data visualization, analysis +report generation, and automated prediction. TableGPT aims to provide +convenience and accessibility to users by empowering them to effortlessly +leverage tabular data. At the core of TableGPT lies the novel concept of global +tabular representations, which empowers LLMs to gain a comprehensive +understanding of the entire table beyond meta-information. By jointly training +LLMs on both table and text modalities, TableGPT achieves a deep understanding +of tabular data and the ability to perform complex operations on tables through +chain-of-command instructions. Importantly, TableGPT offers the advantage of +being a self-contained system rather than relying on external API interfaces. +Moreover, it supports efficient data process flow, query rejection (when +appropriate) and private deployment, enabling faster domain data fine-tuning +and ensuring data privacy, which enhances the framework's adaptability to +specific use cases. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ CohortFinder: an open-source tool for data-driven partitioning of + biomedical image cohorts to yield robust machine learning models + + +
+ Batch effects (BEs) are systematic technical differences in data +collection that are unrelated to biological variation and have been shown to +negatively impact machine learning (ML) model generalizability. Here we release +CohortFinder, an open-source tool aimed at mitigating BEs via data-driven +cohort partitioning. We demonstrate that CohortFinder improves ML model performance +in downstream medical image processing tasks. CohortFinder is freely available +for download at cohortfinder.com. +
+
+ comment: 26 pages, 9 figures, 4 tables. Abstract was accepted by European + Society of Digital and Integrative Pathology (ESDIP), Germany, 2022 +
+
+
+
+
+ + ☆ Neural Image Compression: Generalization, Robustness, and Spectral + Biases + + +
+ Recent neural image compression (NIC) advances have produced models which are +starting to outperform traditional codecs. While this has led to growing +excitement about using NIC in real-world applications, the successful adoption +of any machine learning system in the wild requires it to generalize (and be +robust) to unseen distribution shifts at deployment. Unfortunately, current +research lacks comprehensive datasets and informative tools to evaluate and +understand NIC performance in real-world settings. To bridge this crucial gap, +first, this paper presents a comprehensive benchmark suite to evaluate the +out-of-distribution (OOD) performance of image compression methods. +Specifically, we provide CLIC-C and Kodak-C by introducing 15 corruptions to +popular CLIC and Kodak benchmarks. Next, we propose spectrally inspired +inspection tools to gain deeper insight into errors introduced by image +compression methods as well as their OOD performance. We then carry out a +detailed performance comparison of a classical codec with several NIC variants, +revealing intriguing findings that challenge our current understanding of the +strengths and limitations of NIC. Finally, we corroborate our empirical +findings with theoretical analysis, providing an in-depth view of the OOD +performance of NIC and its dependence on the spectral properties of the data. +Our benchmarks, spectral inspection tools, and findings provide a crucial +bridge to the real-world adoption of NIC. We hope that our work will propel +future efforts in designing robust and generalizable NIC methods. Code and data +will be made available at https://github.com/klieberman/ood_nic. + +
+
+
+
+
+ + ☆ A General Framework for Learning under Corruption: Label Noise, + Attribute Noise, and Beyond + + +
+ Corruption is frequently observed in collected data and has been extensively +studied in machine learning under different corruption models. Despite this, +there remains a limited understanding of how these models relate such that a +unified view of corruptions and their consequences on learning is still +lacking. In this work, we formally analyze corruption models at the +distribution level through a general, exhaustive framework based on Markov +kernels. We highlight the existence of intricate joint and dependent +corruptions on both labels and attributes, which are rarely touched by existing +research. Further, we show how these corruptions affect standard supervised +learning by analyzing the resulting changes in Bayes Risk. Our findings offer +qualitative insights into the consequences of "more complex" corruptions on the +learning problem, and provide a foundation for future quantitative comparisons. +Applications of the framework include corruption-corrected learning, a subcase +of which we study in this paper by theoretically analyzing loss correction with +respect to different corruption instances. + +
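As a concrete instance of a corruption expressed through a Markov kernel, symmetric label noise on $K$ classes can be written as follows; this is our illustration of the setting, not a result taken from the paper.

```latex
% Symmetric label noise with flip rate $\eta$ as a Markov kernel $T$,
% and the resulting corrupted conditional label distribution.
\[
  T(\tilde{y}\mid y)
  = (1-\eta)\,\mathbb{1}[\tilde{y}=y] + \frac{\eta}{K-1}\,\mathbb{1}[\tilde{y}\neq y],
  \qquad
  \tilde{p}(\tilde{y}\mid x) = \sum_{y} T(\tilde{y}\mid y)\, p(y\mid x).
\]
```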
+
+ comment: 42 pages +
+
+
+
+
+ + ☆ LearnedSort as a learning-augmented SampleSort: Analysis and + Parallelization + + +
+ This work analyzes and parallelizes LearnedSort, the novel algorithm that +sorts using machine learning models based on the cumulative distribution +function. LearnedSort is analyzed through the lens of algorithms with +predictions, and it is argued that LearnedSort is a learning-augmented +SampleSort. A parallel LearnedSort algorithm is developed by combining LearnedSort +with the state-of-the-art SampleSort implementation, IPS4o. Benchmarks on +synthetic and real-world datasets demonstrate improved parallel performance for +parallel LearnedSort compared to IPS4o and other sorting algorithms. +
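The "learning-augmented SampleSort" view can be sketched as: fit a cheap CDF model on a sample, route each key to a bucket by its predicted CDF value, then sort the buckets independently. The real LearnedSort trains a recursive model and handles spill-over buckets; the version below uses a plain empirical CDF purely for illustration.

```python
import numpy as np

def cdf_bucket_sort(arr, sample_size=1000, num_buckets=256):
    """Sort by bucketing keys according to an empirical-CDF model fitted on a sample."""
    sample = np.sort(np.random.choice(arr, size=min(sample_size, len(arr)), replace=False))
    cdf = np.searchsorted(sample, arr) / len(sample)          # CDF 'model' prediction
    bucket_ids = np.minimum((cdf * num_buckets).astype(int), num_buckets - 1)
    buckets = [[] for _ in range(num_buckets)]
    for key, b in zip(arr, bucket_ids):
        buckets[b].append(key)
    # buckets are ordered by value range, so sorted buckets concatenate to a sorted array
    return np.concatenate([np.sort(b) for b in buckets if b])

data = np.random.exponential(size=100_000)
assert np.array_equal(cdf_bucket_sort(data), np.sort(data))
```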
+
+ comment: Published in SSDBM 2023 +
+
+
+
+
+ + ☆ Retentive Network: A Successor to Transformer for Large Language Models + + +
+ In this work, we propose Retentive Network (RetNet) as a foundation +architecture for large language models, simultaneously achieving training +parallelism, low-cost inference, and good performance. We theoretically derive +the connection between recurrence and attention. Then we propose the retention +mechanism for sequence modeling, which supports three computation paradigms, +i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel +representation allows for training parallelism. The recurrent representation +enables low-cost $O(1)$ inference, which improves decoding throughput, latency, +and GPU memory without sacrificing performance. The chunkwise recurrent +representation facilitates efficient long-sequence modeling with linear +complexity, where each chunk is encoded parallelly while recurrently +summarizing the chunks. Experimental results on language modeling show that +RetNet achieves favorable scaling results, parallel training, low-cost +deployment, and efficient inference. The intriguing properties make RetNet a +strong successor to Transformer for large language models. Code will be +available at https://aka.ms/retnet. + +
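The recurrent paradigm can be sketched as a decayed state update, which is what enables O(1) per-token inference; the parallel (training) form with an explicit decay mask is included for comparison. Shapes, multi-head structure, gating, and normalization of the actual RetNet differ, so treat this as an illustration of the recurrence only.

```python
import torch

def recurrent_retention(q, k, v, gamma=0.96875):
    """Recurrent retention: S_n = gamma * S_{n-1} + k_n^T v_n, o_n = q_n S_n.
    q, k, v: (seq, dim)."""
    seq, dim = q.shape
    state = torch.zeros(dim, dim)
    outputs = []
    for n in range(seq):
        state = gamma * state + torch.outer(k[n], v[n])   # constant-size state
        outputs.append(q[n] @ state)
    return torch.stack(outputs)

# parallel form: decay mask D[n, m] = gamma^(n-m) for n >= m, else 0
seq, dim, gamma = 16, 32, 0.96875
q, k, v = (torch.randn(seq, dim) for _ in range(3))
idx = torch.arange(seq).float()
D = (gamma ** (idx[:, None] - idx[None, :]).clamp(min=0)) * (idx[:, None] >= idx[None, :])
parallel = ((q @ k.t()) * D) @ v
assert torch.allclose(recurrent_retention(q, k, v, gamma), parallel, atol=1e-3)
```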
+
+
+
+
+ + ☆ Understanding the impacts of crop diversification in the context of + climate change: a machine learning approach SP + + +
+ The concept of sustainable intensification in agriculture necessitates the +implementation of management practices that prioritize sustainability without +compromising productivity. However, the effects of such practices are known to +depend on environmental conditions, and are therefore expected to change as a +result of a changing climate. We study the impact of crop diversification on +productivity in the context of climate change. We leverage heterogeneous Earth +Observation data and contribute a data-driven approach based on causal machine +learning for understanding how crop diversification impacts may change in the +future. We apply this method to the country of Cyprus throughout a 4-year +period. We find that, on average, crop diversification significantly benefited +the net primary productivity of crops, increasing it by 2.8%. The effect +generally synergized well with higher maximum temperatures and lower soil +moistures. In a warmer and more drought-prone climate, we conclude that crop +diversification exhibits promising adaptation potential and is thus a sensible +policy choice with regards to agricultural productivity for present and future. + +
+
+ comment: Accepted for oral presentation at ISPRS Geospatial Week 2023 +
+
+
+
+
+ + ☆ Temporal and Geographical Analysis of Real Economic Activities in the + Bitcoin Blockchain + + +
+ We study the real economic activity in the Bitcoin blockchain that involves +transactions from/to retail users rather than between organizations such as +marketplaces, exchanges, or other services. We first introduce a heuristic +method to classify Bitcoin players into three main categories: Frequent +Receivers (FR), Neighbors of FR, and Others. We show that most real +transactions involve Frequent Receivers, representing a small fraction of the +total value exchanged according to the blockchain, but a significant fraction +of all payments, raising concerns about the centralization of the Bitcoin +ecosystem. We also conduct a weekly pattern analysis of activity, providing +insights into the geographical location of Bitcoin users and allowing us to +quantify the bias of a well-known dataset for actor identification. + +
+
+
+
+
+ + ☆ Artificial Intelligence for the Electron Ion Collider (AI4EIC) + + +
+ The Electron-Ion Collider (EIC), a state-of-the-art facility for studying the +strong force, is expected to begin commissioning its first experiments in 2028. +This is an opportune time for artificial intelligence (AI) to be included from +the start at this facility and in all phases that lead up to the experiments. +The second annual workshop organized by the AI4EIC working group, which +recently took place, centered on exploring all current and prospective +application areas of AI for the EIC. This workshop is not only beneficial for +the EIC, but also provides valuable insights for the newly established ePIC +collaboration at EIC. This paper summarizes the different activities and R&D +projects covered across the sessions of the workshop and provides an overview +of the goals, approaches and strategies regarding AI/ML in the EIC community, +as well as cutting-edge techniques currently studied in other experiments. + +
+
+ comment: 27 pages, 11 figures, AI4EIC workshop, tutorials and hackathon +
+
+
+
+
+ + ☆ Snapshot Spectral Clustering -- a costless approach to deep clustering + ensembles generation + + +
+ Despite tremendous advancements in Artificial Intelligence, learning from +large sets of data in an unsupervised manner remains a significant challenge. +Classical clustering algorithms often fail to discover complex dependencies in +large datasets, especially considering sparse, high-dimensional spaces. +However, deep learning techniques proved to be successful when dealing with +large quantities of data, efficiently reducing their dimensionality without +losing track of underlying information. Several interesting advancements have +already been made to combine deep learning and clustering. Still, the idea of +enhancing the clustering results by combining multiple views of the data +generated by deep neural networks appears to be insufficiently explored yet. +This paper aims to investigate this direction and bridge the gap between deep +neural networks, clustering techniques and ensemble learning methods. To +achieve this goal, we propose a novel deep clustering ensemble method - +Snapshot Spectral Clustering, designed to maximize the gain from combining +multiple data views while minimizing the computational costs of creating the +ensemble. Comparative analysis and experiments described in this paper prove +the proposed concept, while the conducted hyperparameter study provides a +valuable intuition to follow when selecting proper values. + +
+
+ comment: In proceedings of the International Joint Conference on Neural + Networks 2023 +
+
+
+
+
+ + ☆ A Study on the Performance of Generative Pre-trained Transformer (GPT) + in Simulating Depressed Individuals on the Standardized Depressive Symptom + Scale + + +
+ Background: Depression is a common mental disorder with societal and economic +burden. Current diagnosis relies on self-reports and assessment scales, which +have reliability issues. Objective approaches are needed for diagnosing +depression. Objective: Evaluate the potential of GPT technology in diagnosing +depression. Assess its ability to simulate individuals with depression and +investigate the influence of depression scales. Methods: Three +depression-related assessment tools (HAMD-17, SDS, GDS-15) were used. Two +experiments simulated GPT responses to normal individuals and individuals with +depression. We compared GPT's responses with expected results, assessed its +understanding of depressive symptoms, and examined performance differences under +different conditions. Results: GPT's performance in depression assessment was +evaluated. It aligned with scoring criteria for both individuals with +depression and normal individuals. Some performance differences were observed +based on depression severity. GPT performed better on scales with higher +sensitivity. Conclusion: GPT accurately simulates individuals with depression +and normal individuals during depression-related assessments. Deviations occur +when simulating different degrees of depression, limiting understanding of mild +and moderate cases. GPT performs better on scales with higher sensitivity, +indicating potential for developing more effective depression scales. GPT has +important potential in depression assessment, supporting clinicians and +patients. +
+
+
+
+
+ + ☆ FedCME: Client Matching and Classifier Exchanging to Handle Data + Heterogeneity in Federated Learning + + +
+ Data heterogeneity across clients is one of the key challenges in Federated +Learning (FL), which may slow down the global model convergence and even weaken +global model performance. Most existing approaches tackle the heterogeneity by +constraining local model updates through reference to global information +provided by the server. This can alleviate the performance degradation on the +aggregated global model. Different from existing methods, we focus on the +information exchange between clients, which could also enhance the +effectiveness of local training and lead to a high-performance global +model. Concretely, we propose a novel FL framework named FedCME, based on client +matching and classifier exchanging. In FedCME, clients with large differences +in data distribution will be matched in pairs, and then the corresponding pair +of clients will exchange their classifiers at an intermediate moment during +local training. Since the local data determines the local model training +direction, our method can correct the update direction of classifiers and +effectively alleviate local update divergence. Besides, we propose feature +alignment to enhance the training of the feature extractor. Experimental +results demonstrate that FedCME performs better than FedAvg, FedProx, MOON and +FedRS on popular federated learning benchmarks including FMNIST and CIFAR10, in +the case where data are heterogeneous. +
+
+
+
+
+ + ☆ Revisiting the Robustness of the Minimum Error Entropy Criterion: A + Transfer Learning Case Study + + +
+ Coping with distributional shifts is essential for transfer learning +methods to perform well in real-life tasks. However, most of the +existing approaches in this area either focus on an ideal scenario in which the +data does not contain noise or employ a complicated training paradigm or model +design to deal with distributional shifts. In this paper, we revisit the +robustness of the minimum error entropy (MEE) criterion, a widely used +objective in statistical signal processing to deal with non-Gaussian noise, +and investigate its feasibility and usefulness in real-life transfer learning +regression tasks, where distributional shifts are common. Specifically, we put +forward a new theoretical result showing the robustness of MEE against +covariate shift. We also show that by simply replacing the mean squared error +(MSE) loss with MEE in basic transfer learning algorithms such as +fine-tuning and linear probing, we can achieve competitive performance with +respect to state-of-the-art transfer learning algorithms. We justify our +arguments on both synthetic data and five real-world time-series datasets. +
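For reference, a common form of the MEE criterion minimizes Rényi's quadratic entropy of the prediction errors, estimated with a Gaussian kernel over pairwise error differences. The sketch below is one such estimator that could replace an MSE loss in fine-tuning or linear probing; the bandwidth and exact formulation are our choices, not necessarily the paper's.

```python
import torch

def mee_loss(pred, target, sigma=1.0):
    """Minimum error entropy via the 'information potential'
    IP = (1/N^2) * sum_{i,j} G_sigma(e_i - e_j); returns -log(IP), lower is better."""
    err = (pred - target).view(-1)
    diff = err[:, None] - err[None, :]
    information_potential = torch.exp(-diff ** 2 / (2 * sigma ** 2)).mean()
    return -torch.log(information_potential)

# drop-in replacement for nn.MSELoss() in a regression fine-tuning loop
pred = torch.randn(64, requires_grad=True)
loss = mee_loss(pred, torch.randn(64))
loss.backward()
```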
+
+
+
+
+ + ☆ Deep Learning with Passive Optical Nonlinear Mapping + + +
+ Deep learning has fundamentally transformed artificial intelligence, but the +ever-increasing complexity in deep learning models calls for specialized +hardware accelerators. Optical accelerators can potentially offer enhanced +performance, scalability, and energy efficiency. However, achieving nonlinear +mapping, a critical component of neural networks, remains challenging +optically. Here, we introduce a design that leverages multiple scattering in a +reverberating cavity to passively induce optical nonlinear random mapping, +without the need for additional laser power. A key advantage emerging from our +work is that we show we can perform optical data compression, facilitated by +multiple scattering in the cavity, to efficiently compress and retain vital +information while also decreasing data dimensionality. This allows rapid +optical information processing and generation of low dimensional mixtures of +highly nonlinear features. These are particularly useful for applications +demanding high-speed analysis and responses such as in edge computing devices. +Utilizing rapid optical information processing capabilities, our optical +platforms could potentially offer more efficient and real-time processing +solutions for a broad range of applications. We demonstrate the efficacy of our +design in improving computational performance across tasks, including +classification, image reconstruction, key-point detection, and object +detection, all achieved through optical data compression combined with a +digital decoder. Notably, we observed high performance, at an extreme +compression ratio, for real-time pedestrian detection. Our findings pave the +way for novel algorithms and architectural designs for optical computing. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Machine-Learning-based Colorectal Tissue Classification via Acoustic + Resolution Photoacoustic Microscopy + + +
+ Colorectal cancer is a deadly disease that has become increasingly prevalent
+in recent years. Early detection is crucial for saving lives, but traditional
+diagnostic methods such as colonoscopy and biopsy have limitations. Colonoscopy
+cannot provide detailed information within the tissues affected by cancer,
+while biopsy involves tissue removal, which can be painful and invasive. In
+order to improve diagnostic efficiency and reduce patient suffering, we studied
+a machine-learning-based approach for colorectal tissue classification that
+uses acoustic-resolution photoacoustic microscopy (ARPAM). With this tool, we
+were able to classify benign and malignant tissue using multiple machine
+learning methods. Our results were analyzed both quantitatively and
+qualitatively to evaluate the effectiveness of our approach.
+
+
+
+
+
+ + ☆ Multi-class point cloud completion networks for 3D cardiac anatomy + reconstruction from cine magnetic resonance images + + +
+ Cine magnetic resonance imaging (MRI) is the current gold standard for the +assessment of cardiac anatomy and function. However, it typically only acquires +a set of two-dimensional (2D) slices of the underlying three-dimensional (3D) +anatomy of the heart, thus limiting the understanding and analysis of both +healthy and pathological cardiac morphology and physiology. In this paper, we +propose a novel fully automatic surface reconstruction pipeline capable of +reconstructing multi-class 3D cardiac anatomy meshes from raw cine MRI +acquisitions. Its key component is a multi-class point cloud completion network +(PCCN) capable of correcting both the sparsity and misalignment issues of the +3D reconstruction task in a unified model. We first evaluate the PCCN on a +large synthetic dataset of biventricular anatomies and observe Chamfer +distances between reconstructed and gold standard anatomies below or similar to +the underlying image resolution for multiple levels of slice misalignment. +Furthermore, we find a reduction in reconstruction error compared to a +benchmark 3D U-Net by 32% and 24% in terms of Hausdorff distance and mean +surface distance, respectively. We then apply the PCCN as part of our automated +reconstruction pipeline to 1000 subjects from the UK Biobank study in a +cross-domain transfer setting and demonstrate its ability to reconstruct +accurate and topologically plausible biventricular heart meshes with clinical +metrics comparable to the previous literature. Finally, we investigate the +robustness of our proposed approach and observe its capacity to successfully +handle multiple common outlier conditions. + +
+
+
+
+
+ + ☆ Nonlinear Processing with Linear Optics + + +
+ Deep neural networks have achieved remarkable breakthroughs by leveraging +multiple layers of data processing to extract hidden representations, albeit at +the cost of large electronic computing power. To enhance energy efficiency and +speed, the optical implementation of neural networks aims to harness the +advantages of optical bandwidth and the energy efficiency of optical +interconnections. In the absence of low-power optical nonlinearities, the +challenge in the implementation of multilayer optical networks lies in +realizing multiple optical layers without resorting to electronic components. +In this study, we present a novel framework that uses multiple scattering that +is capable of synthesizing programmable linear and nonlinear transformations +concurrently at low optical power by leveraging the nonlinear relationship +between the scattering potential, represented by data, and the scattered field. +Theoretical and experimental investigations show that repeating the data by +multiple scattering enables non-linear optical computing at low power +continuous wave light. + +
+
+ comment: 20 pages, 9 figures and 1 table +
+
+
+
+
+ + ☆ LuckyMera: a Modular AI Framework for Building Hybrid NetHack Agents + + +
+ In the last few decades we have witnessed a significant development in +Artificial Intelligence (AI) thanks to the availability of a variety of +testbeds, mostly based on simulated environments and video games. Among those, +roguelike games offer a very good trade-off in terms of complexity of the +environment and computational costs, which makes them perfectly suited to test +AI agents generalization capabilities. In this work, we present LuckyMera, a +flexible, modular, extensible and configurable AI framework built around +NetHack, a popular terminal-based, single-player roguelike video game. This +library is aimed at simplifying and speeding up the development of AI agents +capable of successfully playing the game and offering a high-level interface +for designing game strategies. LuckyMera comes with a set of off-the-shelf +symbolic and neural modules (called "skills"): these modules can be either +hard-coded behaviors, or neural Reinforcement Learning approaches, with the +possibility of creating compositional hybrid solutions. Additionally, LuckyMera +comes with a set of utility features to save its experiences in the form of +trajectories for further analysis and to use them as datasets to train neural +modules, with a direct interface to the NetHack Learning Environment and +MiniHack. Through an empirical evaluation we validate our skills implementation +and propose a strong baseline agent that can reach state-of-the-art +performances in the complete NetHack game. LuckyMera is open-source and +available at https://github.com/Pervasive-AI-Lab/LuckyMera. + +
+
+
+
+
+ + ☆ Multi-Domain Learning with Modulation Adapters + + +
+ Deep convolutional networks are ubiquitous in computer vision, due to their +excellent performance across different tasks for various domains. Models are, +however, often trained in isolation for each task, failing to exploit +relatedness between tasks and domains to learn more compact models that +generalise better in low-data regimes. Multi-domain learning aims to handle +related tasks, such as image classification across multiple domains, +simultaneously. Previous work on this problem explored the use of a pre-trained +and fixed domain-agnostic base network, in combination with smaller learnable +domain-specific adaptation modules. In this paper, we introduce Modulation +Adapters, which update the convolutional filter weights of the model in a +multiplicative manner for each task. Parameterising these adaptation weights in +a factored manner allows us to scale the number of per-task parameters in a +flexible manner, and to strike different parameter-accuracy trade-offs. We +evaluate our approach on the Visual Decathlon challenge, composed of ten image +classification tasks across different domains, and on the ImageNet-to-Sketch +benchmark, which consists of six image classification tasks. Our approach +yields excellent results, with accuracies that are comparable to or better than +those of existing state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Image Captions are Natural Prompts for Text-to-Image Models + + +
+ With the rapid development of Artificial Intelligence Generated Content
+(AIGC), it has become common practice in many learning tasks to train or
+fine-tune large models on synthetic data due to data-scarcity and privacy
+leakage problems. Although synthetic data promise unlimited data generation,
+real images convey massive and diverse information, so it is challenging for
+text-to-image generative models to synthesize informative training data from
+hand-crafted prompts, which usually leads to inferior generalization
+performance of downstream models. In this paper, we theoretically analyze the
+relationship between the training effect of synthetic data and the synthetic
+data distribution induced by prompts. We then propose a simple yet effective
+method that prompts text-to-image generative models to synthesize more
+informative and diverse training data. Specifically, we caption each real image
+with an advanced captioning model to obtain informative and faithful prompts
+that extract class-relevant information and clarify the polysemy of class
+names. The image captions and class names are concatenated to prompt generative
+models for training image synthesis. Extensive experiments on ImageNette,
+ImageNet-100, and ImageNet-1K verify that our method significantly improves the
+performance of models trained on synthetic training data, with 10%
+classification accuracy improvements on average.
+
+
+ comment: 20 pages, 1 figure, 10 tables +
+
+
+
+
+ + ☆ Results on Counterfactual Invariance ICML 2023 + + +
+ In this paper we provide a theoretical analysis of counterfactual invariance. +We present a variety of existing definitions, study how they relate to each +other and what their graphical implications are. We then turn to the current +major question surrounding counterfactual invariance, how does it relate to +conditional independence? We show that whilst counterfactual invariance implies +conditional independence, conditional independence does not give any +implications about the degree or likelihood of satisfying counterfactual +invariance. Furthermore, we show that for discrete causal models +counterfactually invariant functions are often constrained to be functions of +particular variables, or even constant. + +
+
+ comment: 5 pages with 6 pages of supplementary. Accepted at the ICML 2023 + workshop on Spurious Correlations, Invariance and Stability +
+
+
+
+
+ + ☆ Kernel-Based Testing for Single-Cell Differential Analysis + + +
+ Single-cell technologies have provided valuable insights into the +distribution of molecular features, such as gene expression and epigenomic +modifications. However, comparing these complex distributions in a controlled +and powerful manner poses methodological challenges. Here we propose to benefit +from the kernel-testing framework to compare the complex cell-wise +distributions of molecular features in a non-linear manner based on their +kernel embedding. Our framework not only allows for feature-wise analyses but +also enables global comparisons of transcriptomes or epigenomes, considering +their intricate dependencies. By using a classifier to discriminate cells based +on the variability of their embedding, our method uncovers heterogeneities in +cell populations that would otherwise go undetected. We show that kernel +testing overcomes the limitations of differential analysis methods dedicated to +single-cell. Kernel testing is applied to investigate the reversion process of +differentiating cells, successfully identifying cells in transition between +reversion and differentiation stages. Additionally, we analyze single-cell +ChIP-Seq data and identify a subpopulation of untreated breast cancer cells +that exhibit an epigenomic profile similar to persister cells. + +
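The kernel-testing framework compares distributions through their kernel embeddings; as a point of reference, the snippet below computes the standard unbiased squared-MMD statistic on which such kernel two-sample tests are built. It is not the paper's classifier-based test procedure, and the RBF bandwidth is an illustrative choice.

```python
import numpy as np

def rbf_kernel(X, Y, gamma=1.0):
    """RBF kernel matrix between row-wise samples X (m, d) and Y (n, d)."""
    d2 = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def mmd2_unbiased(X, Y, gamma=1.0):
    """Unbiased estimate of the squared Maximum Mean Discrepancy between the
    kernel embeddings of the two samples X and Y."""
    m, n = len(X), len(Y)
    Kxx = rbf_kernel(X, X, gamma)
    Kyy = rbf_kernel(Y, Y, gamma)
    Kxy = rbf_kernel(X, Y, gamma)
    np.fill_diagonal(Kxx, 0.0)      # drop self-similarity terms (unbiased estimator)
    np.fill_diagonal(Kyy, 0.0)
    return (Kxx.sum() / (m * (m - 1))
            + Kyy.sum() / (n * (n - 1))
            - 2.0 * Kxy.mean())
```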
+
+
+
+
+ + ☆ Efficient and Accurate Optimal Transport with Mirror Descent and + Conjugate Gradients + + +
+ We design a novel algorithm for optimal transport by drawing from the +entropic optimal transport, mirror descent and conjugate gradients literatures. +Our algorithm is able to compute optimal transport costs with arbitrary +accuracy without running into numerical stability issues. The algorithm is +implemented efficiently on GPUs and is shown empirically to converge more +quickly than traditional algorithms such as Sinkhorn's Algorithm both in terms +of number of iterations and wall-clock time in many cases. We pay particular +attention to the entropy of marginal distributions and show that high entropy +marginals make for harder optimal transport problems, for which our algorithm +is a good fit. We provide a careful ablation analysis with respect to algorithm +and problem parameters, and present benchmarking over the MNIST dataset. The +results suggest that our algorithm can be a useful addition to the +practitioner's optimal transport toolkit. Our code is open-sourced at +https://github.com/adaptive-agents-lab/MDOT-PNCG . + +
+
+
+
+
+ + ☆ Does Visual Pretraining Help End-to-End Reasoning? + + +
+ We aim to investigate whether end-to-end learning of visual reasoning can be +achieved with general-purpose neural networks, with the help of visual +pretraining. A positive result would refute the common belief that explicit +visual abstraction (e.g. object detection) is essential for compositional +generalization on visual reasoning, and confirm the feasibility of a neural +network "generalist" to solve visual recognition and reasoning tasks. We +propose a simple and general self-supervised framework which "compresses" each +video frame into a small set of tokens with a transformer network, and +reconstructs the remaining frames based on the compressed temporal context. To +minimize the reconstruction loss, the network must learn a compact +representation for each image, as well as capture temporal dynamics and object +permanence from temporal context. We perform evaluation on two visual reasoning +benchmarks, CATER and ACRE. We observe that pretraining is essential to achieve +compositional generalization for end-to-end visual reasoning. Our proposed +framework outperforms traditional supervised pretraining, including image +classification and explicit object detection, by large margins. + +
+
+
+
+
+ + ☆ Can We Trust Race Prediction? + + +
+ In the absence of sensitive race and ethnicity data, researchers, regulators, +and firms alike turn to proxies. In this paper, I train a Bidirectional Long +Short-Term Memory (BiLSTM) model on a novel dataset of voter registration data +from all 50 US states and create an ensemble that achieves up to 36.8% higher +out of sample (OOS) F1 scores than the best performing machine learning models +in the literature. Additionally, I construct the most comprehensive database of +first and surname distributions in the US in order to improve the coverage and +accuracy of Bayesian Improved Surname Geocoding (BISG) and Bayesian Improved +Firstname Surname Geocoding (BIFSG). Finally, I provide the first high-quality +benchmark dataset in order to fairly compare existing models and aid future +model developers. + +
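The abstract does not include code; the sketch below illustrates the general shape of a character-level BiLSTM name classifier of the kind described. Vocabulary size, dimensions, and the number of classes are illustrative assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class NameBiLSTM(nn.Module):
    """Character-level BiLSTM classifier for name strings (generic sketch)."""
    def __init__(self, n_chars=128, emb=32, hidden=64, n_classes=5):
        super().__init__()
        self.embed = nn.Embedding(n_chars, emb, padding_idx=0)
        self.lstm = nn.LSTM(emb, hidden, batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * hidden, n_classes)

    def forward(self, char_ids):           # (batch, seq_len) of character codes
        x = self.embed(char_ids)
        out, _ = self.lstm(x)
        pooled = out.mean(dim=1)           # average the BiLSTM states over characters
        return self.head(pooled)

def encode(name, max_len=24):
    """Map a name to padded ASCII codes; non-ASCII characters are clipped."""
    ids = [min(ord(c), 127) for c in name.lower()[:max_len]]
    return torch.tensor(ids + [0] * (max_len - len(ids)))
```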
+
+
+
+
+ + ☆ Fairness in KI-Systemen + + +
+ The more AI-assisted decisions affect people's lives, the more important the +fairness of such decisions becomes. In this chapter, we provide an introduction +to research on fairness in machine learning. We explain the main fairness +definitions and strategies for achieving fairness using concrete examples and +place fairness research in the European context. Our contribution is aimed at +an interdisciplinary audience and therefore avoids mathematical formulation but +emphasizes visualizations and examples. + -- + Je mehr KI-gest\"utzte Entscheidungen das Leben von Menschen betreffen, desto +wichtiger ist die Fairness solcher Entscheidungen. In diesem Kapitel geben wir +eine Einf\"uhrung in die Forschung zu Fairness im maschinellen Lernen. Wir +erkl\"aren die wesentlichen Fairness-Definitionen und Strategien zur Erreichung +von Fairness anhand konkreter Beispiele und ordnen die Fairness-Forschung in +den europ\"aischen Kontext ein. Unser Beitrag richtet sich dabei an ein +interdisziplin\"ares Publikum und verzichtet daher auf die mathematische +Formulierung sondern betont Visualisierungen und Beispiele. + +
+
+ comment: in German language +
+
+
+
+
+ + ☆ Cross Feature Selection to Eliminate Spurious Interactions and Single + Feature Dominance Explainable Boosting Machines + + +
+ Interpretability is a crucial aspect of machine learning models that enables
+humans to understand and trust the decision-making process of these models. In
+many real-world applications, the interpretability of models is essential for
+legal, ethical, and practical reasons. For instance, in the banking domain,
+interpretability is critical for lenders and borrowers to understand the
+reasoning behind the acceptance or rejection of loan applications as per fair
+lending laws. However, achieving interpretability in machine learning models is
+challenging, especially for complex high-performance models. Hence, Explainable
+Boosting Machines (EBMs) have been gaining popularity due to their
+interpretable and high-performance nature in various prediction tasks. However,
+these models can suffer from issues such as spurious interactions with
+redundant features and single-feature dominance across all interactions, which
+can affect the interpretability and reliability of the model's predictions. In
+this paper, we explore novel approaches to address these issues by utilizing
+alternate cross-feature selection, ensemble features, and model configuration
+alteration techniques. Our approach involves a multi-step feature selection
+procedure that selects a set of candidate features, builds ensemble features,
+and then benchmarks them using the EBM model. We evaluate our method on three
+benchmark datasets and show that the alternate techniques outperform vanilla
+EBM methods, while providing better interpretability and feature selection
+stability, and improving the model's predictive performance. Moreover, we show
+that our approach can identify meaningful interactions and reduce the dominance
+of single features in the model's predictions, leading to more reliable and
+interpretable models.
+ Index Terms: interpretability, EBMs, ensemble features, feature selection.
+
+
+
+
+
+ + ☆ A Fast Task Offloading Optimization Framework for IRS-Assisted + Multi-Access Edge Computing System + + +
+ Terahertz communication networks and intelligent reflecting surfaces exhibit +significant potential in advancing wireless networks, particularly within the +domain of aerial-based multi-access edge computing systems. These technologies +enable efficient offloading of computational tasks from user electronic devices +to Unmanned Aerial Vehicles or local execution. For the generation of +high-quality task-offloading allocations, conventional numerical optimization +methods often struggle to solve challenging combinatorial optimization problems +within the limited channel coherence time, thereby failing to respond quickly +to dynamic changes in system conditions. To address this challenge, we propose +a deep learning-based optimization framework called Iterative Order-Preserving +policy Optimization (IOPO), which enables the generation of energy-efficient +task-offloading decisions within milliseconds. Unlike exhaustive search +methods, IOPO provides continuous updates to the offloading decisions without +resorting to exhaustive search, resulting in accelerated convergence and +reduced computational complexity, particularly when dealing with complex +problems characterized by extensive solution spaces. Experimental results +demonstrate that the proposed framework can generate energy-efficient +task-offloading decisions within a very short time period, outperforming other +benchmark methods. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Classification of UHF Partial Discharge Signals in Gas-Insulated HVDC + Systems Using Neural Networks + + +
+ Undetected partial discharges (PDs) are a safety critical issue in high +voltage (HV) gas insulated systems (GIS). While the diagnosis of PDs under AC +voltage is well-established, the analysis of PDs under DC voltage remains an +active research field. A key focus of these investigations is the +classification of different PD sources to enable subsequent sophisticated +analysis. + In this paper, we propose and analyze a neural network-based approach for +classifying PD signals caused by metallic protrusions and conductive particles +on the insulator of HVDC GIS, without relying on pulse sequence analysis +features. In contrast to previous approaches, our proposed model can +discriminate the studied PD signals obtained at negative and positive +potentials, while also generalizing to unseen operating voltage multiples. +Additionally, we compare the performance of time- and frequency-domain input +signals and explore the impact of different normalization schemes to mitigate +the influence of free-space path loss between the sensor and defect location. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ SBMLtoODEjax: efficient simulation and optimization of ODE SBML models + in JAX + + +
+ Developing methods to explore, predict and control the dynamic behavior of
+biological systems, from protein pathways to complex cellular processes, is an
+essential frontier of research for bioengineering and biomedicine. Thus,
+significant effort has gone into computational inference and mathematical
+modeling of biological systems. This effort has resulted in the development of
+large collections of publicly-available models, typically stored and exchanged
+on online platforms (such as the BioModels Database) using the Systems Biology
+Markup Language (SBML), a standard format for representing mathematical models
+of biological systems. SBMLtoODEjax is a lightweight library that allows users
+to automatically parse and convert SBML models into Python models written
+end-to-end in JAX, a high-performance numerical computing library with
+automatic differentiation capabilities. SBMLtoODEjax is targeted at researchers
+who aim to incorporate SBML-specified ordinary differential equation (ODE)
+models into their Python projects and machine learning pipelines, in order to
+perform efficient numerical simulation and optimization with only a few lines
+of code. SBMLtoODEjax is available at
+https://github.com/flowersteam/sbmltoodejax.
+
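To make the target concrete, here is a toy example of the kind of end-to-end, differentiable JAX ODE model such a conversion produces. The two-species reaction and its rate constants are hypothetical and written by hand; this is not SBMLtoODEjax's actual generated code or API.

```python
import jax.numpy as jnp
from jax.experimental.ode import odeint

def dydt(y, t, k1, k2):
    """Toy reversible reaction A <-> B with forward rate k1 and backward rate k2."""
    a, b = y
    return jnp.array([-k1 * a + k2 * b, k1 * a - k2 * b])

y0 = jnp.array([1.0, 0.0])               # initial concentrations of A and B
ts = jnp.linspace(0.0, 10.0, 101)
traj = odeint(dydt, y0, ts, 0.5, 0.1)    # differentiable w.r.t. y0 and the rate constants
```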
+
+
+
+
+ + ☆ From random-walks to graph-sprints: a low-latency node embedding + framework on continuous-time dynamic graphs + + +
+ Many real-world datasets have an underlying dynamic graph structure, where
+entities and their interactions evolve over time. Machine learning models
+should consider these dynamics in order to harness their full potential in
+downstream tasks. Previous approaches for graph representation learning have
+focused on either sampling k-hop neighborhoods, akin to breadth-first search,
+or random walks, akin to depth-first search. However, these methods are
+computationally expensive and unsuitable for real-time, low-latency inference
+on dynamic graphs. To overcome these limitations, we propose graph-sprints, a
+general-purpose feature extraction framework for continuous-time dynamic graphs
+(CTDGs) that has low latency and is competitive with state-of-the-art,
+higher-latency models. To achieve this, we propose a streaming, low-latency
+approximation to random-walk-based features. In our framework, time-aware node
+embeddings summarizing multi-hop information are computed using only single-hop
+operations on the incoming edges. We evaluate our proposed approach on three
+open-source datasets and two in-house datasets, and compare with three
+state-of-the-art algorithms (TGN-attn, TGN-ID, Jodie). We demonstrate that our
+graph-sprints features, combined with a machine learning classifier, achieve
+competitive performance (outperforming all baselines for the node
+classification tasks in five datasets). Simultaneously, graph-sprints
+significantly reduces inference latencies, achieving close to an order of
+magnitude speed-up in our experimental setting.
+
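As a rough illustration of "multi-hop summaries from single-hop operations", the sketch below keeps a time-decayed state per node and refreshes the destination's state from the source node on every incoming edge. It is a simplification for intuition only, not the graph-sprints algorithm; the decay rule, mixing weight, and edge-feature handling are illustrative assumptions.

```python
import numpy as np
from collections import defaultdict

class StreamingNodeSummary:
    """Streaming node summaries updated with single-hop operations only."""
    def __init__(self, dim=16, half_life=3600.0):
        self.dim = dim
        self.decay = np.log(2) / half_life
        self.state = defaultdict(lambda: np.zeros(dim))
        self.last_seen = defaultdict(float)

    def update(self, u, v, t, edge_feat):
        """Process edge (u -> v) arriving at time t; edge_feat assumed length >= dim."""
        for node in (u, v):
            dt = t - self.last_seen[node]
            self.state[node] *= np.exp(-self.decay * dt)   # forget old information
            self.last_seen[node] = t
        # push the source's summary (plus edge features) into the destination,
        # so multi-hop information accumulates without sampling walks
        self.state[v] += 0.5 * self.state[u] + np.asarray(edge_feat)[: self.dim]

    def embedding(self, node):
        return self.state[node]
```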
+
+ comment: 9 pages, 5 figures, 7 tables +
+
+
+
+
+ + ☆ Artificial Intelligence for Science in Quantum, Atomistic, and Continuum + Systems + + +
+ Advances in artificial intelligence (AI) are fueling a new paradigm of +discoveries in natural sciences. Today, AI has started to advance natural +sciences by improving, accelerating, and enabling our understanding of natural +phenomena at a wide range of spatial and temporal scales, giving rise to a new +area of research known as AI for science (AI4Science). Being an emerging +research paradigm, AI4Science is unique in that it is an enormous and highly +interdisciplinary area. Thus, a unified and technical treatment of this field +is needed yet challenging. This paper aims to provide a technically thorough +account of a subarea of AI4Science; namely, AI for quantum, atomistic, and +continuum systems. These areas aim at understanding the physical world from the +subatomic (wavefunctions and electron density), atomic (molecules, proteins, +materials, and interactions), to macro (fluids, climate, and subsurface) scales +and form an important subarea of AI4Science. A unique advantage of focusing on +these areas is that they largely share a common set of challenges, thereby +allowing a unified and foundational treatment. A key common challenge is how to +capture physics first principles, especially symmetries, in natural systems by +deep learning methods. We provide an in-depth yet intuitive account of +techniques to achieve equivariance to symmetry transformations. We also discuss +other common technical challenges, including explainability, +out-of-distribution generalization, knowledge transfer with foundation and +large language models, and uncertainty quantification. To facilitate learning +and education, we provide categorized lists of resources that we found to be +useful. We strive to be thorough and unified and hope this initial effort may +trigger more community interests and efforts to further advance AI4Science. + +
+
+
+
+
+ + ☆ Neurosymbolic AI for Reasoning on Biomedical Knowledge Graphs + + +
+ Biomedical datasets are often modeled as knowledge graphs (KGs) because they +capture the multi-relational, heterogeneous, and dynamic natures of biomedical +systems. KG completion (KGC), can, therefore, help researchers make predictions +to inform tasks like drug repositioning. While previous approaches for KGC were +either rule-based or embedding-based, hybrid approaches based on neurosymbolic +artificial intelligence are becoming more popular. Many of these methods +possess unique characteristics which make them even better suited toward +biomedical challenges. Here, we survey such approaches with an emphasis on +their utilities and prospective benefits for biomedicine. + +
+
+ comment: Proceedings of the $\mathit{40}^{th}$ International Conference on + Machine Learning: Workshop on Knowledge and Logical Reasoning in the Era of + Data-driven Learning (https://klr-icml2023.github.io/schedule.html). PMLR + 202, 2023. Condensed, workshop-ready version of previous survey, + arXiv:2302.07200 , which is under review. 13 pages (9 content, 4 references), + 3 figures, 1 table +
+
+
+
+
+ + ☆ Vocoder drift compensation by x-vector alignment in speaker + anonymisation ISCA + + +
+ For the most popular x-vector-based approaches to speaker anonymisation, the +bulk of the anonymisation can stem from vocoding rather than from the core +anonymisation function which is used to substitute an original speaker x-vector +with that of a fictitious pseudo-speaker. This phenomenon can impede the design +of better anonymisation systems since there is a lack of fine-grained control +over the x-vector space. The work reported in this paper explores the origin of +so-called vocoder drift and shows that it is due to the mismatch between the +substituted x-vector and the original representations of the linguistic +content, intonation and prosody. Also reported is an original approach to +vocoder drift compensation. While anonymisation performance degrades as +expected, compensation reduces vocoder drift substantially, offers improved +control over the x-vector space and lays a foundation for the design of better +anonymisation functions in the future. + +
+
+ comment: Accepted at the ISCA SPSC Symposium 2023 +
+
+
+
+
+ + ☆ On the application of Large Language Models for language teaching and + assessment technology + + +
+ The recent release of very large language models such as PaLM and GPT-4 has +made an unprecedented impact in the popular media and public consciousness, +giving rise to a mixture of excitement and fear as to their capabilities and +potential uses, and shining a light on natural language processing research +which had not previously received so much attention. The developments offer +great promise for education technology, and in this paper we look specifically +at the potential for incorporating large language models in AI-driven language +teaching and assessment systems. We consider several research areas and also +discuss the risks and ethical considerations surrounding generative AI in +education technology for language learners. Overall we find that larger +language models offer improvements over previous models in text generation, +opening up routes toward content generation which had not previously been +plausible. For text generation they must be prompted carefully and their +outputs may need to be reshaped before they are ready for use. For automated +grading and grammatical error correction, tasks whose progress is checked on +well-known benchmarks, early investigations indicate that large language models +on their own do not improve on state-of-the-art results according to standard +evaluation metrics. For grading it appears that linguistic features established +in the literature should still be used for best performance, and for error +correction it may be that the models can offer alternative feedback styles +which are not measured sensitively with existing methods. In all cases, there +is work to be done to experiment with the inclusion of large language models in +education technology for language learners, in order to properly understand and +report on their capacities and limitations, and to ensure that foreseeable +risks such as misinformation and harmful bias are mitigated. + +
+
+ comment: Accepted at the AIED2023 workshop: Empowering Education with LLMs - + the Next-Gen Interface and Content Generation +
+
+
+
+
+ + ☆ Correlation-aware Spatial-Temporal Graph Learning for Multivariate + Time-series Anomaly Detection + + +
+ Multivariate time-series anomaly detection is critically important in many +applications, including retail, transportation, power grid, and water treatment +plants. Existing approaches for this problem mostly employ either statistical +models which cannot capture the non-linear relations well or conventional deep +learning models (e.g., CNN and LSTM) that do not explicitly learn the pairwise +correlations among variables. To overcome these limitations, we propose a novel +method, correlation-aware spatial-temporal graph learning (termed CST-GL), for +time series anomaly detection. CST-GL explicitly captures the pairwise +correlations via a multivariate time series correlation learning module based +on which a spatial-temporal graph neural network (STGNN) can be developed. +Then, by employing a graph convolution network that exploits one- and multi-hop +neighbor information, our STGNN component can encode rich spatial information +from complex pairwise dependencies between variables. With a temporal module +that consists of dilated convolutional functions, the STGNN can further capture +long-range dependence over time. A novel anomaly scoring component is further +integrated into CST-GL to estimate the degree of an anomaly in a purely +unsupervised manner. Experimental results demonstrate that CST-GL can detect +anomalies effectively in general settings as well as enable early detection +across different time delays. + +
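For intuition about the correlation-learning step, the sketch below builds a static k-nearest-neighbour graph from pairwise correlations of the series; CST-GL learns these relations end-to-end rather than computing them this way, so the snippet is only a simplified stand-in.

```python
import numpy as np

def correlation_graph(X, k=5):
    """Weighted adjacency from pairwise correlations of a multivariate series.

    X: (T, N) array with T time steps and N variables (assumes N > k).
    Each variable keeps edges to its k most correlated peers."""
    corr = np.corrcoef(X.T)                  # (N, N) pairwise correlations
    np.fill_diagonal(corr, -np.inf)          # exclude self-loops from top-k
    adj = np.zeros_like(corr)
    for i in range(corr.shape[0]):
        nbrs = np.argsort(corr[i])[-k:]      # indices of the k most correlated variables
        adj[i, nbrs] = corr[i, nbrs]
    return adj
```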
+
+ comment: 17 pages, double columns, 10 tables, 3 figures +
+
+
+
+
+ + ☆ Tabular Machine Learning Methods for Predicting Gas Turbine Emissions + + +
+ Predicting emissions for gas turbines is critical for monitoring harmful +pollutants being released into the atmosphere. In this study, we evaluate the +performance of machine learning models for predicting emissions for gas +turbines. We compare an existing predictive emissions model, a first +principles-based Chemical Kinetics model, against two machine learning models +we developed based on SAINT and XGBoost, to demonstrate improved predictive +performance of nitrogen oxides (NOx) and carbon monoxide (CO) using machine +learning techniques. Our analysis utilises a Siemens Energy gas turbine test +bed tabular dataset to train and validate the machine learning models. +Additionally, we explore the trade-off between incorporating more features to +enhance the model complexity, and the resulting presence of increased missing +values in the dataset. + +
+
+ comment: 23 pages, 9 figures, 1 appendix +
+
+
+
+
+ + ☆ Predicting Battery Lifetime Under Varying Usage Conditions from Early + Aging Data + + +
+ Accurate battery lifetime prediction is important for preventative +maintenance, warranties, and improved cell design and manufacturing. However, +manufacturing variability and usage-dependent degradation make life prediction +challenging. Here, we investigate new features derived from capacity-voltage +data in early life to predict the lifetime of cells cycled under widely varying +charge rates, discharge rates, and depths of discharge. Features were extracted +from regularly scheduled reference performance tests (i.e., low rate full +cycles) during cycling. The early-life features capture a cell's state of +health and the rate of change of component-level degradation modes, some of +which correlate strongly with cell lifetime. Using a newly generated dataset +from 225 nickel-manganese-cobalt/graphite Li-ion cells aged under a wide range +of conditions, we demonstrate a lifetime prediction of in-distribution cells +with 15.1% mean absolute percentage error using no more than the first 15% of +data, for most cells. Further testing using a hierarchical Bayesian regression +model shows improved performance on extrapolation, achieving 21.8% mean +absolute percentage error for out-of-distribution cells. Our approach +highlights the importance of using domain knowledge of lithium-ion battery +degradation modes to inform feature engineering. Further, we provide the +community with a new publicly available battery aging dataset with cells cycled +beyond 80% of their rated capacity. + +
+
+
+
+
+ + ☆ Q(D)O-ES: Population-based Quality (Diversity) Optimisation for Post Hoc + Ensemble Selection in AutoML + + +
+ Automated machine learning (AutoML) systems commonly ensemble models post hoc
+to improve predictive performance, typically via greedy ensemble selection
+(GES). However, we believe that GES may not always be optimal, as it performs a
+simple deterministic greedy search. In this work, we introduce two novel
+population-based ensemble selection methods, QO-ES and QDO-ES, and compare them
+to GES. While QO-ES optimises solely for predictive performance, QDO-ES also
+considers the diversity of ensembles within the population, maintaining a
+diverse set of well-performing ensembles during optimisation based on ideas of
+quality diversity optimisation. The methods are evaluated using 71
+classification datasets from the AutoML benchmark, demonstrating that QO-ES and
+QDO-ES often outrank GES, although the difference is statistically significant
+only on validation data. Our results further suggest that diversity can be
+beneficial for post hoc ensembling but also increases the risk of overfitting.
+
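For context, greedy ensemble selection (the GES baseline) can be sketched as below; QO-ES and QDO-ES replace this deterministic loop with population-based search. The metric and number of rounds here are illustrative choices, not the benchmark's settings.

```python
import numpy as np

def greedy_ensemble_selection(val_preds, y_val, n_rounds=25):
    """Caruana-style greedy ensemble selection with replacement.

    val_preds: list of (n_samples, n_classes) validation probability arrays,
               one per base model.
    Returns ensemble weights proportional to how often each model was picked."""
    chosen = []
    current = np.zeros_like(val_preds[0])
    for _ in range(n_rounds):
        best_m, best_err = None, np.inf
        for m, p in enumerate(val_preds):
            blended = (current * len(chosen) + p) / (len(chosen) + 1)
            err = 1.0 - (blended.argmax(1) == y_val).mean()   # validation error
            if err < best_err:
                best_m, best_err = m, err
        chosen.append(best_m)
        current = (current * (len(chosen) - 1) + val_preds[best_m]) / len(chosen)
    return np.bincount(chosen, minlength=len(val_preds)) / len(chosen)
```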
+
+ comment: 10 pages main paper, 24 pages references and appendix, 4 figures, 16 + subfigures, 13 tables, to be published in: International Conference on + Automated Machine Learning 2023. arXiv admin note: text overlap with + arXiv:2307.00286 +
+
+
+
+
+ + ☆ Universal Online Learning with Gradual Variations: A Multi-layer Online + Ensemble Approach + + +
+ In this paper, we propose an online convex optimization method with two +different levels of adaptivity. On a higher level, our method is agnostic to +the specific type and curvature of the loss functions, while at a lower level, +it can exploit the niceness of the environments and attain problem-dependent +guarantees. To be specific, we obtain $\mathcal{O}(\ln V_T)$, $\mathcal{O}(d +\ln V_T)$ and $\hat{\mathcal{O}}(\sqrt{V_T})$ regret bounds for strongly +convex, exp-concave and convex loss functions, respectively, where $d$ is the +dimension, $V_T$ denotes problem-dependent gradient variations and +$\hat{\mathcal{O}}(\cdot)$-notation omits logarithmic factors on $V_T$. Our +result finds broad implications and applications. It not only safeguards the +worst-case guarantees, but also implies the small-loss bounds in analysis +directly. Besides, it draws deep connections with adversarial/stochastic convex +optimization and game theory, further validating its practical potential. Our +method is based on a multi-layer online ensemble incorporating novel +ingredients, including carefully-designed optimism for unifying diverse +function types and cascaded corrections for algorithmic stability. Remarkably, +despite its multi-layer structure, our algorithm necessitates only one gradient +query per round, making it favorable when the gradient evaluation is +time-consuming. This is facilitated by a novel regret decomposition equipped +with customized surrogate losses. + +
+
+
+
+
+ + ☆ Zero-th Order Algorithm for Softmax Attention Optimization + + +
+ Large language models (LLMs) have brought about significant transformations
+in human society. Among the crucial computations in LLMs, the softmax unit
+holds great importance. It helps the model generate a probability distribution
+over potential subsequent words or phrases, given a series of input words. By
+utilizing this distribution, the model selects the most probable next word or
+phrase, based on the assigned probabilities. The softmax unit also assumes a
+vital function in LLM training, as it facilitates learning from data through
+the adjustment of neural network weights and biases.
+ As the size of LLMs grows, computing the gradient becomes expensive. However,
+zeroth-order methods can approximate the gradient using only forward passes. In
+this paper, we present a zeroth-order algorithm specifically tailored for
+softmax optimization. We demonstrate the convergence of our algorithm,
+highlighting its effectiveness in efficiently computing gradients for
+large-scale LLMs. By leveraging the zeroth-order method, our work contributes
+to the advancement of optimization techniques in the context of complex
+language models.
+
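The generic two-point zeroth-order estimator that such methods build on can be sketched as follows; the paper's algorithm and convergence analysis are specific to the softmax objective and are not reproduced here, and the smoothing radius and sample count are illustrative.

```python
import numpy as np

def zeroth_order_grad(f, w, mu=1e-3, n_samples=20, rng=np.random.default_rng(0)):
    """Two-point zeroth-order gradient estimate of f at w:
        g ~ (1/n) * sum_i [(f(w + mu*u_i) - f(w - mu*u_i)) / (2*mu)] * u_i,
    using only forward evaluations of f (no backpropagation)."""
    g = np.zeros_like(w)
    for _ in range(n_samples):
        u = rng.standard_normal(w.shape)
        g += (f(w + mu * u) - f(w - mu * u)) / (2 * mu) * u
    return g / n_samples

# e.g. minimizing a softmax-style objective with plain ZO gradient descent:
# for _ in range(steps):
#     w -= lr * zeroth_order_grad(loss, w)
```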
+
+
+
+
+ + ☆ M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models + and Latent Space Geometry Optimization + + +
+ Medical vision-language models enable co-learning and integrating features +from medical imaging and clinical text. However, these models are not easy to +train and the latent representation space can be complex. Here we propose a +novel way for pre-training and regularising medical vision-language models. The +proposed method, named Medical vision-language pre-training with Frozen +language models and Latent spAce Geometry optimization (M-FLAG), leverages a +frozen language model for training stability and efficiency and introduces a +novel orthogonality loss to harmonize the latent space geometry. We demonstrate +the potential of the pre-trained model on three downstream tasks: medical image +classification, segmentation, and object detection. Extensive experiments +across five public datasets demonstrate that M-FLAG significantly outperforms +existing medical vision-language pre-training approaches and reduces the number +of parameters by 78\%. Notably, M-FLAG achieves outstanding performance on the +segmentation task while using only 1\% of the RSNA dataset, even outperforming +ImageNet pre-trained models that have been fine-tuned using 100\% of the data. + +
+
+
+
+
+ + ☆ Gaussian processes for Bayesian inverse problems associated with linear + partial differential equations + + +
+ This work is concerned with the use of Gaussian surrogate models for Bayesian
+inverse problems associated with linear partial differential equations. A
+particular focus is on the regime where only a small amount of training data is
+available. In this regime, the type of Gaussian prior used is of critical
+importance with respect to how well the surrogate model will perform in terms
+of Bayesian inversion. We extend the framework of Raissi et al. (2017) to
+construct PDE-informed Gaussian priors that we then use to construct different
+approximate posteriors. A number of numerical experiments illustrate the
+superiority of the PDE-informed Gaussian priors over more traditional priors.
+
+
+
+
+
+ + ☆ RAYEN: Imposition of Hard Convex Constraints on Neural Networks + + +
+ This paper presents RAYEN, a framework to impose hard convex constraints on +the output or latent variable of a neural network. RAYEN guarantees that, for +any input or any weights of the network, the constraints are satisfied at all +times. Compared to other approaches, RAYEN does not perform a +computationally-expensive orthogonal projection step onto the feasible set, +does not rely on soft constraints (which do not guarantee the satisfaction of +the constraints at test time), does not use conservative approximations of the +feasible set, and does not perform a potentially slow inner gradient descent +correction to enforce the constraints. RAYEN supports any combination of +linear, convex quadratic, second-order cone (SOC), and linear matrix inequality +(LMI) constraints, achieving a very small computational overhead compared to +unconstrained networks. For example, it is able to impose 1K quadratic +constraints on a 1K-dimensional variable with an overhead of less than 8 ms, +and an LMI constraint with 300x300 dense matrices on a 10K-dimensional variable +in less than 12 ms. When used in neural networks that approximate the solution +of constrained optimization problems, RAYEN achieves computation times between +20 and 7468 times faster than state-of-the-art algorithms, while guaranteeing +the satisfaction of the constraints at all times and obtaining a cost very +close to the optimal one. + +
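As a flavour of projection-free hard-constraint enforcement, the sketch below scales the network output along a ray from a known strictly feasible interior point so that linear inequality constraints always hold. It illustrates the general idea only and is not RAYEN's construction, which also covers convex quadratic, SOC, and LMI constraints; the clamping rule is an illustrative assumption.

```python
import torch

def ray_constrain(v, A, b, z0):
    """Map an unconstrained network output v to a point guaranteed to satisfy
    A x <= b, by scaling v along the ray starting at the interior point z0.

    v: (batch, n) raw outputs, A: (m, n), b: (m,), z0: (n,) with A @ z0 < b."""
    Av = v @ A.T                               # (batch, m) constraint directions
    slack = b - A @ z0                         # (m,), strictly positive by assumption
    # largest admissible step along v for each constraint; rows pointing inward never bind
    ratios = torch.where(Av > 0, slack / Av, torch.full_like(Av, float("inf")))
    alpha_max = ratios.min(dim=1, keepdim=True).values
    scale = torch.clamp(alpha_max, max=1.0)    # move at most the full vector v
    return z0 + scale * v                      # feasible for any input and any weights
```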
+
+
+
+
+ + ☆ Analyzing the Impact of Adversarial Examples on Explainable Machine + Learning + + +
+ Adversarial attacks are a type of attack on machine learning models where an
+attacker deliberately modifies the inputs to cause the model to make incorrect
+predictions. Adversarial attacks can have serious consequences, particularly in
+applications such as autonomous vehicles, medical diagnosis, and security
+systems. Work on the vulnerability of deep learning models to adversarial
+attacks has shown that it is very easy to craft samples that cause a model to
+produce predictions its designers did not intend. In this work, we analyze the
+impact of adversarial attacks on model interpretability for text classification
+problems. We develop an ML-based classification model for text data. Then, we
+introduce adversarial perturbations on the text data to understand the
+classification performance after the attack. Subsequently, we analyze and
+interpret the model's explainability before and after the attack.
+
+
+
+
+
+ + ☆ A Secure Aggregation for Federated Learning on Long-Tailed Data + + +
+ As a distributed learning paradigm, Federated Learning (FL) faces two
+challenges: the unbalanced distribution of training data among participants,
+and model attacks by Byzantine nodes. In this paper, we consider long-tailed
+data distributions in the presence of Byzantine nodes in the FL scenario. A
+novel two-layer aggregation method is proposed for rejecting malicious models
+and judiciously selecting valuable models that contain tail-class data
+information. We introduce the concept of a think tank to leverage the wisdom of
+all participants. Preliminary experiments validate that the think tank can make
+effective model selections for global aggregation.
+
+
+
+
+
+ + ☆ Airway Label Prediction in Video Bronchoscopy: Capturing Temporal + Dependencies Utilizing Anatomical Knowledge + + +
+ Purpose: Navigation guidance is a key requirement for a multitude of lung
+interventions using video bronchoscopy. State-of-the-art solutions focus on
+lung biopsies using electromagnetic tracking and intraoperative image
+registration w.r.t. preoperative CT scans for guidance. The requirement of
+patient-specific CT scans hampers the utilisation of navigation guidance for
+other applications such as intensive care units.
+ Methods: This paper addresses navigation guidance solely incorporating
+bronchoscopy video data. In contrast to state-of-the-art approaches, we
+entirely omit the use of electromagnetic tracking and patient-specific CT
+scans. Guidance is enabled by means of topological bronchoscope localization
+w.r.t. an interpatient airway model. In particular, we take maximal advantage
+of the anatomical constraint that airway trees are traversed sequentially. This
+is realized by incorporating sequences of CNN-based airway likelihoods into a
+Hidden Markov Model.
+ Results: Our approach is evaluated based on multiple experiments inside a
+lung phantom model. With the consideration of temporal context and the use of
+anatomical knowledge for regularization, we are able to improve the accuracy up
+to 0.98, compared to 0.81 (weighted F1: 0.98 compared to 0.81) for a
+classification based on individual frames.
+ Conclusion: We combine CNN-based single-image classification of airway
+segments with anatomical constraints and temporal HMM-based inference for the
+first time. Our approach renders vision-only guidance for bronchoscopy
+interventions possible in the absence of electromagnetic tracking and
+patient-specific CT scans.
+
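The temporal inference step can be illustrated with a textbook Viterbi decode over CNN frame likelihoods and an anatomically constrained transition matrix. This is a generic sketch for intuition, not the authors' implementation.

```python
import numpy as np

def viterbi_airway(frame_probs, transition, prior):
    """Most likely airway-segment sequence given per-frame CNN likelihoods.

    frame_probs: (T, S) CNN class probabilities per frame
    transition:  (S, S) transition probabilities, zero between non-adjacent segments
    prior:       (S,)   initial segment distribution"""
    T, S = frame_probs.shape
    log_p = np.log(frame_probs + 1e-12)
    log_A = np.log(transition + 1e-12)
    delta = np.log(prior + 1e-12) + log_p[0]
    back = np.zeros((T, S), dtype=int)
    for t in range(1, T):
        scores = delta[:, None] + log_A          # (S_prev, S_next)
        back[t] = scores.argmax(axis=0)
        delta = scores.max(axis=0) + log_p[t]
    path = [int(delta.argmax())]
    for t in range(T - 1, 0, -1):                # backtrack the best path
        path.append(int(back[t, path[-1]]))
    return path[::-1]
```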
+
+ comment: Submitted to International Journal of Computer Assisted Radiology and + Surgery +
+
+
+
+
+ + ☆ GBT: Two-stage transformer framework for non-stationary time series + forecasting + + +
+ This paper shows that the time series forecasting Transformer (TSFT) suffers
+from a severe over-fitting problem caused by the improper initialization of
+unknown decoder inputs, especially when handling non-stationary time series.
+Based on this observation, we propose GBT, a novel two-stage Transformer
+framework with Good Beginning. It decouples the prediction process of TSFT into
+two stages, an Auto-Regression stage and a Self-Regression stage, to tackle the
+problem of differing statistical properties between input and prediction
+sequences. Prediction results of the Auto-Regression stage serve as a Good
+Beginning, i.e., a better initialization for the inputs of the Self-Regression
+stage. We also propose an Error Score Modification module to further enhance
+the forecasting capability of the Self-Regression stage in GBT. Extensive
+experiments on seven benchmark datasets demonstrate that GBT outperforms SOTA
+TSFTs (FEDformer, Pyraformer, ETSformer, etc.) and many other forecasting
+models (SCINet, N-HiTS, etc.) with only canonical attention and convolution,
+while having lower time and space complexity. It is also general enough to be
+coupled with these models to strengthen their forecasting capability. The
+source code is available at: https://github.com/OrigamiSL/GBT
+
+
+ comment: Accepted by Neural Networks +
+
+
+
+
+ + ☆ Systematic Testing of the Data-Poisoning Robustness of KNN + + +
+ Data poisoning aims to compromise a machine learning based software component +by contaminating its training set to change its prediction results for test +inputs. Existing methods for deciding data-poisoning robustness have either +poor accuracy or long running time and, more importantly, they can only certify +some of the truly-robust cases, but remain inconclusive when certification +fails. In other words, they cannot falsify the truly-non-robust cases. To +overcome this limitation, we propose a systematic testing based method, which +can falsify as well as certify data-poisoning robustness for a widely used +supervised-learning technique named k-nearest neighbors (KNN). Our method is +faster and more accurate than the baseline enumeration method, due to a novel +over-approximate analysis in the abstract domain, to quickly narrow down the +search space, and systematic testing in the concrete domain, to find the actual +violations. We have evaluated our method on a set of supervised-learning +datasets. Our results show that the method significantly outperforms +state-of-the-art techniques, and can decide data-poisoning robustness of KNN +prediction results for most of the test inputs. + +
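The exact-but-slow baseline that the paper improves upon can be sketched as a brute-force enumeration over small poisoning budgets. Here robustness is taken to mean that removing up to `budget` training points never flips the KNN prediction; that threat-model reading and the helper names are illustrative assumptions, not the paper's method.

```python
from itertools import combinations
from collections import Counter

def knn_predict(train, x, k=3):
    """Plain KNN vote; train is a list of (feature_tuple, label) pairs."""
    nearest = sorted(train, key=lambda p: sum((a - b) ** 2 for a, b in zip(p[0], x)))[:k]
    return Counter(lbl for _, lbl in nearest).most_common(1)[0][0]

def robust_by_enumeration(train, x, k=3, budget=1):
    """Certify or falsify robustness of the prediction for x by trying every
    removal of up to `budget` training points (exponential in the budget)."""
    base = knn_predict(train, x, k)
    for r in range(1, budget + 1):
        for removed in combinations(range(len(train)), r):
            kept = [p for i, p in enumerate(train) if i not in removed]
            if knn_predict(kept, x, k) != base:
                return False, removed          # falsified: a flipping subset found
    return True, None                          # certified for this budget
```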
+
+
+
+
+ + ☆ Going Beyond Linear Mode Connectivity: The Layerwise Linear Feature + Connectivity + + +
+ Recent work has revealed many intriguing empirical phenomena in neural +network training, despite the poorly understood and highly complex loss +landscapes and training dynamics. One of these phenomena, Linear Mode +Connectivity (LMC), has gained considerable attention due to the intriguing +observation that different solutions can be connected by a linear path in the +parameter space while maintaining near-constant training and test losses. In +this work, we introduce a stronger notion of linear connectivity, Layerwise +Linear Feature Connectivity (LLFC), which says that the feature maps of every +layer in different trained networks are also linearly connected. We provide +comprehensive empirical evidence for LLFC across a wide range of settings, +demonstrating that whenever two trained networks satisfy LMC (via either +spawning or permutation methods), they also satisfy LLFC in nearly all the +layers. Furthermore, we delve deeper into the underlying factors contributing +to LLFC, which reveal new insights into the spawning and permutation +approaches. The study of LLFC transcends and advances our understanding of LMC +by adopting a feature-learning perspective. + +
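A simple way to probe LLFC empirically is to compare the features of a weight-interpolated model against the interpolation of the two endpoint models' features, layer by layer. The sketch below does this for sequential models; the iteration over layers, the `make_model` factory, and the normalisation are illustrative assumptions, not the paper's experimental protocol.

```python
import torch

@torch.no_grad()
def llfc_gap(model_a, model_b, make_model, x, alpha=0.5):
    """Per-layer gap between features of the weight-interpolated model and the
    interpolation of the two models' features (LLFC asks for proportionality)."""
    def features(model, x):
        outs, h = [], x
        for layer in model:            # assumes an nn.Sequential-style model
            h = layer(h)
            outs.append(h)
        return outs

    interp = make_model()              # fresh network with the same architecture
    sd_a, sd_b = model_a.state_dict(), model_b.state_dict()
    interp.load_state_dict({k: (1 - alpha) * sd_a[k] + alpha * sd_b[k] for k in sd_a})

    gaps = []
    for f_i, f_a, f_b in zip(features(interp, x), features(model_a, x), features(model_b, x)):
        mix = (1 - alpha) * f_a + alpha * f_b
        # compare after normalisation, since LLFC allows a positive rescaling
        gaps.append(torch.norm(f_i / (f_i.norm() + 1e-8) - mix / (mix.norm() + 1e-8)).item())
    return gaps
```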
+
+ comment: 25 pages, 23 figures +
+
+
+
+
+ + ☆ Complexity Matters: Rethinking the Latent Space for Generative Modeling + + +
+ In generative modeling, numerous successful approaches leverage a +low-dimensional latent space, e.g., Stable Diffusion models the latent space +induced by an encoder and generates images through a paired decoder. Although +the selection of the latent space is empirically pivotal, determining the +optimal choice and the process of identifying it remain unclear. In this study, +we aim to shed light on this under-explored topic by rethinking the latent +space from the perspective of model complexity. Our investigation starts with +the classic generative adversarial networks (GANs). Inspired by the GAN +training objective, we propose a novel "distance" between the latent and data +distributions, whose minimization coincides with that of the generator +complexity. The minimizer of this distance is characterized as the optimal +data-dependent latent that most effectively capitalizes on the generator's +capacity. Then, we consider parameterizing such a latent distribution by an +encoder network and propose a two-stage training strategy called Decoupled +Autoencoder (DAE), where the encoder is only updated in the first stage with an +auxiliary decoder and then frozen in the second stage while the actual decoder +is being trained. DAE can improve the latent distribution and as a result, +improve the generative performance. Our theoretical analyses are corroborated +by comprehensive experiments on various models such as VQGAN and Diffusion +Transformer, where our modifications yield significant improvements in sample +quality with decreased model complexity. + +
+
+ comment: TL;DR: This work characterizes the optimal latent distribution for + generative models from the perspective of minimizing model complexity and + proposes a two-stage training scheme that achieves practical improvements on + GAN, VQGAN and DiT +
+
+
+
+
+ + ☆ Adversarial Attacks on Traffic Sign Recognition: A Survey CEC + + +
+ Traffic sign recognition is an essential component of perception in +autonomous vehicles, which is currently performed almost exclusively with deep +neural networks (DNNs). However, DNNs are known to be vulnerable to adversarial +attacks. Several previous works have demonstrated the feasibility of +adversarial attacks on traffic sign recognition models. Traffic signs are +particularly promising for adversarial attack research due to the ease of +performing real-world attacks using printed signs or stickers. In this work, we +survey existing works performing either digital or real-world attacks on +traffic sign detection and classification models. We provide an overview of the +latest advancements and highlight the existing research areas that require +further investigation. + +
+
+ comment: Accepted for publication at ICECCME2023 +
+
+
+
+
+ + ☆ Convex Bi-Level Optimization Problems with Non-smooth Outer Objective + Function + + +
+ In this paper, we propose the Bi-Sub-Gradient (Bi-SG) method, which is a +generalization of the classical sub-gradient method to the setting of convex +bi-level optimization problems. This is a first-order method that is very easy +to implement in the sense that it requires only a computation of the associated +proximal mapping or a sub-gradient of the outer non-smooth objective function, +in addition to a proximal gradient step on the inner optimization problem. We +show, under very mild assumptions, that Bi-SG tackles bi-level optimization +problems and achieves sub-linear rates both in terms of the inner and outer +objective functions. Moreover, if the outer objective function is additionally +strongly convex (still could be non-smooth), the outer rate can be improved to +a linear rate. Last, we prove that the distance of the generated sequence to +the set of optimal solutions of the bi-level problem converges to zero. + +
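A minimal sketch of the kind of iteration involved, assuming a smooth inner objective and an illustrative decaying outer step size, is shown below; the paper's actual step-size rules, proximal handling of the non-smooth outer term, and convergence rates differ.

```python
import numpy as np

def bi_subgradient(grad_inner, subgrad_outer, x0, inner_lr=1e-2, outer_lr=1e-3, iters=5000):
    """Toy bi-level iteration for  min_x f_outer(x)  s.t.  x in argmin f_inner:
    alternate a gradient step on the smooth inner problem with a sub-gradient
    step on the (possibly non-smooth) outer objective whose step size vanishes,
    so the inner problem dominates asymptotically."""
    x = x0.copy()
    for t in range(1, iters + 1):
        x = x - inner_lr * grad_inner(x)                     # inner gradient step
        x = x - (outer_lr / np.sqrt(t)) * subgrad_outer(x)   # vanishing outer sub-gradient step
    return x
```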
+
+ comment: Accepted for publication In SIAM journal on Optimization +
+
+
+
+
+ + ☆ A Look into Causal Effects under Entangled Treatment in Graphs: + Investigating the Impact of Contact on MRSA Infection + + +
+ Methicillin-resistant Staphylococcus aureus (MRSA) is a type of bacteria +resistant to certain antibiotics, making it difficult to prevent MRSA +infections. Among decades of efforts to conquer infectious diseases caused by +MRSA, many studies have been proposed to estimate the causal effects of close +contact (treatment) on MRSA infection (outcome) from observational data. In +this problem, the treatment assignment mechanism plays a key role as it +determines the patterns of missing counterfactuals -- the fundamental challenge +of causal effect estimation. Most existing observational studies for causal +effect learning assume that the treatment is assigned individually for each +unit. However, on many occasions, the treatments are pairwisely assigned for +units that are connected in graphs, i.e., the treatments of different units are +entangled. Neglecting the entangled treatments can impede the causal effect +estimation. In this paper, we study the problem of causal effect estimation +with treatment entangled in a graph. Despite a few explorations for entangled +treatments, this problem still remains challenging due to the following +challenges: (1) the entanglement brings difficulties in modeling and leveraging +the unknown treatment assignment mechanism; (2) there may exist hidden +confounders which lead to confounding biases in causal effect estimation; (3) +the observational data is often time-varying. To tackle these challenges, we +propose a novel method NEAT, which explicitly leverages the graph structure to +model the treatment assignment mechanism, and mitigates confounding biases +based on the treatment assignment modeling. We also extend our method into a +dynamic setting to handle time-varying observational data. Experiments on both +synthetic datasets and a real-world MRSA dataset validate the effectiveness of +the proposed method, and provide insights for future applications. + +
+
+
+
+
+ + ☆ HeroLT: Benchmarking Heterogeneous Long-Tailed Learning + + +
+ Long-tailed data distributions are prevalent in a variety of domains, +including finance, e-commerce, biomedical science, and cyber security. In such +scenarios, the performance of machine learning models is often dominated by the +head categories, while the learning of tail categories is significantly +inadequate. Given abundant studies conducted to alleviate the issue, this work +aims to provide a systematic view of long-tailed learning with regard to three +pivotal angles: (A1) the characterization of data long-tailedness, (A2) the +data complexity of various domains, and (A3) the heterogeneity of emerging +tasks. To achieve this, we develop the most comprehensive (to the best of our +knowledge) long-tailed learning benchmark named HeroLT, which integrates 13 +state-of-the-art algorithms and 6 evaluation metrics on 14 real-world benchmark +datasets across 4 tasks from 3 domains. HeroLT with novel angles and extensive +experiments (264 in total) enables researchers and practitioners to effectively +and fairly evaluate newly proposed methods compared with existing baselines on +varying types of datasets. Finally, we conclude by highlighting the significant +applications of long-tailed learning and identifying several promising future +directions. For accessibility and reproducibility, we open-source our benchmark +HeroLT and corresponding results at https://github.com/SSSKJ/HeroLT. + +
+
+
+
+
+ + ☆ Learning for Counterfactual Fairness from Observational Data + + +
+ Fairness-aware machine learning has attracted a surge of attention in many
+domains, such as online advertising, personalized recommendation, and social
+media analysis in web applications. Fairness-aware machine learning aims to
+eliminate the biases of learning models against subgroups described by
+protected (sensitive) attributes such as race, gender, and age. Among many
+existing fairness notions, counterfactual fairness is a popular notion defined
+from a causal perspective. It measures the fairness of a predictor by
+comparing the prediction of each individual in the original world and that in
+the counterfactual worlds in which the value of the sensitive attribute is
+modified. A prerequisite for existing methods to achieve counterfactual
+fairness is prior human knowledge of the causal model for the data. However,
+in real-world scenarios, the underlying causal model is often unknown, and
+acquiring such human knowledge can be very difficult. In these scenarios, it
+is risky to directly trust causal models obtained from information sources of
+unknown reliability, or even from causal discovery methods, as incorrect
+causal models can introduce biases into the predictor and lead to unfair
+predictions. In this work, we address the problem of counterfactually fair
+prediction from observational data without given causal models by proposing a
+novel framework CLAIRE. Specifically, under certain general assumptions,
+CLAIRE effectively mitigates the biases from the sensitive attribute with a
+representation learning framework based on counterfactual data augmentation
+and an invariant penalty. Experiments conducted on both synthetic and
+real-world datasets validate the superiority of CLAIRE in both counterfactual
+fairness and prediction performance. + +
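+
+ A minimal sketch of how the counterfactual-fairness gap described above can be
+ measured, assuming the counterfactual feature matrix has already been produced
+ by some causal model or counterfactual augmentation procedure (naively flipping
+ only the sensitive column would ignore its downstream effects, and this is not
+ the paper's method):
+
+ ```python
+ import numpy as np
+
+ def counterfactual_unfairness(predict, X, X_counterfactual):
+     """Average absolute gap between predictions on factual inputs and on their
+     counterfactuals in which the sensitive attribute was modified; a perfectly
+     counterfactually fair predictor yields a gap of zero."""
+     return float(np.mean(np.abs(predict(X) - predict(X_counterfactual))))
+ ```
+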
+
+
+
+
+ + ☆ Can Euclidean Symmetry be Leveraged in Reinforcement Learning and + Planning? + + +
+ In robotic tasks, changes in reference frames typically do not influence the
+underlying physical properties of the system, a property known as the
+invariance of physical laws. These changes, which preserve distance, encompass
+isometric transformations such as translations, rotations, and reflections,
+collectively known as the Euclidean group. In this work, we delve into the
+design of improved learning algorithms for reinforcement learning and planning
+tasks that possess Euclidean group symmetry. We put forth a theory that
+unifies prior work on discrete and continuous symmetry in reinforcement
+learning, planning, and optimal control. On the algorithmic side, we extend
+value-based planning from 2D path planning to continuous MDPs and propose a
+pipeline for constructing equivariant sampling-based planning algorithms. Our
+work is substantiated with empirical evidence and illustrated through examples
+that explain the benefits of equivariance to Euclidean symmetry in tackling
+natural control problems. + +
+
+ comment: Preprint. Website: http://lfzhao.com/SymCtrl +
+
+
+
+
+ + ☆ A Lightweight Framework for High-Quality Code Generation + + +
+ In recent years, the use of automated source code generation utilizing
+transformer-based generative models has expanded, and these models can
+generate functional code according to the requirements of the developers.
+However, recent research revealed that this automatically generated source
+code can contain vulnerabilities and other quality issues. Despite
+researchers' and practitioners' attempts to enhance code generation models,
+retraining and fine-tuning large language models is time-consuming and
+resource-intensive. Thus, we describe FRANC, a lightweight framework for
+recommending more secure and high-quality source code derived from
+transformer-based code generation models. FRANC includes a static filter that
+uses heuristics to make the generated code compilable, and a quality-aware
+ranker that sorts the code snippets based on a quality score. Moreover, the
+framework uses prompt engineering to fix persistent quality issues. We
+evaluated the framework with five Python and Java code generation models and
+six prompt datasets, including a newly created one in this work (SOEval). The
+static filter improves the compilability of 9% to 46% of Java suggestions and
+10% to 43% of Python suggestions. The average improvement in the NDCG@10 score
+for the ranking system is 0.0763, and the repair techniques fix up to 80% of
+prompts. FRANC takes, on average, 1.98 seconds for Java and 0.08 seconds for
+Python. + +
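+
+ The abstract does not spell out FRANC's heuristics, but the core idea of a
+ static compilability filter can be sketched for Python generations with the
+ built-in compile() check; this is a stand-in for illustration, not the paper's
+ implementation:
+
+ ```python
+ def passes_static_filter(snippet: str) -> bool:
+     """Keep only generated Python snippets that at least parse; a real filter
+     like FRANC's additionally applies repair heuristics before re-checking."""
+     try:
+         compile(snippet, "<generated>", "exec")
+         return True
+     except SyntaxError:
+         return False
+
+ candidates = ["def add(a, b):\n    return a + b\n", "def broken(:\n    pass\n"]
+ compilable = [c for c in candidates if passes_static_filter(c)]  # keeps the first
+ ```
+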
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Forward Laplacian: A New Computational Framework for Neural + Network-based Variational Monte Carlo + + +
+ Neural network-based variational Monte Carlo (NN-VMC) has emerged as a +promising cutting-edge technique of ab initio quantum chemistry. However, the +high computational cost of existing approaches hinders their applications in +realistic chemistry problems. Here, we report the development of a new NN-VMC +method that achieves a remarkable speed-up by more than one order of magnitude, +thereby greatly extending the applicability of NN-VMC to larger systems. Our +key design is a novel computational framework named Forward Laplacian, which +computes the Laplacian associated with neural networks, the bottleneck of +NN-VMC, through an efficient forward propagation process. We then demonstrate +that Forward Laplacian is not only versatile but also facilitates more +developments of acceleration methods across various aspects, including +optimization for sparse derivative matrix and efficient neural network design. +Empirically, our approach enables NN-VMC to investigate a broader range of +atoms, molecules and chemical reactions for the first time, providing valuable +references to other ab initio methods. The results demonstrate a great +potential in applying deep learning methods to solve general quantum mechanical +problems. + +
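+
+ As a rough illustration of computing a network's Laplacian in a single forward
+ pass, one can propagate the value, the input-gradient, and the Laplacian of
+ every intermediate quantity layer by layer. The sketch below does this for a
+ toy tanh MLP with NumPy; it only illustrates the forward-propagation idea and
+ is not the paper's optimized Forward Laplacian framework:
+
+ ```python
+ import numpy as np
+
+ def forward_laplacian_mlp(x, weights, biases):
+     """Propagate (value, gradient, Laplacian) w.r.t. the input x through a
+     tanh MLP in one forward pass. Illustrative only; every layer is assumed
+     to apply a tanh activation and shapes are assumed compatible."""
+     d = x.size
+     h = x.astype(float)          # current activations
+     G = np.eye(d)                # rows: d h_j / d x
+     L = np.zeros(d)              # rows: sum_i d^2 h_j / d x_i^2
+     for W, b in zip(weights, biases):
+         # affine map: differentiation commutes with the linear transform
+         h, G, L = W @ h + b, W @ G, W @ L
+         s = np.tanh(h)
+         ds = 1.0 - s ** 2        # tanh'
+         dds = -2.0 * s * ds      # tanh''
+         # chain rule for the Laplacian uses the pre-activation gradient G
+         L = dds * np.sum(G ** 2, axis=1) + ds * L
+         G = ds[:, None] * G
+         h = s
+     return h, G, L               # for a scalar output, L[0] is the Laplacian
+ ```
+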
+
+
+
+
+ + ☆ Towards Stealthy Backdoor Attacks against Speech Recognition via + Elements of Sound + + +
+ Deep neural networks (DNNs) have been widely and successfully adopted and
+deployed in various applications of speech recognition. Recently, a few works
+revealed that these models are vulnerable to backdoor attacks, where the
+adversaries can implant malicious prediction behaviors into victim models by
+poisoning their training process. In this paper, we revisit poison-only
+backdoor attacks against speech recognition. We reveal that existing methods
+are not stealthy since their trigger patterns are perceptible to humans or
+machine detection. This limitation is mostly because their trigger patterns
+are simple noises or separable and distinctive clips. Motivated by these
+findings, we propose to exploit elements of sound ($e.g.$, pitch and timbre)
+to design more stealthy yet effective poison-only backdoor attacks.
+Specifically, we insert a short-duration high-pitched signal as the trigger
+and increase the pitch of the remaining audio clips to `mask' it, yielding
+stealthy pitch-based triggers. We manipulate the timbre features of the victim
+audio to design a stealthy timbre-based attack, and design a voiceprint
+selection module to facilitate the multi-backdoor attack. Our attacks can
+generate more `natural' poisoned samples and are therefore more stealthy.
+Extensive experiments are conducted on benchmark datasets, verifying the
+effectiveness of our attacks under different settings ($e.g.$, all-to-one,
+all-to-all, clean-label, physical, and multi-backdoor settings) and their
+stealthiness. The code for reproducing the main experiments is available at
+\url{https://github.com/HanboCai/BadSpeech_SoE}. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ A Quantum Convolutional Neural Network Approach for Object Detection and + Classification + + +
+ This paper presents a comprehensive evaluation of the potential of Quantum +Convolutional Neural Networks (QCNNs) in comparison to classical Convolutional +Neural Networks (CNNs) and Artificial / Classical Neural Network (ANN) models. +With the increasing amount of data, utilizing computing methods like CNN in +real-time has become challenging. QCNNs overcome this challenge by utilizing +qubits to represent data in a quantum environment and applying CNN structures +to quantum computers. The time and accuracy of QCNNs are compared with +classical CNNs and ANN models under different conditions such as batch size and +input size. The maximum complexity level that QCNNs can handle in terms of +these parameters is also investigated. The analysis shows that QCNNs have the +potential to outperform both classical CNNs and ANN models in terms of accuracy +and efficiency for certain applications, demonstrating their promise as a +powerful tool in the field of machine learning. + +
+
+
+
+
+ + ☆ HOPE: High-order Polynomial Expansion of Black-box Neural Networks + + +
+ Despite their remarkable performance, deep neural networks remain largely
+``black boxes'', which limits their interpretability and hinders their wide
+application in fields that require rational decision-making. Here we introduce
+HOPE (High-order Polynomial Expansion), a method for expanding a network into
+a high-order Taylor polynomial on a reference input. Specifically, we derive
+the high-order derivative rule for composite functions and extend the rule to
+neural networks to obtain their high-order derivatives quickly and accurately.
+From these derivatives, we can then derive the Taylor polynomial of the neural
+network, which provides an explicit expression of the network's local
+interpretations. Numerical analysis confirms the high accuracy, low
+computational complexity, and good convergence of the proposed method.
+Moreover, we demonstrate HOPE's wide applicability in deep learning, including
+function discovery, fast inference, and feature selection. The code is
+available at https://github.com/HarryPotterXTX/HOPE.git. + +
+
+
+
+
+ + ☆ Mini-Giants: "Small" Language Models and Open Source Win-Win + + +
+ ChatGPT is phenomenal. However, it is prohibitively expensive to train and
+refine such giant models. Fortunately, small language models are flourishing
+and becoming more and more competent. We call them "mini-giants". We argue
+that open-source communities such as Kaggle and these mini-giants can achieve
+a win-win in many ways: technically, ethically, and socially. In this article,
+we present a brief yet rich background, discuss how to attain small language
+models, present a comparative study of small language models together with a
+brief discussion of evaluation methods, discuss the real-world application
+scenarios where small language models are most needed, and conclude with a
+discussion and outlook. + +
+
+ comment: 16 pages, 1 figure +
+
+
+
+
+ + ☆ An Empirical Investigation of Pre-trained Model Selection for + Out-of-Distribution Generalization and Calibration + + +
+ In the realm of out-of-distribution generalization tasks, finetuning has
+risen as a key strategy. While most of the focus has been on optimizing
+learning algorithms, our research highlights the influence of pre-trained
+model selection during finetuning on out-of-distribution performance and
+inference uncertainty. Within the model-size constraints of a single GPU, we
+examined the impact of varying pre-trained datasets and model parameters on
+performance metrics such as accuracy and expected calibration error. Our
+findings underscore the significant influence of pre-trained model selection,
+showing marked performance improvements over algorithm choice. Larger models
+outperformed others, though the balance between memorization and true
+generalization merits further investigation. Ultimately, our research
+emphasizes the importance of pre-trained model selection for enhancing
+out-of-distribution generalization. + +
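+
+ One of the metrics mentioned above, expected calibration error, is usually
+ computed with an equal-width binning scheme; a minimal sketch (15 bins is a
+ common default, the paper's exact protocol is not specified here):
+
+ ```python
+ import numpy as np
+
+ def expected_calibration_error(confidences, correct, n_bins=15):
+     """Binned ECE: `confidences` are max-softmax probabilities and `correct`
+     is a boolean array marking whether the top-1 prediction was right."""
+     bins = np.linspace(0.0, 1.0, n_bins + 1)
+     ece = 0.0
+     for lo, hi in zip(bins[:-1], bins[1:]):
+         mask = (confidences > lo) & (confidences <= hi)
+         if mask.any():
+             acc = correct[mask].mean()      # accuracy inside the bin
+             conf = confidences[mask].mean() # average confidence inside the bin
+             ece += mask.mean() * abs(acc - conf)
+     return ece
+ ```
+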
+
+
+
+
+ + ☆ Multi-Objective Optimization of Performance and Interpretability of + Tabular Supervised Machine Learning Models GECCO 2023 + + +
+ We present a model-agnostic framework for jointly optimizing the predictive +performance and interpretability of supervised machine learning models for +tabular data. Interpretability is quantified via three measures: feature +sparsity, interaction sparsity of features, and sparsity of non-monotone +feature effects. By treating hyperparameter optimization of a machine learning +algorithm as a multi-objective optimization problem, our framework allows for +generating diverse models that trade off high performance and ease of +interpretability in a single optimization run. Efficient optimization is +achieved via augmentation of the search space of the learning algorithm by +incorporating feature selection, interaction and monotonicity constraints into +the hyperparameter search space. We demonstrate that the optimization problem +effectively translates to finding the Pareto optimal set of groups of selected +features that are allowed to interact in a model, along with finding their +optimal monotonicity constraints and optimal hyperparameters of the learning +algorithm itself. We then introduce a novel evolutionary algorithm that can +operate efficiently on this augmented search space. In benchmark experiments, +we show that our framework is capable of finding diverse models that are highly +competitive or outperform state-of-the-art XGBoost or Explainable Boosting +Machine models, both with respect to performance and interpretability. + +
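+
+ The framework returns a set of models trading off performance against
+ interpretability; extracting the Pareto-optimal subset from the results of such
+ a run can be sketched as follows (both objectives are assumed to be maximized;
+ the example values are illustrative, not from the paper):
+
+ ```python
+ import numpy as np
+
+ def pareto_front(scores):
+     """Indices of non-dominated rows in `scores` (n_models x n_objectives),
+     where every objective is to be maximized."""
+     scores = np.asarray(scores, dtype=float)
+     front = []
+     for i, s in enumerate(scores):
+         dominates_s = np.all(scores >= s, axis=1) & np.any(scores > s, axis=1)
+         if not dominates_s.any():
+             front.append(i)
+     return front
+
+ # e.g., columns: (ROC AUC, negative number of selected features)
+ models = [(0.91, -12), (0.88, -5), (0.90, -12), (0.85, -20)]
+ print(pareto_front(models))   # -> [0, 1]
+ ```
+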
+
+ comment: Extended version of the paper accepted at GECCO 2023. 16 pages, 7 + tables, 7 figures +
+
+
+
+
+ + ☆ Basal-Bolus Advisor for Type 1 Diabetes (T1D) Patients Using Multi-Agent + Reinforcement Learning (RL) Methodology + + +
+ This paper presents a novel multi-agent reinforcement learning (RL) approach +for personalized glucose control in individuals with type 1 diabetes (T1D). The +method employs a closed-loop system consisting of a blood glucose (BG) +metabolic model and a multi-agent soft actor-critic RL model acting as the +basal-bolus advisor. Performance evaluation is conducted in three scenarios, +comparing the RL agents to conventional therapy. Evaluation metrics include +glucose levels (minimum, maximum, and mean), time spent in different BG ranges, +and average daily bolus and basal insulin dosages. Results demonstrate that the +RL-based basal-bolus advisor significantly improves glucose control, reducing +glycemic variability and increasing time spent within the target range (70-180 +mg/dL). Hypoglycemia events are effectively prevented, and severe hyperglycemia +events are reduced. The RL approach also leads to a statistically significant +reduction in average daily basal insulin dosage compared to conventional +therapy. These findings highlight the effectiveness of the multi-agent RL +approach in achieving better glucose control and mitigating the risk of severe +hyperglycemia in individuals with T1D. + +
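+
+ The evaluation metrics listed above (time in range, extrema, mean) are
+ straightforward to compute from an evenly sampled blood-glucose trace; a small
+ helper using the 70-180 mg/dL target range from the abstract (a sketch, not the
+ paper's evaluation code):
+
+ ```python
+ import numpy as np
+
+ def glucose_metrics(bg, low=70, high=180):
+     """Summary metrics over a blood-glucose trace in mg/dL (evenly sampled)."""
+     bg = np.asarray(bg, dtype=float)
+     return {
+         "mean": bg.mean(),
+         "min": bg.min(),
+         "max": bg.max(),
+         "time_in_range": np.mean((bg >= low) & (bg <= high)),
+         "time_hypo": np.mean(bg < low),
+         "time_hyper": np.mean(bg > high),
+     }
+ ```
+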
+
+ comment: 8 pages, 2 figures, 1 Table +
+
+
+
+
+ + ☆ Evaluating unsupervised disentangled representation learning for genomic + discovery and disease risk prediction ICML + + +
+ High-dimensional clinical data have become invaluable resources for genetic +studies, due to their accessibility in biobank-scale datasets and the +development of high performance modeling techniques especially using deep +learning. Recent work has shown that low dimensional embeddings of these +clinical data learned by variational autoencoders (VAE) can be used for +genome-wide association studies and polygenic risk prediction. In this work, we +consider multiple unsupervised learning methods for learning disentangled +representations, namely autoencoders, VAE, beta-VAE, and FactorVAE, in the +context of genetic association studies. Using spirograms from UK Biobank as a +running example, we observed improvements in the number of genome-wide +significant loci, heritability, and performance of polygenic risk scores for +asthma and chronic obstructive pulmonary disease by using FactorVAE or +beta-VAE, compared to standard VAE or non-variational autoencoders. FactorVAEs +performed effectively across multiple values of the regularization +hyperparameter, while beta-VAEs were much more sensitive to the hyperparameter +values. + +
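+
+ The beta-VAE objective referenced above differs from a standard VAE only in the
+ weight placed on the KL term; a minimal per-sample version with a Gaussian
+ (MSE) decoder and a diagonal-Gaussian encoder (the beta value below is
+ illustrative, not the paper's setting):
+
+ ```python
+ import numpy as np
+
+ def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
+     """Reconstruction term plus beta-weighted KL(N(mu, diag(exp(logvar))) || N(0, I));
+     beta = 1 recovers the standard VAE objective."""
+     recon = np.sum((x - x_recon) ** 2, axis=-1)
+     kl = 0.5 * np.sum(np.exp(logvar) + mu ** 2 - 1.0 - logvar, axis=-1)
+     return recon + beta * kl
+ ```
+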
+
+ comment: Accepted to the 2023 ICML Workshop on Computational Biology. + Honolulu, Hawaii, USA, 2023 +
+
+
+
+
+ + ☆ The Predicted-Deletion Dynamic Model: Taking Advantage of ML + Predictions, for Free + + +
+ The main bottleneck in designing efficient dynamic algorithms is the unknown
+nature of the update sequence. In particular, there are some problems, like
+3-vertex connectivity, planar digraph all pairs shortest paths, and others,
+where the separation in runtime between the best partially dynamic solutions
+and the best fully dynamic solutions is polynomial, sometimes even exponential.
+ In this paper, we formulate the predicted-deletion dynamic model, motivated
+by a recent line of empirical work about predicting edge updates in dynamic
+graphs. In this model, edges are inserted and deleted online, and when an edge
+is inserted, it is accompanied by a "prediction" of its deletion time. This
+models real-world settings where services may have access to historical data
+or other information about an input and can subsequently use such information
+to make predictions about user behavior. The model is also of theoretical
+interest, as it interpolates between the partially dynamic and fully dynamic
+settings, and provides a natural extension of the algorithms-with-predictions
+paradigm to the dynamic setting.
+ We give a novel framework for this model that "lifts" partially dynamic
+algorithms into the fully dynamic setting with little overhead. We use our
+framework to obtain improved efficiency bounds over the state-of-the-art
+dynamic algorithms for a variety of problems. In particular, we design
+algorithms whose amortized update time scales with that of a partially dynamic
+algorithm, with high probability, when the predictions are of high quality. On
+the flip side, our algorithms do no worse than existing fully dynamic
+algorithms when the predictions are of low quality. Furthermore, our
+algorithms exhibit a graceful trade-off between the two cases. Thus, we are
+able to take advantage of ML predictions asymptotically "for free.'' + +
+
+
+
+
+ + ☆ Examining the Effects of Degree Distribution and Homophily in Graph + Learning Models KDD 2023 + + +
+ Despite a surge in interest in GNN development, homogeneity in benchmarking
+datasets still presents a fundamental issue to GNN research. GraphWorld is a
+recent solution which uses the Stochastic Block Model (SBM) to generate
+diverse populations of synthetic graphs for benchmarking any GNN task. Despite
+its success, the SBM imposed fundamental limitations on the kinds of graph
+structure GraphWorld could create.
+ In this work we examine how two additional synthetic graph generators can
+improve GraphWorld's evaluation: LFR, a well-established model in the graph
+clustering literature, and CABAM, a recent adaptation of the Barabasi-Albert
+model tailored for GNN benchmarking. By integrating these generators, we
+significantly expand the coverage of graph space within the GraphWorld
+framework while preserving key graph properties observed in real-world
+networks. To demonstrate their effectiveness, we generate 300,000 graphs to
+benchmark 11 GNN models on a node classification task. We find that GNN
+performance varies with homophily, degree distribution, and feature signal.
+Based on these findings, we classify models by their sensitivity to the new
+generators under these properties. Additionally, we release the extensions
+made to GraphWorld on the GitHub repository, enabling further evaluation of
+GNN performance on new graphs. + +
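+
+ For readers unfamiliar with the baseline generator, GraphWorld-style SBM
+ populations can be mimicked with networkx; the block below draws one
+ homophilous two-community graph (the parameter values are illustrative, and the
+ LFR and CABAM generators discussed in the paper are not shown):
+
+ ```python
+ import networkx as nx
+
+ # Two equal communities; heavier within-community than cross-community edges
+ sizes = [150, 150]
+ probs = [[0.08, 0.01],
+          [0.01, 0.08]]
+ G = nx.stochastic_block_model(sizes, probs, seed=0)
+ print(G.number_of_nodes(), G.number_of_edges())
+ ```
+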
+
+ comment: Accepted to Workshop on Graph Learning Benchmarks at KDD 2023 +
+
+
+
+
+ + ☆ Modular Neural Network Approaches for Surgical Image Recognition + + +
+ Deep learning-based applications have seen a lot of success in recent years.
+Text, audio, image, and video have all been explored with great success using
+deep learning approaches. The use of convolutional neural networks (CNN) in
+computer vision, in particular, has yielded reliable results. In order to
+achieve these results, a large amount of data is required. However, such
+datasets are not always accessible, and annotating data can be difficult and
+time-consuming. Self-training is a semi-supervised approach that manages to
+alleviate this problem and achieve state-of-the-art performance. Theoretical
+analysis has even shown that it may result in better generalization than a
+standard classifier. Another challenge neural networks face is the increasing
+complexity of modern problems, which requires high computational and storage
+costs. One way to mitigate this issue is modular learning, a strategy inspired
+by human cognition. The principle of the approach is to decompose a complex
+problem into simpler sub-tasks. This approach has several advantages,
+including faster learning, better generalization, and improved
+interpretability.
+ In the first part of this paper, we introduce and evaluate different
+architectures of modular learning for Dorsal Capsulo-Scapholunate Septum
+(DCSS) instability classification. Our experiments show that modular learning
+improves performance compared to non-modular systems. Moreover, we found that
+the weighted modular variant, which weights each module's output by the
+probabilities from the gating module, achieved almost perfect classification.
+In the second part, we present our approach for data labeling and segmentation
+with self-training applied to shoulder arthroscopy images. + +
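+
+ The weighted modular combination described above can be sketched as follows:
+ each sub-network's class probabilities are weighted by the gating module's
+ probability for that sub-task. The shapes and the final argmax decision are
+ assumptions of this sketch, not details taken from the paper:
+
+ ```python
+ import numpy as np
+
+ def weighted_modular_predict(module_probs, gate_probs):
+     """module_probs: (n_modules, n_classes) class probabilities per module;
+     gate_probs: (n_modules,) gating probabilities. Returns the predicted class
+     from the gate-weighted mixture of module outputs."""
+     mixture = (np.asarray(gate_probs)[:, None] * np.asarray(module_probs)).sum(axis=0)
+     return int(mixture.argmax())
+ ```
+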
+
+
+
+
+ + ☆ Disentangling Node Attributes from Graph Topology for Improved + Generalizability in Link Prediction + + +
+ Link prediction is a crucial task in graph machine learning with diverse +applications. We explore the interplay between node attributes and graph +topology and demonstrate that incorporating pre-trained node attributes +improves the generalization power of link prediction models. Our proposed +method, UPNA (Unsupervised Pre-training of Node Attributes), solves the +inductive link prediction problem by learning a function that takes a pair of +node attributes and predicts the probability of an edge, as opposed to Graph +Neural Networks (GNN), which can be prone to topological shortcuts in graphs +with power-law degree distribution. In this manner, UPNA learns a significant +part of the latent graph generation mechanism since the learned function can be +used to add incoming nodes to a growing graph. By leveraging pre-trained node +attributes, we overcome observational bias and make meaningful predictions +about unobserved nodes, surpassing state-of-the-art performance (3X to 34X +improvement on benchmark datasets). UPNA can be applied to various pairwise +learning tasks and integrated with existing link prediction models to enhance +their generalizability and bolster graph generative models. + +
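+
+ UPNA's central object is a function that maps a pair of node attributes to an
+ edge probability. A simplified stand-in (logistic regression on an
+ order-invariant pair representation, instead of the paper's learned network)
+ already shows why unseen, incoming nodes can be scored from their attributes
+ alone:
+
+ ```python
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ def pair_features(x_u, x_v):
+     # symmetric representation so the score does not depend on node order
+     return np.concatenate([x_u + x_v, np.abs(x_u - x_v)])
+
+ def fit_edge_scorer(X, pos_pairs, neg_pairs):
+     """X: (n_nodes, d) pre-trained node attributes; *_pairs: iterables of (u, v)."""
+     pairs = list(pos_pairs) + list(neg_pairs)
+     feats = np.array([pair_features(X[u], X[v]) for u, v in pairs])
+     labels = np.r_[np.ones(len(pos_pairs)), np.zeros(len(neg_pairs))]
+     return LogisticRegression(max_iter=1000).fit(feats, labels)
+ ```
+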
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ Natural Actor-Critic for Robust Reinforcement Learning with Function + Approximation + + +
+ We study robust reinforcement learning (RL) with the goal of determining a +well-performing policy that is robust against model mismatch between the +training simulator and the testing environment. Previous policy-based robust RL +algorithms mainly focus on the tabular setting under uncertainty sets that +facilitate robust policy evaluation, but are no longer tractable when the +number of states scales up. To this end, we propose two novel uncertainty set +formulations, one based on double sampling and the other on an integral +probability metric. Both make large-scale robust RL tractable even when one +only has access to a simulator. We propose a robust natural actor-critic (RNAC) +approach that incorporates the new uncertainty sets and employs function +approximation. We provide finite-time convergence guarantees for the proposed +RNAC algorithm to the optimal robust policy within the function approximation +error. Finally, we demonstrate the robust performance of the policy learned by +our proposed RNAC approach in multiple MuJoCo environments and a real-world +TurtleBot navigation task. + +
+
+
+
+
+ + ☆ Latent Space Representations of Neural Algorithmic Reasoners ICML 2023 + + +
+ Neural Algorithmic Reasoning (NAR) is a research area focused on designing +neural architectures that can reliably capture classical computation, usually +by learning to execute algorithms. A typical approach is to rely on Graph +Neural Network (GNN) architectures, which encode inputs in high-dimensional +latent spaces that are repeatedly transformed during the execution of the +algorithm. In this work we perform a detailed analysis of the structure of the +latent space induced by the GNN when executing algorithms. We identify two +possible failure modes: (i) loss of resolution, making it hard to distinguish +similar values; (ii) inability to deal with values outside the range observed +during training. We propose to solve the first issue by relying on a softmax +aggregator, and propose to decay the latent space in order to deal with +out-of-range values. We show that these changes lead to improvements on the +majority of algorithms in the standard CLRS-30 benchmark when using the +state-of-the-art Triplet-GMPNN processor. Our code is available at +\href{https://github.com/mirjanic/nar-latent-spaces}{https://github.com/mirjanic/nar-latent-spaces}. + +
+
+ comment: 18 pages, 17 figures, accepted at KLR Workshop at ICML 2023 +
+
+
+
+
+ + ☆ An Alternative to Variance: Gini Deviation for Risk-averse Policy + Gradient + + +
+ Restricting the variance of a policy's return is a popular choice in +risk-averse Reinforcement Learning (RL) due to its clear mathematical +definition and easy interpretability. Traditional methods directly restrict the +total return variance. Recent methods restrict the per-step reward variance as +a proxy. We thoroughly examine the limitations of these variance-based methods, +such as sensitivity to numerical scale and hindering of policy learning, and +propose to use an alternative risk measure, Gini deviation, as a substitute. We +study various properties of this new risk measure and derive a policy gradient +algorithm to minimize it. Empirical evaluation in domains where risk-aversion +can be clearly defined, shows that our algorithm can mitigate the limitations +of variance-based risk measures and achieves high return with low risk in terms +of variance and Gini deviation when others fail to learn a reasonable policy. + +
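+
+ For reference, the Gini deviation of a sample of returns can be computed in
+ closed form from the sorted sample. The normalization below (half the Gini mean
+ difference, i.e., 0.5 * E|X1 - X2|) is one common convention and may differ by
+ a constant factor from the definition used in the paper:
+
+ ```python
+ import numpy as np
+
+ def gini_deviation(returns):
+     """Gini deviation as 0.5 * E|X1 - X2| under the empirical distribution,
+     using the sorted-sample identity for the mean absolute difference."""
+     x = np.sort(np.asarray(returns, dtype=float))
+     n = x.size
+     ranks = np.arange(1, n + 1)
+     mean_abs_diff = 2.0 * np.sum((2 * ranks - n - 1) * x) / (n * n)
+     return 0.5 * mean_abs_diff
+ ```
+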
+
+
+
+
+ + ☆ Meta-Value Learning: a General Framework for Learning with Learning + Awareness NeurIPS 2023 + + +
+ Gradient-based learning in multi-agent systems is difficult because the +gradient derives from a first-order model which does not account for the +interaction between agents' learning processes. LOLA (arXiv:1709.04326) +accounts for this by differentiating through one step of optimization. We +extend the ideas of LOLA and develop a fully-general value-based approach to +optimization. At the core is a function we call the meta-value, which at each +point in joint-policy space gives for each agent a discounted sum of its +objective over future optimization steps. We argue that the gradient of the +meta-value gives a more reliable improvement direction than the gradient of the +original objective, because the meta-value derives from empirical observations +of the effects of optimization. We show how the meta-value can be approximated +by training a neural network to minimize TD error along optimization +trajectories in which agents follow the gradient of the meta-value. We analyze +the behavior of our method on the Logistic Game and on the Iterated Prisoner's +Dilemma. + +
+
+ comment: Submitted to NeurIPS 2023 +
+
+
+
+
+ + ☆ Curriculum Learning for Graph Neural Networks: A Multiview + Competence-based Approach ACL 2023 + + +
+ A curriculum is a planned sequence of learning materials and an effective one +can make learning efficient and effective for both humans and machines. Recent +studies developed effective data-driven curriculum learning approaches for +training graph neural networks in language applications. However, existing +curriculum learning approaches often employ a single criterion of difficulty in +their training paradigms. In this paper, we propose a new perspective on +curriculum learning by introducing a novel approach that builds on graph +complexity formalisms (as difficulty criteria) and model competence during +training. The model consists of a scheduling scheme which derives effective +curricula by accounting for different views of sample difficulty and model +competence during training. The proposed solution advances existing research in +curriculum learning for graph neural networks with the ability to incorporate a +fine-grained spectrum of graph difficulty criteria in their training paradigms. +Experimental results on real-world link prediction and node classification +tasks illustrate the effectiveness of the proposed approach. + +
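+
+ Competence-based scheduling of this kind can be illustrated with a single
+ difficulty criterion and a square-root competence function in the style of
+ Platanios et al.; the paper's multiview scheduler combines several
+ graph-complexity criteria, so treat this only as a sketch (batch size and c0
+ are arbitrary choices):
+
+ ```python
+ import numpy as np
+
+ def competence(t, T, c0=0.01):
+     """Square-root competence schedule: fraction of the difficulty-sorted data
+     the model is allowed to see at training step t out of T."""
+     return min(1.0, np.sqrt(t * (1.0 - c0 ** 2) / T + c0 ** 2))
+
+ def curriculum_batch_indices(difficulty, t, T, rng, batch_size=32):
+     """Sample only from examples whose difficulty is below the current
+     competence quantile."""
+     cutoff = np.quantile(difficulty, competence(t, T))
+     eligible = np.where(difficulty <= cutoff)[0]
+     return rng.choice(eligible, size=min(batch_size, eligible.size), replace=False)
+ ```
+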
+
+ comment: ACL 2023 +
+
+
+
+
+ + ☆ An Admissible Shift-Consistent Method for Recommender Systems + + +
+ In this paper, we propose a new constraint, called shift-consistency, for +solving matrix/tensor completion problems in the context of recommender +systems. Our method provably guarantees several key mathematical properties: +(1) satisfies a recently established admissibility criterion for recommender +systems; (2) satisfies a definition of fairness that eliminates a specific +class of potential opportunities for users to maliciously influence system +recommendations; and (3) offers robustness by exploiting provable uniqueness of +missing-value imputation. We provide a rigorous mathematical description of the +method, including its generalization from matrix to tensor form to permit +representation and exploitation of complex structural relationships among sets +of user and product attributes. We argue that our analysis suggests a +structured means for defining latent-space projections that can permit provable +performance properties to be established for machine learning methods. + +
+
+
+
+
+ + ☆ Autoregressive Diffusion Model for Graph Generation + + +
+ Diffusion-based graph generative models have recently obtained promising
+results for graph generation. However, existing diffusion-based graph
+generative models are mostly one-shot generative models that apply Gaussian
+diffusion in the dequantized adjacency matrix space. Such a strategy can
+suffer from difficulty in model training, slow sampling speed, and
+incapability of incorporating constraints. We propose an \emph{autoregressive
+diffusion} model for graph generation. Unlike existing methods, we define a
+node-absorbing diffusion process that operates directly in the discrete graph
+space. For forward diffusion, we design a \emph{diffusion ordering network},
+which learns a data-dependent node absorbing ordering from graph topology. For
+reverse generation, we design a \emph{denoising network} that uses the reverse
+node ordering to efficiently reconstruct the graph one node at a time,
+predicting the type of each new node and its edges to previously denoised
+nodes. Based on the permutation invariance of graphs, we show that the two
+networks can be jointly trained by optimizing a simple lower bound of the data
+likelihood. Our experiments on six diverse generic graph datasets and two
+molecule datasets show that our model achieves better or comparable generation
+performance relative to the previous state of the art, while enjoying fast
+generation speed. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Privacy-preserving patient clustering for personalized federated + learning + + +
+ Federated Learning (FL) is a machine learning framework that enables multiple
+organizations to train a model without sharing their data with a central
+server. However, it experiences significant performance degradation if the
+data is not independently and identically distributed (non-IID). This is a
+problem in medical settings, where variations in the patient population
+contribute significantly to distribution differences across hospitals.
+Personalized FL addresses this issue by accounting for site-specific
+distribution differences. Clustered FL, a Personalized FL variant, was used to
+address this problem by clustering patients into groups across hospitals and
+training separate models on each group. However, privacy remained a challenge,
+as the clustering process requires the exchange of patient-level information.
+This was previously addressed by forming clusters using aggregated data, which
+led to inaccurate groups and performance degradation. In this study, we
+propose Privacy-preserving Community-Based Federated machine Learning (PCBFL),
+a novel Clustered FL framework that can cluster patients using patient-level
+data while protecting privacy. PCBFL uses Secure Multiparty Computation, a
+cryptographic technique, to securely calculate patient-level similarity scores
+across hospitals. We then evaluate PCBFL by training a federated mortality
+prediction model using 20 sites from the eICU dataset. We compare the
+performance gain from PCBFL against traditional and existing Clustered FL
+frameworks. Our results show that PCBFL successfully forms clinically
+meaningful cohorts of low-, medium-, and high-risk patients. PCBFL outperforms
+traditional and existing Clustered FL frameworks with an average AUC
+improvement of 4.3% and an AUPRC improvement of 7.8%. + +
+
+
+
+
+ + ♻ ☆ Predicting Grokking Long Before it Happens: A look into the loss + landscape of models which grok + + +
+ This paper focuses on predicting the occurrence of grokking in neural
+networks, a phenomenon in which perfect generalization emerges long after
+signs of overfitting or memorization are observed. It has been reported that
+grokking can only be observed with certain hyper-parameters. This makes it
+critical to identify the parameters that lead to grokking. However, since
+grokking occurs after a large number of epochs, searching for the
+hyper-parameters that lead to it is time-consuming. In this paper, we propose
+a low-cost method to predict grokking without training for a large number of
+epochs. In essence, by studying the learning curve of the first few epochs, we
+show that one can predict whether grokking will occur later on. Specifically,
+if certain oscillations occur in the early epochs, one can expect grokking to
+occur if the model is trained for a much longer period of time. To detect the
+presence of such oscillations, we propose using the spectral signature of the
+learning curve, obtained by applying the Fourier transform and quantifying the
+amplitude of its low-frequency components. We also present additional
+experiments aimed at explaining the cause of these oscillations and
+characterizing the loss landscape. + +
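+
+ A minimal version of the proposed spectral check: detrend the early learning
+ curve, take its Fourier transform, and sum the amplitudes of the lowest nonzero
+ frequencies. The number of components k and any decision threshold are
+ assumptions of this sketch, not values from the paper:
+
+ ```python
+ import numpy as np
+
+ def low_frequency_amplitude(curve, k=5):
+     """Total amplitude of the k lowest nonzero-frequency components of a
+     mean-centered learning curve; large values indicate the slow oscillations
+     that the paper associates with later grokking."""
+     curve = np.asarray(curve, dtype=float)
+     curve = curve - curve.mean()            # remove the DC component
+     spectrum = np.abs(np.fft.rfft(curve))
+     return float(spectrum[1:1 + k].sum())
+ ```
+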
+
+ comment: 26 pages, 31 figures +
+
+
+
+
+ + ♻ ☆ Robust empirical risk minimization via Newton's method + + +
+ A new variant of Newton's method for empirical risk minimization is studied,
+where at each iteration of the optimization algorithm, the gradient and
+Hessian of the objective function are replaced by robust estimators taken from
+existing literature on robust mean estimation for multivariate data. After
+proving a general theorem about the convergence of successive iterates to a
+small ball around the population-level minimizer, consequences of the theory
+in generalized linear models are studied when data are generated from Huber's
+epsilon-contamination model and/or heavy-tailed distributions. An algorithm
+for obtaining robust Newton directions based on the conjugate gradient method
+is also proposed, which may be more appropriate for high-dimensional settings,
+and conjectures about the convergence of the resulting algorithm are offered.
+Compared to robust gradient descent, the proposed algorithm enjoys the faster
+rates of convergence for successive iterates often achieved by second-order
+algorithms for convex problems, i.e., quadratic convergence in a neighborhood
+of the optimum, with a stepsize that may be chosen adaptively via backtracking
+line search. + +
+
+
+
+
+ + ♻ ☆ A Two-Stage Active Learning Algorithm for $k$-Nearest Neighbors + + +
+ $k$-nearest neighbor classification is a popular non-parametric method +because of desirable properties like automatic adaption to distributional scale +changes. Unfortunately, it has thus far proved difficult to design active +learning strategies for the training of local voting-based classifiers that +naturally retain these desirable properties, and hence active learning +strategies for $k$-nearest neighbor classification have been conspicuously +missing from the literature. In this work, we introduce a simple and intuitive +active learning algorithm for the training of $k$-nearest neighbor classifiers, +the first in the literature which retains the concept of the $k$-nearest +neighbor vote at prediction time. We provide consistency guarantees for a +modified $k$-nearest neighbors classifier trained on samples acquired via our +scheme, and show that when the conditional probability function +$\mathbb{P}(Y=y|X=x)$ is sufficiently smooth and the Tsybakov noise condition +holds, our actively trained classifiers converge to the Bayes optimal +classifier at a faster asymptotic rate than passively trained $k$-nearest +neighbor classifiers. + +
+
+
+
+
+ + ♻ ☆ LLMs for Semi-Automated Data Science: Introducing CAAFE for + Context-Aware Automated Feature Engineering + + +
+ As the field of automated machine learning (AutoML) advances, it becomes
+increasingly important to incorporate domain knowledge into these systems. We
+present an approach for doing so by harnessing the power of large language
+models (LLMs). Specifically, we introduce Context-Aware Automated Feature
+Engineering (CAAFE), a feature engineering method for tabular datasets that
+utilizes an LLM to iteratively generate additional semantically meaningful
+features based on the description of the dataset. The method produces both
+Python code for creating new features and explanations for the utility of the
+generated features.
+ Despite being methodologically simple, CAAFE improves performance on 11 out
+of 14 datasets - boosting mean ROC AUC performance from 0.798 to 0.822 across
+all datasets - similar to the improvement achieved by using a random forest
+instead of logistic regression on our datasets.
+ Furthermore, CAAFE is interpretable, providing a textual explanation for
+each generated feature. CAAFE paves the way for more extensive semi-automation
+in data science tasks and emphasizes the significance of context-aware
+solutions that can extend the scope of AutoML systems to semantic AutoML. We
+release our $\href{https://github.com/automl/CAAFE}{code}$, a simple
+$\href{https://colab.research.google.com/drive/1mCA8xOAJZ4MaB_alZvyARTMjhl6RZf0a}{demo}$
+and a $\href{https://pypi.org/project/caafe/}{python\ package}$. + +
+
+
+
+
+ + ♻ ☆ Provably Faster Gradient Descent via Long Steps + + +
+ This work establishes provably faster convergence rates for gradient descent +via a computer-assisted analysis technique. Our theory allows nonconstant +stepsize policies with frequent long steps potentially violating descent by +analyzing the overall effect of many iterations at once rather than the typical +one-iteration inductions used in most first-order method analyses. We show that +long steps, which may increase the objective value in the short term, lead to +provably faster convergence in the long term. A conjecture towards proving a +faster $O(1/T\log T)$ rate for gradient descent is also motivated along with +simple numerical validation. + +
+
+ comment: 14pages plus references and appendix. Recent updates added more + references +
+
+
+
+
+ + ♻ ☆ Establishing a stronger baseline for lightweight contrastive models ICME 2023 + + +
+ Recent research has reported a performance degradation in self-supervised
+contrastive learning for specially designed efficient networks, such as
+MobileNet and EfficientNet. A common practice to address this problem is to
+introduce a pretrained contrastive teacher model and train the lightweight
+networks with distillation signals generated by the teacher. However, it is
+time- and resource-consuming to pretrain a teacher model when one is not
+available. In this work, we aim to establish a stronger baseline for
+lightweight contrastive models without using a pretrained teacher model.
+Specifically, we show that the optimal recipe for efficient models is
+different from that of larger models, and using the same training settings as
+ResNet50, as previous research does, is inappropriate. Additionally, we
+observe a common issue in contrastive learning where either the positive or
+negative views can be noisy, and propose a smoothed version of the InfoNCE
+loss to alleviate this problem. As a result, we successfully improve the
+linear evaluation results from 36.3\% to 62.3\% for MobileNet-V3-Large and
+from 42.2\% to 65.8\% for EfficientNet-B0 on ImageNet, closing the accuracy
+gap to ResNet50 with $5\times$ fewer parameters. We hope our research will
+facilitate the usage of lightweight contrastive models. + +
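+
+ A label-smoothed InfoNCE can be written as cross-entropy against a softened
+ target distribution over the in-batch candidates; this is an illustrative
+ smoothed variant under that interpretation, not necessarily the exact loss used
+ in the paper (temperature and eps are placeholder values):
+
+ ```python
+ import numpy as np
+
+ def smoothed_info_nce(sim, eps=0.1, temperature=0.2):
+     """sim: (B, B) similarity matrix between two views, sim[i, i] is the positive.
+     Cross-entropy against a smoothed target that keeps eps mass on the negatives."""
+     logits = sim / temperature
+     logits = logits - logits.max(axis=1, keepdims=True)   # numerical stability
+     logp = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
+     B = sim.shape[0]
+     targets = np.full((B, B), eps / (B - 1))
+     np.fill_diagonal(targets, 1.0 - eps)
+     return float(-(targets * logp).sum(axis=1).mean())
+ ```
+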
+
+ comment: ICME 2023 oral +
+
+
+
+
+ + ♻ ☆ Predicting Out-of-Domain Generalization with Neighborhood Invariance + + +
+ Developing and deploying machine learning models safely depends on the +ability to characterize and compare their abilities to generalize to new +environments. Although recent work has proposed a variety of methods that can +directly predict or theoretically bound the generalization capacity of a model, +they rely on strong assumptions such as matching train/test distributions and +access to model gradients. In order to characterize generalization when these +assumptions are not satisfied, we propose neighborhood invariance, a measure of +a classifier's output invariance in a local transformation neighborhood. +Specifically, we sample a set of transformations and given an input test point, +calculate the invariance as the largest fraction of transformed points +classified into the same class. Crucially, our measure is simple to calculate, +does not depend on the test point's true label, makes no assumptions about the +data distribution or model, and can be applied even in out-of-domain (OOD) +settings where existing methods cannot, requiring only selecting a set of +appropriate data transformations. In experiments on robustness benchmarks in +image classification, sentiment analysis, and natural language inference, we +demonstrate a strong and robust correlation between our neighborhood invariance +measure and actual OOD generalization on over 4,600 models evaluated on over +100 unique train/test domain pairs. + +
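+
+ The measure itself is simple to implement once a set of transformations has
+ been chosen; a sketch matching the description above, where the classifier and
+ the sampled transformations are whatever the practitioner supplies:
+
+ ```python
+ import numpy as np
+
+ def neighborhood_invariance(predict, x, transforms):
+     """Largest fraction of transformed copies of x assigned to a single class.
+     predict: maps a stacked batch of inputs to integer class labels;
+     transforms: callables sampled from the chosen transformation neighborhood."""
+     preds = predict(np.stack([t(x) for t in transforms]))
+     _, counts = np.unique(preds, return_counts=True)
+     return counts.max() / len(transforms)
+ ```
+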
+
+ comment: 38 pages, 5 figures, 28 tables +
+
+
+
+
+ + ♻ ☆ Knowledge Boosting: Rethinking Medical Contrastive Vision-Language + Pre-Training MICCAI 2023 + + +
+ The foundation models based on pre-training technology have significantly +advanced artificial intelligence from theoretical to practical applications. +These models have facilitated the feasibility of computer-aided diagnosis for +widespread use. Medical contrastive vision-language pre-training, which does +not require human annotations, is an effective approach for guiding +representation learning using description information in diagnostic reports. +However, the effectiveness of pre-training is limited by the large-scale +semantic overlap and shifting problems in medical field. To address these +issues, we propose the Knowledge-Boosting Contrastive Vision-Language +Pre-training framework (KoBo), which integrates clinical knowledge into the +learning of vision-language semantic consistency. The framework uses an +unbiased, open-set sample-wise knowledge representation to measure negative +sample noise and supplement the correspondence between vision-language mutual +information and clinical knowledge. Extensive experiments validate the effect +of our framework on eight tasks including classification, segmentation, +retrieval, and semantic relatedness, achieving comparable or better performance +with the zero-shot or few-shot settings. Our code is open on +https://github.com/ChenXiaoFei-CS/KoBo. + +
+
+ comment: accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ DataAssist: A Machine Learning Approach to Data Cleaning and Preparation + + +
+ Current automated machine learning (ML) tools are model-centric, focusing on
+model selection and parameter optimization. However, the majority of the time
+in data analysis is devoted to data cleaning and wrangling, for which limited
+tools are available. Here we present DataAssist, an automated data preparation
+and cleaning platform that enhances dataset quality using ML-informed methods.
+We show that DataAssist provides a pipeline for exploratory data analysis and
+data cleaning, including generating visualizations for user-selected
+variables, unifying data annotation, suggesting anomaly removal, and
+preprocessing data. The exported dataset can be readily integrated with other
+autoML tools or a user-specified model for downstream analysis. Our
+data-centric tool is applicable to a variety of fields, including economics,
+business, and forecasting, and saves over 50% of the time spent on data
+cleaning and preparation. + +
+
+
+
+
+ + ♻ ☆ Assessment of Reinforcement Learning Algorithms for Nuclear Power Plant + Fuel Optimization + + +
+ The nuclear fuel loading pattern optimization problem belongs to the class of
+large-scale combinatorial optimization. It is also characterized by multiple
+objectives and constraints, which makes it impossible to solve explicitly.
+Stochastic optimization methodologies, including Genetic Algorithms and
+Simulated Annealing, are used by different nuclear utilities and vendors, but
+hand-designed solutions continue to be the prevalent method in the industry.
+To improve the state of the art, Deep Reinforcement Learning (RL), in
+particular Proximal Policy Optimization, is leveraged. This work presents a
+first-of-a-kind approach to utilizing deep RL to solve the loading pattern
+problem, and it could be leveraged for any engineering design optimization.
+This paper is also, to our knowledge, the first to propose a study of the
+behavior of several hyper-parameters that influence the RL algorithm. The
+algorithm is highly dependent on multiple factors, such as the shape of the
+objective function derived for the core design, which behaves as a fudge
+factor affecting the stability of the learning. It also depends on an
+exploration/exploitation trade-off that manifests through different
+parameters, such as the number of loading patterns seen by the agents per
+episode, the number of samples collected before a policy update (nsteps), and
+an entropy factor (ent_coef) that increases the randomness of the policy
+during training. We found that RL must be applied similarly to a Gaussian
+Process in which the acquisition function is replaced by a parametrized
+policy. Then, once an initial set of hyper-parameters is found, reducing
+nsteps and ent_coef until no more learning is observed results in the highest
+sample efficiency, robustly and stably. This resulted in an economic benefit
+of 535,000 to 642,000 USD per year per plant. + +
+
+
+
+
+ + ♻ ☆ Optimal Preconditioning and Fisher Adaptive Langevin Sampling + + +
+ We define an optimal preconditioning for the Langevin diffusion by +analytically optimizing the expected squared jumped distance. This yields as +the optimal preconditioning an inverse Fisher information covariance matrix, +where the covariance matrix is computed as the outer product of log target +gradients averaged under the target. We apply this result to the Metropolis +adjusted Langevin algorithm (MALA) and derive a computationally efficient +adaptive MCMC scheme that learns the preconditioning from the history of +gradients produced as the algorithm runs. We show in several experiments that +the proposed algorithm is very robust in high dimensions and significantly +outperforms other methods, including a closely related adaptive MALA scheme +that learns the preconditioning with standard adaptive MCMC as well as the +position-dependent Riemannian manifold MALA sampler. + +
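+
+ The preconditioner described above is the inverse of the averaged outer product
+ of log-target gradients; estimated from gradients collected along the chain, it
+ looks roughly like this (the jitter term is an assumption added for numerical
+ stability, not part of the paper's derivation):
+
+ ```python
+ import numpy as np
+
+ def fisher_preconditioner(grads, jitter=1e-6):
+     """grads: (n_samples, d) log-target gradients evaluated at (approximate)
+     samples from the target. Returns the inverse empirical Fisher matrix."""
+     grads = np.asarray(grads, dtype=float)
+     fisher = grads.T @ grads / grads.shape[0]     # averaged outer product
+     return np.linalg.inv(fisher + jitter * np.eye(fisher.shape[0]))
+ ```
+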
+
+ comment: 21 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Learning to Reconstruct Signals From Binary Measurements + + +
+ Recent advances in unsupervised learning have highlighted the possibility of +learning to reconstruct signals from noisy and incomplete linear measurements +alone. These methods play a key role in medical and scientific imaging and +sensing, where ground truth data is often scarce or difficult to obtain. +However, in practice, measurements are not only noisy and incomplete but also +quantized. Here we explore the extreme case of learning from binary +observations and provide necessary and sufficient conditions on the number of +measurements required for identifying a set of signals from incomplete binary +data. Our results are complementary to existing bounds on signal recovery from +binary measurements. Furthermore, we introduce a novel self-supervised learning +approach, which we name SSBM, that only requires binary data for training. We +demonstrate in a series of experiments with real datasets that SSBM performs on +par with supervised learning and outperforms sparse reconstruction methods with +a fixed wavelet basis by a large margin. + +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks on SPD Manifolds for Motor Imagery Classification: + A Perspective from the Time-Frequency Analysis + + +
+ Motor imagery (MI) classification has been a prominent research topic in
+brain-computer interfaces based on electroencephalography (EEG). Over the past
+few decades, the performance of MI-EEG classifiers has gradually improved. In
+this study, we enhance the geometric deep learning classifier for MI-EEG
+classification from the perspective of time-frequency analysis, introducing a
+new architecture called Graph-CSPNet. We refer to this category of classifiers
+as geometric methods, emphasizing their rich background in differential
+geometry induced by signal covariance matrices. Graph-CSPNet utilizes novel
+SPD matrix-valued graph convolution techniques to capture the EEG features in
+the time-frequency domain, providing greater flexibility in signal
+segmentation and capturing localized fluctuations. To evaluate the
+effectiveness of Graph-CSPNet, we employ five commonly used, publicly
+available MI-EEG datasets, achieving near-optimal classification accuracies in
+nine out of eleven scenarios. The Python repository can be found at
+https://github.com/GeometricBCI/Tensor-CSPNet-and-Graph-CSPNet + +
+
+ comment: 15 pages, 5 figures, 6 Tables; This work has been submitted to the + IEEE for possible publication. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ FocusedCleaner: Sanitizing Poisoned Graphs for Robust GNN-based Node + Classification + + +
+ Graph Neural Networks (GNNs) are vulnerable to data poisoning attacks, which +will generate a poisoned graph as the input to the GNN models. We present +FocusedCleaner as a poisoned graph sanitizer to effectively identify the poison +injected by attackers. Specifically, FocusedCleaner provides a sanitation +framework consisting of two modules: bi-level structural learning and victim +node detection. In particular, the structural learning module will reverse the +attack process to steadily sanitize the graph while the detection module +provides ``the focus" -- a narrowed and more accurate search region -- to +structural learning. These two modules will operate in iterations and reinforce +each other to sanitize a poisoned graph step by step. As an important +application, we show that the adversarial robustness of GNNs trained over the +sanitized graph for the node classification task is significantly improved. +Extensive experiments demonstrate that FocusedCleaner outperforms the +state-of-the-art baselines both on poisoned graph sanitation and improving +robustness. + +
+
+
+
+
+ + ♻ ☆ kHGCN: Tree-likeness Modeling via Continuous and Discrete Curvature + Learning KDD 2023 + + +
+ Tree-like structures, encompassing hierarchical structures and power-law
+distributions, are prevalent in real-world applications, including
+recommendation systems, ecosystems, financial networks, social networks, etc.
+Recently, the exploitation of hyperbolic space for tree-likeness modeling has
+garnered considerable attention owing to its exponential growth volume.
+Compared to the flat Euclidean space, the curved hyperbolic space provides a
+more amenable and embeddable room, especially for datasets exhibiting implicit
+tree-like architectures. However, the intricate nature of real-world tree-like
+data presents a considerable challenge, as it frequently displays a
+heterogeneous composition of tree-like, flat, and circular regions. The direct
+embedding of such heterogeneous structures into a homogeneous embedding space
+(i.e., hyperbolic space) inevitably leads to heavy distortions. To mitigate
+this shortcoming, this study endeavors to explore the curvature between
+discrete structure and continuous learning space, aiming at encoding the
+message conveyed by the network topology in the learning process, thereby
+improving tree-likeness modeling. To this end, a curvature-aware hyperbolic
+graph convolutional neural network, $\kappa$HGCN, is proposed, which utilizes
+the curvature to guide message passing and improve long-range propagation.
+Extensive experiments on node classification and link prediction tasks verify
+the superiority of the proposal as it consistently outperforms various
+competitive models by a large margin. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness + + +
+ Generative AI models have recently achieved astonishing results in quality +and are consequently employed in a fast-growing number of applications. +However, since they are highly data-driven, relying on billion-sized datasets +randomly scraped from the internet, they also suffer from degenerated and +biased human behavior, as we demonstrate. In fact, they may even reinforce such +biases. To not only uncover but also combat these undesired effects, we present +a novel strategy, called Fair Diffusion, to attenuate biases after the +deployment of generative text-to-image models. Specifically, we demonstrate +shifting a bias, based on human instructions, in any direction yielding +arbitrarily new proportions for, e.g., identity groups. As our empirical +evaluation demonstrates, this introduced control enables instructing generative +image models on fairness, with no data filtering and additional training +required. + +
+
+
+
+
+ + ♻ ☆ Modulated Neural ODEs + + +
+ Neural ordinary differential equations (NODEs) have been proven useful for +learning non-linear dynamics of arbitrary trajectories. However, current NODE +methods capture variations across trajectories only via the initial state value +or by auto-regressive encoder updates. In this work, we introduce Modulated +Neural ODEs (MoNODEs), a novel framework that sets apart dynamics states from +underlying static factors of variation and improves the existing NODE methods. +In particular, we introduce $\textit{time-invariant modulator variables}$ that +are learned from the data. We incorporate our proposed framework into four +existing NODE variants. We test MoNODE on oscillating systems, videos and human +walking trajectories, where each trajectory has trajectory-specific modulation. +Our framework consistently improves the existing model ability to generalize to +new dynamic parameterizations and to perform far-horizon forecasting. In +addition, we verify that the proposed modulator variables are informative of +the true unknown factors of variation as measured by $R^2$ scores. + +
+
+
+
+
+ + ♻ ☆ Action-based Early Autism Diagnosis Using Contrastive Feature Learning + + +
+ Autism, also known as Autism Spectrum Disorder (ASD), is a neurological
+disorder. Its main symptoms include difficulty in (verbal and/or non-verbal)
+communication and rigid/repetitive behavior. These symptoms are often hard to
+distinguish from those of a typical (control) individual, which is why the
+disorder often remains undiagnosed in early childhood, leading to delayed
+treatment. Since children learn rapidly at an early age, an early diagnosis of
+autism could allow adequate interventions to be taken at the right time, which
+could positively affect the growth of an autistic child. Further, traditional
+methods of autism diagnosis require multiple visits to a specialized
+psychiatrist, and this process can be time-consuming. In this paper, we
+present a learning-based approach to automate autism diagnosis using simple
+and small action video clips of subjects. This task is particularly
+challenging because the amount of annotated data available is small, and the
+variations among samples from the two categories (ASD and control) are
+generally indistinguishable. This is also evident from the poor performance of
+a binary classifier learned using the cross-entropy loss on top of a baseline
+encoder. To address this, we adopt contrastive feature learning in both
+self-supervised and supervised learning frameworks, and show that these can
+lead to a significant increase in the prediction accuracy of a binary
+classifier on this task. We further validate this by conducting thorough
+experimental analyses under different set-ups on two publicly available
+datasets. + +
+
+ comment: This preprint has not undergone peer review (when applicable) or any + postsubmission improvements or corrections. The Version of Record of this + article is published in Multimedia Systems (2023), and is available online at + https://doi.org/10.1007/s00530-023-01132-8 +
+
+
+
+
+ + ♻ ☆ A policy gradient approach for Finite Horizon Constrained Markov + Decision Processes + + +
+ The infinite horizon setting is widely adopted for problems of reinforcement +learning (RL), and such problems invariably admit stationary optimal policies. +In many situations, finite horizon control problems are of interest, and for +such problems the optimal policies are time-varying in general. Another +setting that has become popular in recent times is that of Constrained Reinforcement +Learning, where the agent maximizes its rewards while it also aims to satisfy +some given constraint criteria. However, this setting has only been studied in +the context of infinite horizon MDPs, where stationary policies are optimal. We +present an algorithm for constrained RL in the finite horizon setting, where the +horizon terminates after a fixed (finite) time. We use function approximation +in our algorithm, which is essential when the state and action spaces are large +or continuous, and use the policy gradient method to find the optimal policy. +The optimal policy that we obtain depends on the stage and so is non-stationary +in general. To the best of our knowledge, our paper presents the first policy +gradient algorithm for the finite horizon setting with constraints. We show the +convergence of our algorithm to a constrained optimal policy. We also compare +and analyze the performance of our algorithm through experiments and show that +our algorithm performs better than some other well-known algorithms.
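    To make the primal-dual structure concrete, here is a toy tabular Lagrangian REINFORCE sketch with a separate softmax policy per stage (hence non-stationary). The paper's algorithm uses function approximation and its own step-size schedule; the random CMDP, budget, and learning rates below are assumptions for illustration only.

    # Toy sketch: Lagrangian REINFORCE on a random finite-horizon constrained MDP.
    import numpy as np

    rng = np.random.default_rng(0)
    S, A, H = 5, 3, 4                                  # states, actions, horizon
    P = rng.dirichlet(np.ones(S), size=(S, A))         # P[s, a] -> next-state distribution
    R = rng.random((S, A))                             # reward
    C = rng.random((S, A))                             # constraint cost
    budget = 0.5 * H                                   # constraint: E[sum of costs] <= budget

    theta = np.zeros((H, S, A))                        # stage-dependent policy parameters
    lam = 0.0                                          # Lagrange multiplier
    alpha, beta = 0.05, 0.01                           # primal / dual step sizes

    def softmax(x):
        e = np.exp(x - x.max())
        return e / e.sum()

    for it in range(2000):
        s, grads, ret, cost = 0, [], 0.0, 0.0
        for t in range(H):                             # roll out one episode
            pi = softmax(theta[t, s])
            a = rng.choice(A, p=pi)
            g = -pi
            g[a] += 1.0                                # d log pi(a|s) / d theta[t, s]
            grads.append((t, s, g))
            ret += R[s, a]
            cost += C[s, a]
            s = rng.choice(S, p=P[s, a])
        adv = ret - lam * cost                         # REINFORCE on the Lagrangian
        for t, s_t, g in grads:
            theta[t, s_t] += alpha * adv * g
        lam = max(0.0, lam + beta * (cost - budget))   # dual ascent on constraint violation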
+
+
+
+
+ + ♻ ☆ A Context-Aware Cutting Plane Selection Algorithm for Mixed-Integer + Programming + + +
+ The current cut selection algorithm used in mixed-integer programming solvers +has remained largely unchanged since its creation. In this paper, we propose a +set of new cut scoring measures, cut filtering techniques, and stopping +criteria, extending the current state-of-the-art algorithm and obtaining a 5\% +performance improvement for SCIP over the MIPLIB 2017 benchmark set. + +
+
+ comment: Added random seeds 4-5 to Table and Figure results +
+
+
+
+
+ + ♻ ☆ A Unified Perspective on Natural Gradient Variational Inference with + Gaussian Mixture Models + + +
+ Variational inference with Gaussian mixture models (GMMs) enables learning of +highly tractable yet multi-modal approximations of intractable target +distributions with up to a few hundred dimensions. The two currently most +effective methods for GMM-based variational inference, VIPS and iBayes-GMM, +both employ independent natural gradient updates for the individual components +and their weights. We show for the first time, that their derived updates are +equivalent, although their practical implementations and theoretical guarantees +differ. We identify several design choices that distinguish both approaches, +namely with respect to sample selection, natural gradient estimation, stepsize +adaptation, and whether trust regions are enforced or the number of components +adapted. We argue that for both approaches, the quality of the learned +approximations can heavily suffer from the respective design choices: By +updating the individual components using samples from the mixture model, +iBayes-GMM often fails to produce meaningful updates to low-weight components, +and by using a zero-order method for estimating the natural gradient, VIPS +scales badly to higher-dimensional problems. Furthermore, we show that +information-geometric trust-regions (used by VIPS) are effective even when +using first-order natural gradient estimates, and often outperform the improved +Bayesian learning rule (iBLR) update used by iBayes-GMM. We systematically +evaluate the effects of design choices and show that a hybrid approach +significantly outperforms both prior works. Along with this work, we publish +our highly modular and efficient implementation for natural gradient +variational inference with Gaussian mixture models, which supports 432 +different combinations of design choices, facilitates the reproduction of all +our experiments, and may prove valuable for the practitioner. + +
+
+ comment: This version corresponds to the camera ready version published at + Transactions of Machine Learning Research (TMLR). + https://openreview.net/forum?id=tLBjsX4tjs +
+
+
+
+
+ + ♻ ☆ Unsupervised pre-training of graph transformers on patient population + graphs + + +
+ Pre-training has shown success in different areas of machine learning, such +as Computer Vision, Natural Language Processing (NLP), and medical imaging. +However, it has not been fully explored for clinical data analysis. An immense +amount of clinical records are recorded, but still, data and labels can be +scarce for data collected in small hospitals or dealing with rare diseases. In +such scenarios, pre-training on a larger set of unlabelled clinical data could +improve performance. In this paper, we propose novel unsupervised pre-training +techniques designed for heterogeneous, multi-modal clinical data for patient +outcome prediction inspired by masked language modeling (MLM), by leveraging +graph deep learning over population graphs. To this end, we further propose a +graph-transformer-based network, designed to handle heterogeneous clinical +data. By combining masking-based pre-training with a transformer-based network, +we translate the success of masking-based pre-training in other domains to +heterogeneous clinical data. We show the benefit of our pre-training method in +a self-supervised and a transfer learning setting, utilizing three medical +datasets TADPOLE, MIMIC-III, and a Sepsis Prediction Dataset. We find that our +proposed pre-training methods help in modeling the data at a patient and +population level and improve performance in different fine-tuning tasks on all +datasets. + +
+
+ comment: accepted for publication at the Medical Image Analysis Journal: + https://www.sciencedirect.com/science/article/abs/pii/S136184152300155X. 20 + pages, 3 figures, 20 tables +
+
+
+
+
+ + ♻ ☆ Approximating Pandora's Box with Correlations + + +
+ We revisit the classic Pandora's Box (PB) problem under correlated +distributions on the box values. Recent work of arXiv:1911.01632 obtained +constant approximate algorithms for a restricted class of policies for the +problem that visit boxes in a fixed order. In this work, we study the +complexity of approximating the optimal policy which may adaptively choose +which box to visit next based on the values seen so far. + Our main result establishes an approximation-preserving equivalence of PB to +the well studied Uniform Decision Tree (UDT) problem from stochastic +optimization and a variant of the Min-Sum Set Cover ($\text{MSSC}_f$) problem. +For distributions of support $m$, UDT admits a $\log m$ approximation, and +while a constant factor approximation in polynomial time is a long-standing +open problem, constant factor approximations are achievable in subexponential +time (arXiv:1906.11385). Our main result implies that the same properties hold +for PB and $\text{MSSC}_f$. + We also study the case where the distribution over values is given more +succinctly as a mixture of $m$ product distributions. This problem is again +related to a noisy variant of the Optimal Decision Tree which is significantly +more challenging. We give a constant-factor approximation that runs in time +$n^{ \tilde O( m^2/\varepsilon^2 ) }$ when the mixture components on every box +are either identical or separated in TV distance by $\varepsilon$. + +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ Information-Directed Selection for Top-Two Algorithms COLT + + +
+ We consider the best-k-arm identification problem for multi-armed bandits, +where the objective is to select the exact set of k arms with the highest mean +rewards by sequentially allocating measurement effort. We characterize the +necessary and sufficient conditions for the optimal allocation using dual +variables. Remarkably these optimality conditions lead to the extension of +top-two algorithm design principle (Russo, 2020), initially proposed for +best-arm identification. Furthermore, our optimality conditions induce a simple +and effective selection rule dubbed information-directed selection (IDS) that +selects one of the top-two candidates based on a measure of information gain. +As a theoretical guarantee, we prove that integrated with IDS, top-two Thompson +sampling is (asymptotically) optimal for Gaussian best-arm identification, +solving a glaring open problem in the pure exploration literature (Russo, +2020). As a by-product, we show that for k > 1, top-two algorithms cannot +achieve optimality even when the algorithm has access to the unknown "optimal" +tuning parameter. Numerical experiments show the superior performance of the +proposed top-two algorithms with IDS and considerable improvement compared with +algorithms without adaptive selection. + +
+
+ comment: Accepted for presentation at the Conference on Learning Theory (COLT) + 2023 +
+
+
+
+
+ + ♻ ☆ StitchNet: Composing Neural Networks from Pre-Trained Fragments + + +
+ We propose StitchNet, a novel neural network creation paradigm that stitches +together fragments (one or more consecutive network layers) from multiple +pre-trained neural networks. StitchNet allows the creation of high-performing +neural networks without the large compute and data requirements needed under +traditional model creation processes via backpropagation training. We leverage +Centered Kernel Alignment (CKA) as a compatibility measure to efficiently guide +the selection of these fragments in composing a network for a given task +tailored to specific accuracy needs and computing resource constraints. We then +show that these fragments can be stitched together to create neural networks +with comparable accuracy to traditionally trained networks at a fraction of +computing resource and data requirements. Finally, we explore a novel +on-the-fly personalized model creation and inference application enabled by +this new paradigm. + +
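    Since CKA is the compatibility measure named in the abstract, the snippet below gives the standard linear CKA between two activation matrices computed on the same inputs; how StitchNet turns this score into a fragment-selection procedure is described in the paper, and the example activations here are synthetic.

    # Linear CKA between two activation matrices (samples x features).
    import numpy as np

    def linear_cka(X, Y):
        """X: (n, d1), Y: (n, d2) activations of two fragments on the same n inputs."""
        X = X - X.mean(axis=0, keepdims=True)
        Y = Y - Y.mean(axis=0, keepdims=True)
        hsic = np.linalg.norm(Y.T @ X, "fro") ** 2
        return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))

    rng = np.random.default_rng(0)
    acts_a = rng.standard_normal((256, 128))                 # outputs of a fragment from model A
    acts_b = acts_a @ rng.standard_normal((128, 64))         # inputs expected by a fragment from model B
    print(linear_cka(acts_a, acts_b))                        # in [0, 1]; higher = more compatible fragments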
+
+
+
+
+ + ♻ ☆ Beyond Intuition, a Framework for Applying GPs to Real-World Data ICML + + +
+ Gaussian Processes (GPs) offer an attractive method for regression over +small, structured and correlated datasets. However, their deployment is +hindered by computational costs and limited guidelines on how to apply GPs +beyond simple low-dimensional datasets. We propose a framework to identify the +suitability of GPs to a given problem and how to set up a robust and +well-specified GP model. The guidelines formalise the decisions of experienced +GP practitioners, with an emphasis on kernel design and options for +computational scalability. The framework is then applied to a case study of +glacier elevation change yielding more accurate results at test time. + +
+
+ comment: Accepted at the ICML Workshop on Structured Probabilistic Inference + and Generative Modelling (2023) +
+
+
+
+
+ + ♻ ☆ PIGNet2: A Versatile Deep Learning-based Protein-Ligand Interaction + Prediction Model for Binding Affinity Scoring and Virtual Screening + + +
+ Prediction of protein-ligand interactions (PLI) plays a crucial role in drug +discovery as it guides the identification and optimization of molecules that +effectively bind to target proteins. Despite remarkable advances in deep +learning-based PLI prediction, the development of a versatile model capable of +accurately scoring binding affinity and conducting efficient virtual screening +remains a challenge. The main obstacle in achieving this lies in the scarcity +of experimental structure-affinity data, which limits the generalization +ability of existing models. Here, we propose a viable solution to address this +challenge by introducing a novel data augmentation strategy combined with a +physics-informed graph neural network. The model showed significant +improvements in both scoring and screening, outperforming task-specific deep +learning models in various tests including derivative benchmarks, and notably +achieving results comparable to the state-of-the-art performance based on +distance likelihood learning. This demonstrates the potential of this approach +to drug discovery. + +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Federated Learning of Gboard Language Models with Differential Privacy ACL + + +
+ We train language models (LMs) with federated learning (FL) and differential +privacy (DP) in the Google Keyboard (Gboard). We apply the +DP-Follow-the-Regularized-Leader (DP-FTRL)~\citep{kairouz21b} algorithm to +achieve meaningfully formal DP guarantees without requiring uniform sampling of +client devices. To provide favorable privacy-utility trade-offs, we introduce a +new client participation criterion and discuss the implication of its +configuration in large scale systems. We show how quantile-based clip +estimation~\citep{andrew2019differentially} can be combined with DP-FTRL to +adaptively choose the clip norm during training or reduce the hyperparameter +tuning in preparation for training. With the help of pretraining on public +data, we train and deploy more than twenty Gboard LMs that achieve high utility +and $\rho-$zCDP privacy guarantees with $\rho \in (0.2, 2)$, with two models +additionally trained with secure aggregation~\citep{bonawitz2017practical}. We +are happy to announce that all the next word prediction neural network LMs in +Gboard now have DP guarantees, and all future launches of Gboard neural network +LMs will require DP guarantees. We summarize our experience and provide +concrete suggestions on DP training for practitioners. + +
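    The quantile-based clip estimation cited above adjusts the clipping norm geometrically toward a target quantile of client update norms. The sketch below shows that update in a non-private form; the DP noise added to the clipped-fraction count, the server optimizer, and the integration with DP-FTRL are all omitted, and the step size and data are illustrative.

    # Non-private sketch of quantile-based clip estimation (geometric update).
    import numpy as np

    def update_clip_norm(C, update_norms, target_quantile=0.5, lr=0.2):
        frac_below = np.mean(np.asarray(update_norms) <= C)       # fraction of updates not clipped
        return C * np.exp(-lr * (frac_below - target_quantile))   # shrink if too many fit, grow otherwise

    C = 1.0
    for rnd in range(5):
        norms = np.random.default_rng(rnd).lognormal(mean=0.0, sigma=0.5, size=100)
        C = update_clip_norm(C, norms)
        print(f"round {rnd}: clip norm = {C:.3f}")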
+
+ comment: ACL industry track; v2 updating SecAgg details +
+
+
+
+
+ + ♻ ☆ Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for + Multi-View Reconstruction with Reflection ICCV 2023 + + +
+ Neural implicit surface learning has shown significant progress in multi-view +3D reconstruction, where an object is represented by multilayer perceptrons +that provide continuous implicit surface representation and view-dependent +radiance. However, current methods often fail to accurately reconstruct +reflective surfaces, leading to severe ambiguity. To overcome this issue, we +propose Ref-NeuS, which aims to reduce ambiguity by attenuating the effect of +reflective surfaces. Specifically, we utilize an anomaly detector to estimate +an explicit reflection score with the guidance of multi-view context to +localize reflective surfaces. Afterward, we design a reflection-aware +photometric loss that adaptively reduces ambiguity by modeling rendered color +as a Gaussian distribution, with the reflection score representing the +variance. We show that together with a reflection direction-dependent radiance, +our model achieves high-quality surface reconstruction on reflective surfaces +and outperforms the state of the art by a large margin. Moreover, our model +remains comparable on general surfaces.
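    The idea of treating the rendered color as a Gaussian whose variance comes from the reflection score can be sketched as a Gaussian-NLL-style photometric loss, shown below. The exact parameterization, constants, and how the score enters the variance are assumptions here, not the paper's implementation.

    # Hedged sketch: reflection-aware photometric loss that down-weights reflective pixels.
    import torch

    def reflection_aware_loss(pred_rgb, gt_rgb, reflection_score, eps=1e-4):
        """pred_rgb, gt_rgb: (N, 3); reflection_score: (N,), larger = more reflective."""
        var = reflection_score.unsqueeze(-1) + eps               # higher score -> larger variance
        nll = 0.5 * ((pred_rgb - gt_rgb) ** 2 / var + torch.log(var))
        return nll.mean()

    pred = torch.rand(1024, 3, requires_grad=True)
    gt = torch.rand(1024, 3)
    score = torch.rand(1024)
    print(reflection_aware_loss(pred, gt, score))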
+
+ comment: ICCV 2023, Project webpage: https://g3956.github.io/ +
+
+
+
+
+ + ♻ ☆ Non-Stationary Bandit Learning via Predictive Sampling + + +
+ Thompson sampling has proven effective across a wide range of stationary +bandit environments. However, as we demonstrate in this paper, it can perform +poorly when applied to non-stationary environments. We attribute such failures +to the fact that, when exploring, the algorithm does not differentiate actions +based on how quickly the information acquired loses its usefulness due to +non-stationarity. Building upon this insight, we propose predictive sampling, +an algorithm that deprioritizes acquiring information that quickly loses +usefulness. A theoretical guarantee on the performance of predictive sampling +is established through a Bayesian regret bound. We provide versions of +predictive sampling for which computations tractably scale to complex bandit +environments of practical interest. Through numerical simulations, we +demonstrate that predictive sampling outperforms Thompson sampling in all +non-stationary environments examined. + +
+
+
+
+
+ + ♻ ☆ Systematic Literature Review on Application of Machine Learning in + Continuous Integration + + +
+ This research conducted a systematic review of the literature on machine +learning (ML)-based methods in the context of Continuous Integration (CI) over +the past 22 years. The study aimed to identify and describe the techniques used +in ML-based solutions for CI and analyzed various aspects such as data +engineering, feature engineering, hyper-parameter tuning, ML models, evaluation +methods, and metrics. In this paper, we have depicted the phases of CI testing, +the connection between them, and the employed techniques in training the ML +method phases. We presented nine types of data sources and four taken steps in +the selected studies for preparing the data. Also, we identified four feature +types and nine subsets of data features through thematic analysis of the +selected studies. Besides, five methods for selecting and tuning the +hyper-parameters are shown. In addition, we summarised the evaluation methods +used in the literature and identified fifteen different metrics. The most +commonly used evaluation methods were found to be precision, recall, and +F1-score, and we have also identified five methods for evaluating the +performance of trained ML models. Finally, we have presented the relationship +between ML model types, performance measurements, and CI phases. The study +provides valuable insights for researchers and practitioners interested in +ML-based methods in CI and emphasizes the need for further research in this +area. + +
+
+ comment: This paper got a rejection and we need to address the comments and + upload the new version with new results +
+
+
+
+
+ + ♻ ☆ Semi-supervised cross-lingual speech emotion recognition + + +
+ Performance in Speech Emotion Recognition (SER) on a single language has +increased greatly in the last few years thanks to the use of deep learning +techniques. However, cross-lingual SER remains a challenge in real-world +applications due to two main factors: the first is the large gap between the source +and the target domain distributions; the second is that, for the new language, +unlabeled utterances are far more abundant than labeled ones. Taking these aspects into account, we propose a +Semi-Supervised Learning (SSL) method for cross-lingual emotion recognition +for the case when only a few labeled examples in the target domain (i.e., the new language) are +available. Our method is based on a Transformer and it adapts to the new domain +by exploiting a pseudo-labeling strategy on the unlabeled utterances. In +particular, the use of both hard and soft pseudo-labels is investigated. +We thoroughly evaluate the performance of the proposed method in a +speaker-independent setup on both the source and the new language and show its +robustness across five languages belonging to different linguistic families. The +experimental findings indicate that the unweighted accuracy is increased by an +average of 40% compared to state-of-the-art methods.
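    A generic hard/soft pseudo-labeling objective of the kind described above can be sketched as follows; the confidence threshold, temperature, number of emotion classes, and the way the losses are combined are illustrative assumptions rather than the paper's settings.

    # Sketch: hard vs. soft pseudo-labels on unlabeled target-language utterances.
    import torch
    import torch.nn.functional as F

    def pseudo_label_loss(logits_unlabeled, mode="hard", confidence=0.9, temperature=2.0):
        probs = F.softmax(logits_unlabeled.detach(), dim=1)
        if mode == "hard":
            conf, pseudo = probs.max(dim=1)
            mask = conf >= confidence                            # keep only confident utterances
            if mask.sum() == 0:
                return logits_unlabeled.sum() * 0.0
            return F.cross_entropy(logits_unlabeled[mask], pseudo[mask])
        # soft: match a temperature-smoothed teacher distribution
        soft_targets = F.softmax(logits_unlabeled.detach() / temperature, dim=1)
        return F.kl_div(F.log_softmax(logits_unlabeled, dim=1), soft_targets, reduction="batchmean")

    logits_l = torch.randn(8, 4, requires_grad=True)    # few labeled target-language utterances (4 emotions)
    labels_l = torch.randint(0, 4, (8,))
    logits_u = torch.randn(32, 4, requires_grad=True)   # unlabeled target-language batch
    loss = F.cross_entropy(logits_l, labels_l) + pseudo_label_loss(logits_u, mode="hard")
    print(loss)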
+
+
+
+
+ + ♻ ☆ CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive + Learning ICCV 2023 + + +
+ Multimodal contrastive pretraining has been used to train multimodal +representation models, such as CLIP, on large amounts of paired image-text +data. However, previous studies have revealed that such models are vulnerable +to backdoor attacks. Specifically, when trained on backdoored examples, CLIP +learns spurious correlations between the embedded backdoor trigger and the +target label, aligning their representations in the joint embedding space. +Injecting even a small number of poisoned examples, such as 75 examples in 3 +million pretraining data, can significantly manipulate the model's behavior, +making it difficult to detect or unlearn such correlations. To address this +issue, we propose CleanCLIP, a finetuning framework that weakens the learned +spurious associations introduced by backdoor attacks by independently +re-aligning the representations for individual modalities. We demonstrate that +unsupervised finetuning using a combination of multimodal contrastive and +unimodal self-supervised objectives for individual modalities can significantly +reduce the impact of the backdoor attack. Additionally, we show that supervised +finetuning on task-specific labeled image data removes the backdoor trigger +from the CLIP vision encoder. We show empirically that CleanCLIP maintains +model performance on benign examples while erasing a range of backdoor attacks +on multimodal contrastive learning. The code and checkpoints are available at +https://github.com/nishadsinghi/CleanCLIP. + +
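    The combination of objectives described above can be sketched as a CLIP-style image-text contrastive loss plus unimodal contrastive losses on independently augmented views of each modality. The encoders, augmentations, and equal loss weights below are assumptions for illustration, not the exact CleanCLIP recipe.

    # Sketch of a CleanCLIP-style finetuning objective.
    import torch
    import torch.nn.functional as F

    def info_nce(a, b, temperature=0.07):
        """Symmetric InfoNCE between two aligned batches of embeddings (N, D)."""
        a, b = F.normalize(a, dim=1), F.normalize(b, dim=1)
        logits = a @ b.t() / temperature
        targets = torch.arange(a.size(0), device=a.device)
        return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

    # embeddings from the image and text encoders (stand-ins here)
    img, txt = torch.randn(64, 512), torch.randn(64, 512)            # original views
    img_aug, txt_aug = torch.randn(64, 512), torch.randn(64, 512)    # augmented views

    loss = (info_nce(img, txt)             # multimodal: keep image-text alignment
            + info_nce(img, img_aug)       # unimodal: re-align image representations independently
            + info_nce(txt, txt_aug))      # unimodal: re-align text representations independently
    print(loss)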
+
+ comment: 22 pages. Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ ECS -- an Interactive Tool for Data Quality Assurance + + +
+ With the increasing capabilities of machine learning systems and their +potential use in safety-critical systems, ensuring high-quality data is +becoming increasingly important. In this paper we present a novel approach for +the assurance of data quality. For this purpose, the mathematical basics are +first discussed and the approach is presented using multiple examples. This +results in the detection of data points with potentially harmful properties for +the use in safety-critical systems. + +
+
+
+
+
+ + ♻ ☆ DoubleAdapt: A Meta-learning Approach to Incremental Learning for Stock + Trend Forecasting KDD 2023 + + +
+ Stock trend forecasting is a fundamental task of quantitative investment +where precise predictions of price trends are indispensable. As an online +service, stock data continuously arrive over time. It is practical and +efficient to incrementally update the forecast model with the latest data which +may reveal some new patterns recurring in the future stock market. However, +incremental learning for stock trend forecasting still remains under-explored +due to the challenge of distribution shifts (a.k.a. concept drifts). With the +stock market dynamically evolving, the distribution of future data can slightly +or significantly differ from incremental data, hindering the effectiveness of +incremental updates. To address this challenge, we propose DoubleAdapt, an +end-to-end framework with two adapters, which can effectively adapt the data +and the model to mitigate the effects of distribution shifts. Our key insight +is to automatically learn how to adapt stock data into a locally stationary +distribution in favor of profitable updates. Complemented by data adaptation, +we can confidently adapt the model parameters under mitigated distribution +shifts. We cast each incremental learning task as a meta-learning task and +automatically optimize the adapters for desirable data adaptation and parameter +initialization. Experiments on real-world stock datasets demonstrate that +DoubleAdapt achieves state-of-the-art predictive performance and shows +considerable efficiency. + +
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Learning: the Case When Unlabeled Data is Equally Useful UAI 2020 + + +
+ Semi-supervised learning algorithms attempt to take advantage of relatively +inexpensive unlabeled data to improve learning performance. In this work, we +consider statistical models where the data distributions can be characterized +by continuous parameters. We show that under certain conditions on the +distribution, unlabeled data is as useful as labeled data in terms of +learning rate. Specifically, let $n, m$ be the number of labeled and unlabeled +data, respectively. It is shown that the learning rate of semi-supervised +learning scales as $O(1/n)$ if $m\sim n$, and scales as $O(1/n^{1+\gamma})$ if +$m\sim n^{1+\gamma}$ for some $\gamma>0$, whereas the learning rate of +supervised learning scales as $O(1/n)$.
+
+ comment: Published in UAI 2020. This version: an error in Lemma 2 is corrected +
+
+
+
+
+ + ♻ ☆ Active Learning for Single Neuron Models with Lipschitz Non-Linearities + + +
+ We consider the problem of active learning for single neuron models, also +sometimes called ``ridge functions'', in the agnostic setting (under +adversarial label noise). Such models have been shown to be broadly effective +in modeling physical phenomena, and for constructing surrogate data-driven +models for partial differential equations. + Surprisingly, we show that for a single neuron model with any Lipschitz +non-linearity (such as the ReLU, sigmoid, absolute value, low-degree +polynomial, among others), strong provable approximation guarantees can be +obtained using a well-known active learning strategy for fitting \emph{linear +functions} in the agnostic setting, i.e., for the case when there is no +non-linearity. Namely, we can collect samples via statistical \emph{leverage +score sampling}, which has been shown to be near-optimal in other active +learning scenarios. We support our theoretical results with empirical +simulations showing that our proposed active learning strategy based on +leverage score sampling outperforms (ordinary) uniform sampling when fitting +single neuron models.
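    Leverage score sampling itself is standard and easy to sketch: compute row leverage scores of the design matrix and request labels for rows drawn in proportion to them. The reweighting used when fitting the single-neuron model, and the paper's precise estimator, may differ; the data and budget below are illustrative.

    # Standard leverage-score sampling sketch for choosing which labels to query.
    import numpy as np

    def leverage_scores(X):
        """Row leverage scores of X (n, d): squared row norms of an orthonormal basis of col(X)."""
        Q, _ = np.linalg.qr(X)
        return np.sum(Q ** 2, axis=1)

    rng = np.random.default_rng(0)
    X = rng.standard_normal((1000, 10))
    scores = leverage_scores(X)
    probs = scores / scores.sum()
    budget = 50
    queried = rng.choice(X.shape[0], size=budget, replace=False, p=probs)
    print(queried[:10])              # indices whose labels we would request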
+
+ comment: Inadvertently submitting an incorrect writeup that does not align + with the intended content +
+
+
+
+
+ + ♻ ☆ Feature representations useful for predicting image memorability + + +
+ Prediction of image memorability has attracted interest in various fields. +Consequently, the prediction accuracy of convolutional neural network (CNN) +models has been approaching the empirical upper bound estimated based on human +consistency. However, identifying which feature representations embedded in CNN +models are responsible for the high memorability prediction accuracy remains an +open question. To tackle this problem, we sought to identify +memorability-related feature representations in CNN models using brain +similarity. Specifically, memorability prediction accuracy and brain similarity +were examined across 16,860 layers in 64 CNN models pretrained for object +recognition. A clear tendency was observed in this comprehensive analysis that +layers with high memorability prediction accuracy had higher brain similarity +with the inferior temporal (IT) cortex, which is the highest stage in the +ventral visual pathway. Furthermore, fine-tuning of the 64 CNN models for +memorability prediction revealed that brain similarity with the IT cortex at +the penultimate layer positively correlated with the memorability prediction +accuracy of the models. This analysis also showed that the best fine-tuned +model provided accuracy comparable to state-of-the-art CNN models developed for +memorability prediction. Overall, the results of this study indicated that the +CNN models' great success in predicting memorability relies on feature +representation acquisition, similar to the IT cortex. This study advances our +understanding of feature representations and their use in predicting image +memorability. + +
+
+
+
+
+ + ♻ ☆ Learning Quantum Processes and Hamiltonians via the Pauli Transfer + Matrix + + +
+ Learning about physical systems from quantum-enhanced experiments, relying on +a quantum memory and quantum processing, can outperform learning from +experiments in which only classical memory and processing are available. +Whereas quantum advantages have been established for a variety of state +learning tasks, quantum process learning allows for comparable advantages only +with a careful problem formulation and is less understood. We establish an +exponential quantum advantage for learning an unknown $n$-qubit quantum process +$\mathcal{N}$. We show that a quantum memory allows to efficiently solve the +following tasks: (a) learning the Pauli transfer matrix of an arbitrary +$\mathcal{N}$, (b) predicting expectation values of bounded Pauli-sparse +observables measured on the output of an arbitrary $\mathcal{N}$ upon input of +a Pauli-sparse state, and (c) predicting expectation values of arbitrary +bounded observables measured on the output of an unknown $\mathcal{N}$ with +sparse Pauli transfer matrix upon input of an arbitrary state. With quantum +memory, these tasks can be solved using linearly-in-$n$ many copies of the Choi +state of $\mathcal{N}$, and even time-efficiently in the case of (b). In +contrast, any learner without quantum memory requires exponentially-in-$n$ many +queries, even when querying $\mathcal{N}$ on subsystems of adaptively chosen +states and performing adaptively chosen measurements. In proving this +separation, we extend existing shadow tomography upper and lower bounds from +states to channels via the Choi-Jamiolkowski isomorphism. Moreover, we combine +Pauli transfer matrix learning with polynomial interpolation techniques to +develop a procedure for learning arbitrary Hamiltonians, which may have +non-local all-to-all interactions, from short-time dynamics. Our results +highlight the power of quantum-enhanced experiments for learning highly complex +quantum dynamics. + +
+
+ comment: 30+31 pages, 2+1 figures; V2 includes small corrections to Remark 4.3 + and Lemma 6.1 as well improvements to the presentation +
+
+
+
+
+ + ♻ ☆ Towards Fair Disentangled Online Learning for Changing Environments KDD 2023 + + +
+ In the problem of online learning for changing environments, data are +sequentially received one after another over time, and their distribution +assumptions may vary frequently. Although existing methods demonstrate the +effectiveness of their learning algorithms by providing a tight bound on either +dynamic regret or adaptive regret, most of them completely ignore learning with +model fairness, defined as the statistical parity across different +sub-population (e.g., race and gender). Another drawback is that when adapting +to a new environment, an online learner needs to update model parameters with a +global change, which is costly and inefficient. Inspired by the sparse +mechanism shift hypothesis, we claim that changing environments in online +learning can be attributed to partial changes in learned parameters that are +specific to environments and the rest remain invariant to changing +environments. To this end, in this paper, we propose a novel algorithm under +the assumption that data collected at each time can be disentangled with two +representations, an environment-invariant semantic factor and an +environment-specific variation factor. The semantic factor is further used for +fair prediction under a group fairness constraint. To evaluate the sequence of +model parameters generated by the learner, a novel regret is proposed in which +it takes a mixed form of dynamic and static regret metrics followed by a +fairness-aware long-term constraint. The detailed analysis provides theoretical +guarantees for loss regret and violation of cumulative fairness constraints. +Empirical evaluations on real-world datasets demonstrate our proposed method +sequentially outperforms baseline methods in model accuracy and fairness. + +
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ♻ ☆ Free-Form Variational Inference for Gaussian Process State-Space Models + + +
+ Gaussian process state-space models (GPSSMs) provide a principled and +flexible approach to modeling the dynamics of a latent state, which is observed +at discrete-time points via a likelihood model. However, inference in GPSSMs is +computationally and statistically challenging due to the large number of latent +variables in the model and the strong temporal dependencies between them. In +this paper, we propose a new method for inference in Bayesian GPSSMs, which +overcomes the drawbacks of previous approaches, namely over-simplified +assumptions, and high computational requirements. Our method is based on +free-form variational inference via stochastic gradient Hamiltonian Monte Carlo +within the inducing-variable formalism. Furthermore, by exploiting our proposed +variational distribution, we provide a collapsed extension of our method where +the inducing variables are marginalized analytically. We also showcase results +when combining our framework with particle MCMC methods. We show that, on six +real-world datasets, our approach can learn transition dynamics and latent +states more accurately than competing methods. + +
+
+ comment: Updating to final version to appear in the proceedings +
+
+
+
+
+ + ♻ ☆ Hybrid hidden Markov LSTM for short-term traffic flow prediction + + +
+ Deep learning (DL) methods have outperformed parametric models such as +historical average, ARIMA and variants in predicting traffic variables into +short and near-short future, that are critical for traffic management. +Specifically, recurrent neural network (RNN) and its variants (e.g. long +short-term memory) are designed to retain long-term temporal correlations and +therefore are suitable for modeling sequences. However, multi-regime models +assume the traffic system to evolve through multiple states (say, free-flow, +congestion in traffic) with distinct characteristics, and hence, separate +models are trained to characterize the traffic dynamics within each regime. For +instance, Markov-switching models with a hidden Markov model (HMM) for regime +identification is capable of capturing complex dynamic patterns and +non-stationarity. Interestingly, both HMM and LSTM can be used for modeling an +observation sequence from a set of latent or, hidden state variables. In LSTM, +the latent variable is computed in a deterministic manner from the current +observation and the previous latent variable, while, in HMM, the set of latent +variables is a Markov chain. Inspired by research in natural language +processing, a hybrid hidden Markov-LSTM model that is capable of learning +complementary features in traffic data is proposed for traffic flow prediction. +Results indicate significant performance gains in using hybrid architecture +compared to conventional methods such as Markov switching ARIMA and LSTM. + +
+
+
+
+
+ + ♻ ☆ Newell's theory based feature transformations for spatio-temporal + traffic prediction + + +
+ Deep learning (DL) models for spatio-temporal traffic flow forecasting employ +convolutional or graph-convolutional filters along with recurrent neural +networks to capture spatial and temporal dependencies in traffic data. These +models, such as CNN-LSTM, utilize traffic flows from neighboring detector +stations to predict flows at a specific location of interest. However, these +models are limited in their ability to capture the broader dynamics of the +traffic system, as they primarily learn features specific to the detector +configuration and traffic characteristics at the target location. Hence, the +transferability of these models to different locations becomes challenging, +particularly when data is unavailable at the new location for model training. +To address this limitation, we propose a traffic flow physics-based feature +transformation for spatio-temporal DL models. This transformation incorporates +Newell's uncongested and congested-state estimators of traffic flows at the +target locations, enabling the models to learn broader dynamics of the system. +Our methodology is empirically validated using traffic data from two different +locations. The results demonstrate that the proposed feature transformation +improves the models' performance in predicting traffic flows over different +prediction horizons, as indicated by better goodness-of-fit statistics. An +important advantage of our framework is its ability to be transferred to new +locations where data is unavailable. This is achieved by appropriately +accounting for spatial dependencies based on station distances and various +traffic parameters. In contrast, regular DL models are not easily transferable +as their inputs remain fixed. It should be noted that due to data limitations, +we were unable to perform spatial sensitivity analysis, which calls for further +research using simulated data. + +
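    One plausible reading of "Newell's uncongested and congested-state estimators" is to shift the upstream flow series by the free-flow travel time and the downstream series by the backward shockwave travel time, and feed both as physics-informed features to the DL model. The snippet below sketches that reading; the parameter values, padding, and the paper's exact construction are assumptions.

    # Hedged sketch of Newell-style feature transformations for a target detector.
    import numpy as np

    def newell_features(q_up, q_down, dist_up, dist_down, dt, v_free=25.0, w_back=5.0):
        """q_up, q_down: flow series at neighboring detectors; dist_*: distances to the
        target location (m); dt: sampling interval (s); speeds in m/s (assumed values)."""
        lag_free = int(round(dist_up / v_free / dt))     # free-flow (uncongested) travel time in steps
        lag_wave = int(round(dist_down / w_back / dt))   # backward-wave (congested) travel time in steps
        uncongested = np.roll(q_up, lag_free)            # upstream flow arriving under free flow
        congested = np.roll(q_down, lag_wave)            # downstream flow propagating back under congestion
        uncongested[:lag_free] = q_up[0]                 # pad the warm-up period
        congested[:lag_wave] = q_down[0]
        return np.stack([uncongested, congested], axis=1)

    q_up = np.sin(np.linspace(0, 6, 300)) * 20 + 40
    q_down = np.sin(np.linspace(0.2, 6.2, 300)) * 20 + 38
    feats = newell_features(q_up, q_down, dist_up=800.0, dist_down=600.0, dt=30.0)
    print(feats.shape)    # (300, 2) physics-informed features for the target location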
+
+
+
+
+ + ♻ ☆ Provable Multi-Task Representation Learning by Two-Layer ReLU Neural + Networks + + +
+ Feature learning, i.e. extracting meaningful representations of data, is +quintessential to the practical success of neural networks trained with +gradient descent, yet it is notoriously difficult to explain how and why it +occurs. Recent theoretical studies have shown that shallow neural networks +optimized on a single task with gradient-based methods can learn meaningful +features, extending our understanding beyond the neural tangent kernel or +random feature regime in which negligible feature learning occurs. But in +practice, neural networks are increasingly often trained on {\em many} tasks +simultaneously with differing loss functions, and these prior analyses do not +generalize to such settings. In the multi-task learning setting, a variety of +studies have shown effective feature learning by simple linear models. However, +multi-task learning via {\em nonlinear} models, arguably the most common +learning paradigm in practice, remains largely mysterious. In this work, we +present the first results proving feature learning occurs in a multi-task +setting with a nonlinear model. We show that when the tasks are binary +classification problems with labels depending on only $r$ directions within the +ambient $d\gg r$-dimensional input space, executing a simple gradient-based +multitask learning algorithm on a two-layer ReLU neural network learns the +ground-truth $r$ directions. In particular, any downstream task on the $r$ +ground-truth coordinates can be solved by learning a linear classifier with +sample and neuron complexity independent of the ambient dimension $d$, while a +random feature model requires exponential complexity in $d$ for such a +guarantee. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Embedding Quality Evaluation ICML + + +
+ Unsupervised learning has recently significantly gained in popularity, +especially with deep learning-based approaches. Despite numerous successes and +approaching supervised-level performance on a variety of academic benchmarks, +it is still hard to train and evaluate SSL models in practice due to the +unsupervised nature of the problem. Even with networks trained in a supervised +fashion, it is often unclear whether they will perform well when transferred to +another domain. + Past works are generally limited to assessing the amount of information +contained in embeddings, which is most relevant for self-supervised learning of +deep neural networks. This work chooses to follow a different approach: can we +quantify how easy it is to linearly separate the data in a stable way? We +survey the literature and uncover three methods that could potentially be used +for evaluating the quality of representations. We also introduce one novel method +based on recent advances in understanding the high-dimensional geometric +structure of self-supervised learning. + We conduct extensive experiments and study the properties of these metrics +and ones introduced in the previous work. Our results suggest that while there +is no free lunch, there are metrics that can robustly estimate embedding +quality in an unsupervised way.
+
+ comment: As appeared at the 2nd Annual Workshop on Topology, Algebra, and + Geometry in Machine Learning (TAG-ML) at the 40th International Conference on + Machine Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Large-Scale Visual Representation Learning And Evaluation + + +
+ In this article, we present our approach to single-modality visual +representation learning. Understanding visual representations of items is vital +for fashion recommendations in e-commerce. We detail and contrast techniques +used to finetune large-scale visual representation learning models in an +efficient manner under low-resource settings, including several pretrained +backbone architectures, both in the convolutional neural network as well as the +vision transformer family. We describe the challenges for e-commerce +applications at-scale and highlight the efforts to more efficiently train, +evaluate, and serve visual representations. We present ablation studies +evaluating the representation offline performance for several downstream tasks, +including visually similar ad recommendations on mobile devices. To this end, +we present a novel multilingual text-to-image generative offline evaluation +method for visually similar recommendation systems. Finally, we include online +results from deployed machine learning systems in production at Etsy. + +
+
+
+
+
+ + ♻ ☆ Formulating A Strategic Plan Based On Statistical Analyses And + Applications For Financial Companies Through A Real-World Use Case + + +
+ Business statistics play a crucial role in implementing a data-driven +strategic plan at the enterprise level to employ various analytics where the +outcomes of such a plan enable an enterprise to enhance the decision-making +process or to mitigate risks to the organization. In this work, a strategic +plan informed by the statistical analysis is introduced for a financial company +called LendingClub, where the plan is comprised of exploring the possibility of +onboarding a big data platform along with advanced feature selection +capacities. The main objectives of such a plan are to increase the company's +revenue while reducing the risks of granting loans to borrowers who cannot +return their loans. In this study, different hypotheses formulated to address +the company's concerns are studied, where the results reveal that the amount of +loans profoundly impacts the number of borrowers charging off their loans. +Also, the proposed strategic plan includes onboarding advanced analytics such +as machine learning technologies that allow the company to build better +generalized data-driven predictive models. + +
+
+
+
+
+ + ♻ ☆ Towards Dynamic Causal Discovery with Rare Events: A Nonparametric + Conditional Independence Test + + +
+ Causal phenomena associated with rare events occur across a wide range of +engineering problems, such as risk-sensitive safety analysis, accident analysis +and prevention, and extreme value theory. However, current methods for causal +discovery are often unable to uncover causal links, between random variables in +a dynamic setting, that manifest only when the variables first experience +low-probability realizations. To address this issue, we introduce a novel +statistical independence test on data collected from time-invariant dynamical +systems in which rare but consequential events occur. In particular, we exploit +the time-invariance of the underlying data to construct a superimposed dataset +of the system state before rare events happen at different timesteps. We then +design a conditional independence test on the reorganized data. We provide +non-asymptotic sample complexity bounds for the consistency of our method, and +validate its performance across various simulated and real-world datasets, +including incident data collected from the Caltrans Performance Measurement +System (PeMS). Code containing the datasets and experiments is publicly +available. + +
+
+
+
+
+ + ♻ ☆ Edit at your own risk: evaluating the robustness of edited models to + distribution shifts + + +
+ The current trend toward ever-larger models makes standard retraining +procedures an ever-more expensive burden. For this reason, there is growing +interest in model editing, which enables computationally inexpensive, +interpretable, post-hoc model modifications. While many model editing +techniques are promising, research on the properties of edited models is +largely limited to evaluation of validation accuracy. The robustness of edited +models is an important and yet mostly unexplored topic. In this paper, we +employ recently developed techniques from the field of deep learning robustness +to investigate both how model editing affects the general robustness of a +model, as well as the robustness of the specific behavior targeted by the edit. +We find that edits tend to reduce general robustness, but that the degree of +degradation depends on the editing algorithm and layers chosen. Motivated by +these observations we introduce a new model editing algorithm, 1-layer +interpolation (1-LI), which uses weight-space interpolation to navigate the +trade-off between editing task accuracy and general robustness. + +
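    Weight-space interpolation at a single layer, as described above, is simple to sketch: blend the edited model's parameters back toward the original model at the chosen layer, leaving the rest of the edited model untouched. Layer naming, the interpolation coefficient, and the toy network below are illustrative; the paper's layer-selection and evaluation procedure are described in the article.

    # Minimal sketch of 1-layer interpolation (1-LI) between an original and an edited model.
    import copy
    import torch

    def one_layer_interpolation(model_orig, model_edited, layer_prefix, alpha):
        """Parameters under `layer_prefix` become (1 - alpha) * original + alpha * edited;
        all other parameters are taken from the edited model."""
        blended = copy.deepcopy(model_edited)
        orig_params = dict(model_orig.named_parameters())
        with torch.no_grad():
            for name, p in blended.named_parameters():
                if name.startswith(layer_prefix):
                    p.copy_((1 - alpha) * orig_params[name] + alpha * p)
        return blended

    # toy usage with an assumed two-layer network
    net_orig = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
    net_edit = copy.deepcopy(net_orig)
    net_edit[2].weight.data += 0.1                  # stand-in for an "edit" of the last layer
    blended = one_layer_interpolation(net_orig, net_edit, layer_prefix="2.", alpha=0.5)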
+
+ comment: DB and CG contributed equally +
+
+
+
+
+ + ♻ Multi-Objective GFlowNets ICML 2023 + + +
+ We study the problem of generating diverse candidates in the context of +Multi-Objective Optimization. In many applications of machine learning such as +drug discovery and material design, the goal is to generate candidates which +simultaneously optimize a set of potentially conflicting objectives. Moreover, +these objectives are often imperfect evaluations of some underlying property of +interest, making it important to generate diverse candidates to have multiple +options for expensive downstream evaluations. We propose Multi-Objective +GFlowNets (MOGFNs), a novel method for generating diverse Pareto optimal +solutions, based on GFlowNets. We introduce two variants of MOGFNs: MOGFN-PC, +which models a family of independent sub-problems defined by a scalarization +function, with reward-conditional GFlowNets, and MOGFN-AL, which solves a +sequence of sub-problems defined by an acquisition function in an active +learning loop. Our experiments on wide variety of synthetic and benchmark tasks +demonstrate advantages of the proposed methods in terms of the Pareto +performance and importantly, improved candidate diversity, which is the main +contribution of this work. + +
+
+ comment: 23 pages, 8 figures. ICML 2023. Code at: + https://github.com/GFNOrg/multi-objective-gfn +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection SC 2023 + + +
+ Accurate and robust object detection is critical for autonomous driving. +Image-based detectors face difficulties caused by low visibility in adverse +weather conditions. Thus, radar-camera fusion is of particular interest but +presents challenges in optimally fusing heterogeneous data sources. To address +this issue, we propose two new radar preprocessing techniques to better align +radar and camera data. In addition, we introduce a Multi-Task Cross-Modality +Attention-Fusion Network (MCAF-Net) for object detection, which includes two +new fusion blocks. These allow for exploiting information from the feature maps +more comprehensively. The proposed algorithm jointly detects objects and +segments free space, which guides the model to focus on the more relevant part +of the scene, namely, the occupied space. Our approach outperforms current +state-of-the-art radar-camera fusion-based object detectors on the nuScenes +dataset and achieves more robust results in adverse weather conditions and +nighttime scenarios.
+
+ comment: Accepted by ITSC 2023 +
+
+
+
+
+ + ☆ Power-Efficient Video Streaming on Mobile Devices Using Optimal Spatial + Scaling + + +
+ This paper derives optimal spatial scaling and rate control parameters for +power-efficient wireless video streaming on portable devices. A video streaming +application is studied, which receives a high-resolution and high-quality video +stream from a remote server and displays the content to the end-user. We show +that the resolution of the input video can be adjusted such that the +quality-power trade-off is optimized. Making use of a power model from the +literature and subjective quality evaluation using a perceptual metric, we +derive optimal combinations of the scaling factor and the rate-control +parameter for encoding. For HD sequences, up to 10% of power can be saved at +negligible quality losses and up to 15% of power can be saved at tolerable +distortions. To show general validity, the method was tested for Wi-Fi and a +mobile network as well as for two different smartphones.
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ MAViL: Masked Audio-Video Learners + + +
+ We present Masked Audio-Video Learners (MAViL) to train audio-visual +representations. Our approach learns with three complementary forms of +self-supervision: (1) reconstruction of masked audio and video input data, (2) +intra- and inter-modal contrastive learning with masking, and (3) self-training +by reconstructing joint audio-video contextualized features learned from the +first two objectives. Pre-training with MAViL not only enables the model to +perform well in audio-visual classification and retrieval tasks but also +improves representations of each modality in isolation, without using +information from the other modality for fine-tuning or inference. Empirically, +MAViL sets a new state-of-the-art on AudioSet (53.1 mAP) and VGGSound (67.1% +accuracy). For the first time, a self-supervised audio-visual model outperforms +ones that use external supervision on these benchmarks. + +
+
+ comment: Technical report +
+
+
+
+
+
+
+





diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all paper details with the TAB key */
var expanded = false;
document.onkeydown = function (e) {
    if (e.key === "Tab" || e.keyCode === 9) {
        expanded = !expanded;
        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
        return false;
    }
};

/* Switch between light and dark themes */
const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');

function switchTheme(e) {
    if (e.target.checked) {
        document.documentElement.setAttribute('data-theme', 'light');
        document.getElementById("theme-icon").className = "ri-sun-line";
        localStorage.setItem('theme', 'light'); // persist the preference across visits
    } else {
        document.documentElement.setAttribute('data-theme', 'dark');
        document.getElementById("theme-icon").className = "ri-moon-line";
        localStorage.setItem('theme', 'dark'); // persist the preference across visits
    }
}

toggleSwitch.addEventListener('change', switchTheme, false);
const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
if (currentTheme) {
    document.documentElement.setAttribute('data-theme', currentTheme);
    if (currentTheme === 'light') {
        toggleSwitch.checked = true;
    }
}

const timestamp = document.getElementById("build-timestamp");
const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();

const badge = document.getElementById("build-timestamp-badge");
// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge